# Persistent volume claim named `data`; the JobSet in this file mounts it
# into its worker pods via `claimName: data`.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: data
spec:
  # Storage class that provisions the volume.
  storageClassName: wm2-nfs
  # ReadWriteMany: the volume may be mounted read-write by many nodes at once.
  accessModes:
    - ReadWriteMany
  # Requested capacity.
  resources:
    requests:
      storage: 64Gi

---

# JobSet that starts one Job of 4 worker pods, each pinned to a different
# node of the selected partition, with the shared `data` PVC mounted at /data.
apiVersion: jobset.x-k8s.io/v1alpha2
kind: JobSet
metadata:
  # Replace with your job name. The anti-affinity rule below selects pods by
  # this name, so change both together.
  name: hpcgame-test-fire
spec:
  network:
    enableDNSHostnames: true
    subdomain: hpcgame-test-fire
  replicatedJobs:
    - name: workers
      replicas: 1 # A single Job; its 4 pods are meant to land on 4 nodes
      template: # Describes one Job replica
        spec:
          backoffLimit: 0
          completions: 4 # 4 pods in total
          parallelism: 4 # all 4 pods run at the same time
          template: # Describes a pod
            metadata:
              annotations:
                # Currently `enabled`, i.e. SSH is in use.
                ssh-operator.lcpu.dev/inject: enabled
                # Turn to `enabled` if you want to use LxcFs.
                lxcfs.lcpu.dev/inject: disabled
                # Attach the `default/wm2-roce` network; enables MPI.
                k8s.v1.cni.cncf.io/networks: default/wm2-roce
            spec:
              nodeSelector:
                # Available pre-defined partitions:
                # x86, x86_amd, arm, gpu, npu, npu_inf
                hpc.lcpu.dev/partition: x86

              # Forces the pods of this JobSet onto different nodes.
              # Comment out this block to allow several pods per node.
              # ------------ BEGIN ------------
              affinity:
                podAntiAffinity:
                  requiredDuringSchedulingIgnoredDuringExecution:
                    - topologyKey: "kubernetes.io/hostname"
                      labelSelector:
                        matchExpressions:
                          # The JobSet controller labels every pod it creates
                          # with `jobset.sigs.k8s.io/jobset-name`. The pod
                          # template defines no labels of its own, so this is
                          # the key that actually matches sibling pods.
                          - key: jobset.sigs.k8s.io/jobset-name
                            operator: In
                            values:
                              - hpcgame-test-fire # must equal metadata.name
              # ------------ END ------------

              containers: # Describes a container inside a pod
                - name: worker
                  securityContext:
                    capabilities:
                      add: ["IPC_LOCK"] # For MPI
                  # `fire` requires Intel environment
                  image: crmirror.lcpu.dev/hpcgame/intel:latest
                  # Keeps the container idle; replace with your real command.
                  command:
                    - sleep
                    - inf
                  # Requests are set equal to limits for every resource.
                  resources:
                    limits:
                      cpu: 16
                      memory: 48Gi
                      rdma.hpc.lcpu.dev/hca_cx5: 1
                    requests:
                      cpu: 16
                      memory: 48Gi
                      rdma.hpc.lcpu.dev/hca_cx5: 1
                  volumeMounts:
                    # `name` refers to an entry of the `volumes` list below.
                    - name: shared-data
                      mountPath: /data
              volumes:
                - name: shared-data
                  persistentVolumeClaim: # The PVC this volume comes from
                    claimName: data