Skip to content

Commit

Permalink
Add multiple GPU support #760 (#924)
Browse files Browse the repository at this point in the history
  • Loading branch information
splhack authored Jun 20, 2021
1 parent b10cd0d commit c22fe12
Show file tree
Hide file tree
Showing 134 changed files with 4,872 additions and 711 deletions.
2 changes: 1 addition & 1 deletion VERSION.in
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.12
0.13
4 changes: 3 additions & 1 deletion cuebot/src/main/java/com/imageworks/spcue/DispatchFrame.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@ public class DispatchFrame extends FrameEntity implements FrameInterface {
public int maxCores;
public boolean threadable;
public long minMemory;
public long minGpu;
public int minGpus;
public int maxGpus;
public long minGpuMemory;

public String services;
}
Expand Down
39 changes: 26 additions & 13 deletions cuebot/src/main/java/com/imageworks/spcue/DispatchHost.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,16 @@ public class DispatchHost extends Entity
public int cores;
public int idleCores;

public int gpus;
public int idleGpus;

// Thread mode flag: 0 = auto, 1 = all.
public int threadMode;

public long memory;
public long idleMemory;
public long gpu;
public long idleGpu;
public long gpuMemory;
public long idleGpuMemory;
public String tags;
public String os;

Expand All @@ -53,11 +56,13 @@ public class DispatchHost extends Entity
* booked to this host.
*/
public int strandedCores = 0;
public int strandedGpus = 0;

// Saved copies of the idle values, kept while resources are reserved for a future GPU job so they can be restored afterwards.
long idleMemoryOrig = 0;
int idleCoresOrig = 0;
long idleGpuOrig = 0;
long idleGpuMemoryOrig = 0;
int idleGpusOrig = 0;

public String getHostId() {
return id;
Expand All @@ -72,41 +77,47 @@ public String getFacilityId() {
}

@Override
public boolean hasAdditionalResources(int minCores, long minMemory, long minGpu) {
public boolean hasAdditionalResources(int minCores, long minMemory, int minGpus, long minGpuMemory) {

if (idleCores < minCores) {
return false;
}
else if (idleMemory < minMemory) {
return false;
}
else if (idleGpu < minGpu) {
else if (idleGpus < minGpus) {
return false;
}
else if (idleGpuMemory < minGpuMemory) {
return false;
}

return true;
}

@Override
public void useResources(int coreUnits, long memory, long gpu) {
public void useResources(int coreUnits, long memory, int gpuUnits, long gpuMemory) {
idleCores = idleCores - coreUnits;
idleMemory = idleMemory - memory;
idleGpu = idleGpu - gpu;
idleGpus = idleGpus - gpuUnits;
idleGpuMemory = idleGpuMemory - gpuMemory;
}

/**
* If the host has idle GPU resources, set aside enough idle memory, cores and GPU capacity to book a GPU frame later.
*
*/
public void removeGpu() {
if (idleGpu > 0 && idleGpuOrig == 0) {
if (idleGpuMemory > 0 && idleGpuMemoryOrig == 0) {
idleMemoryOrig = idleMemory;
idleCoresOrig = idleCores;
idleGpuOrig = idleGpu;
idleGpuMemoryOrig = idleGpuMemory;
idleGpusOrig = idleGpus;

idleMemory = idleMemory - Math.min(CueUtil.GB4, idleMemory);
idleCores = idleCores - Math.min(100, idleCores);
idleGpu = 0;
idleGpuMemory = idleGpuMemory - Math.min(CueUtil.GB4, idleGpuMemory);
idleGpus = idleGpus - Math.min(1, idleGpus);
}
}

Expand All @@ -115,14 +126,16 @@ public void removeGpu() {
*
*/
public void restoreGpu() {
if (idleGpuOrig > 0) {
if (idleGpuMemoryOrig > 0) {
idleMemory = idleMemoryOrig;
idleCores = idleCoresOrig;
idleGpu = idleGpuOrig;
idleGpuMemory = idleGpuMemoryOrig;
idleGpus = idleGpusOrig;

idleMemoryOrig = 0;
idleCoresOrig = 0;
idleGpuOrig = 0;
idleGpuMemoryOrig = 0;
idleGpusOrig = 0;
}
}
}
Expand Down
27 changes: 27 additions & 0 deletions cuebot/src/main/java/com/imageworks/spcue/ExecutionSummary.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ public class ExecutionSummary {
public long coreTime;
public long coreTimeSuccess;
public long coreTimeFail;
public long gpuTime;
public long gpuTimeSuccess;
public long gpuTimeFail;
public long highMemoryKb;

public long getHighMemoryKb() {
Expand Down Expand Up @@ -69,5 +72,29 @@ public long getCoreTimeFail() {
public void setCoreTimeFail(long coreTimeFail) {
this.coreTimeFail = coreTimeFail;
}

public long getGpuTime() {
return gpuTime;
}

public void setGpuTime(long gpuTime) {
this.gpuTime = gpuTime;
}

public long getGpuTimeSuccess() {
return gpuTimeSuccess;
}

public void setGpuTimeSuccess(long gpuTimeSuccess) {
this.gpuTimeSuccess = gpuTimeSuccess;
}

public long getGpuTimeFail() {
return gpuTimeFail;
}

public void setGpuTimeFail(long gpuTimeFail) {
this.gpuTimeFail = gpuTimeFail;
}
}

5 changes: 5 additions & 0 deletions cuebot/src/main/java/com/imageworks/spcue/GroupDetail.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,16 @@ public class GroupDetail extends Entity implements GroupInterface, DepartmentInt

public int jobMinCores = -1;
public int jobMaxCores = -1;
public int jobMinGpus = -1;
public int jobMaxGpus = -1;
public int jobPriority = -1;

public int minCores = -1;
public int maxCores = -1;

public int minGpus = -1;
public int maxGpus = -1;

public String parentId = null;
public String showId;
public String deptId;
Expand Down
20 changes: 12 additions & 8 deletions cuebot/src/main/java/com/imageworks/spcue/HostEntity.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,12 @@ public class HostEntity extends Entity implements HostInterface {
public int procs;
public int cores;
public int idleCores;
public int memory;
public int idleMemory;
public int gpu;
public int idleGpu;
public long memory;
public long idleMemory;
public int gpus;
public int idleGpus;
public long gpuMemory;
public long idleGpuMemory;

public boolean unlockAtBoot;

Expand All @@ -57,10 +59,12 @@ public HostEntity(Host grpcHost) {
this.nimbyEnabled = grpcHost.getNimbyEnabled();
this.cores = (int) grpcHost.getCores();
this.idleCores = (int) grpcHost.getIdleCores();
this.memory = (int) grpcHost.getMemory();
this.idleMemory = (int) grpcHost.getIdleMemory();
this.gpu = (int) grpcHost.getGpu();
this.idleGpu = (int) grpcHost.getIdleGpu();
this.memory = grpcHost.getMemory();
this.idleMemory = grpcHost.getIdleMemory();
this.gpus = (int) grpcHost.getGpus();
this.idleGpus = (int) grpcHost.getIdleGpus();
this.gpuMemory = grpcHost.getGpuMemory();
this.idleGpuMemory = grpcHost.getIdleGpuMemory();
}

public String getHostId() {
Expand Down
2 changes: 2 additions & 0 deletions cuebot/src/main/java/com/imageworks/spcue/Inherit.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ public enum Inherit {
Priority,
MinCores,
MaxCores,
MinGpus,
MaxGpus,
All
}

7 changes: 5 additions & 2 deletions cuebot/src/main/java/com/imageworks/spcue/JobDetail.java
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,15 @@ public class JobDetail extends JobEntity implements JobInterface, DepartmentInte
public int priority = 1;
public int minCoreUnits = 100;
public int maxCoreUnits = 200000;
public int minGpuUnits = 0;
public int maxGpuUnits = 1000;
public boolean isLocal = false;
public String localHostName;
public int localMaxCores;
public int localMaxMemory;
public long localMaxMemory;
public int localThreadNumber;
public int localMaxGpu;
public int localMaxGpus;
public long localMaxGpuMemory;

public String getDepartmentId() {
return deptId;
Expand Down
20 changes: 15 additions & 5 deletions cuebot/src/main/java/com/imageworks/spcue/LayerDetail.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,11 @@ public class LayerDetail extends LayerEntity implements LayerInterface {
public LayerType type;
public int minimumCores;
public int maximumCores;
public int minimumGpus;
public int maximumGpus;
public boolean isThreadable;
public long minimumMemory;
public long minimumGpu;
public long minimumGpuMemory;
public int chunkSize;
public int timeout;
public int timeout_llu;
Expand Down Expand Up @@ -116,12 +118,20 @@ public void setMinimumMemory(long minimumMemory) {
this.minimumMemory = minimumMemory;
}

public long getMinimumGpu() {
return minimumGpu;
public int getMinimumGpus() {
return minimumGpus;
}

public void setMinimumGpu(long minimumGpu) {
this.minimumGpu = minimumGpu;
public void setMinimumGpus(int minimumGpus) {
this.minimumGpus = minimumGpus;
}

public long getMinimumGpuMemory() {
return minimumGpuMemory;
}

public void setMinimumGpuMemory(long minimumGpuMemory) {
this.minimumGpuMemory = minimumGpuMemory;
}

public int getChunkSize() {
Expand Down
55 changes: 39 additions & 16 deletions cuebot/src/main/java/com/imageworks/spcue/LocalHostAssignment.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,13 @@ public class LocalHostAssignment extends Entity

private int idleCoreUnits;
private long idleMemory;
private long idleGpu;
private int idleGpuUnits;
private long idleGpuMemory;

private long maxMemory;
private long maxGpu;
private long maxGpuMemory;
private int maxCoreUnits;
private int maxGpuUnits;

private int threads;

Expand All @@ -52,34 +54,39 @@ public class LocalHostAssignment extends Entity

public LocalHostAssignment() { }

public LocalHostAssignment(int maxCores, int threads, long maxMemory, long maxGpu) {
public LocalHostAssignment(int maxCores, int threads, long maxMemory, int maxGpus, long maxGpuMemory) {
this.maxCoreUnits = maxCores;
this.threads = threads;
this.maxMemory = maxMemory;
this.maxGpu = maxGpu;
this.maxGpuUnits = maxGpus;
this.maxGpuMemory = maxGpuMemory;
}

@Override
public boolean hasAdditionalResources(int minCores, long minMemory, long minGpu) {
public boolean hasAdditionalResources(int minCores, long minMemory, int minGpus, long minGpuMemory) {

if (idleCoreUnits < minCores) {
return false;
}
else if (idleMemory < minMemory) {
return false;
}
else if (idleGpu < minGpu) {
else if (idleGpuUnits < minGpus) {
return false;
}
else if (idleGpuMemory < minGpuMemory) {
return false;
}

return true;
}

@Override
public void useResources(int coreUnits, long memory, long gpu) {
public void useResources(int coreUnits, long memory, int gpuUnits, long gpuMemory) {
idleCoreUnits = idleCoreUnits - coreUnits;
idleMemory = idleMemory - memory;
idleGpu = idleGpu - gpu;
idleGpuUnits = idleGpuUnits - gpuUnits;
idleGpuMemory = idleGpuMemory - gpuMemory;
}

public int getThreads() {
Expand Down Expand Up @@ -110,16 +117,24 @@ public long getIdleMemory() {
return this.idleMemory;
}

public long getMaxGpu() {
return maxGpu;
public int getMaxGpuUnits() {
return maxGpuUnits;
}

public void setMaxGpuUnits(int maxGpuUnits) {
this.maxGpuUnits = maxGpuUnits;
}

public long getMaxGpuMemory() {
return maxGpuMemory;
}

public void setMaxGpu(long maxGpu) {
this.maxGpu = maxGpu;
public void setMaxGpuMemory(long maxGpuMemory) {
this.maxGpuMemory = maxGpuMemory;
}

public long getIdleGpu() {
return this.idleGpu;
public long getIdleGpuMemory() {
return this.idleGpuMemory;
}

public int getIdleCoreUnits() {
Expand All @@ -134,8 +149,16 @@ public void setIdleMemory(long idleMemory) {
this.idleMemory = idleMemory;
}

public void setIdleGpu(long idleGpu) {
this.idleGpu = idleGpu;
public int getIdleGpuUnits() {
return this.idleGpuUnits;
}

public void setIdleGpuUnits(int idleGpuUnits) {
this.idleGpuUnits = idleGpuUnits;
}

public void setIdleGpuMemory(long idleGpuMemory) {
this.idleGpuMemory = idleGpuMemory;
}

public String getHostId() {
Expand Down
Loading

0 comments on commit c22fe12

Please sign in to comment.
  翻译: