[arvados] created: 2.1.0-3154-g044654c3f

Tue Dec 6 17:16:54 UTC 2022

at  044654c3fc20e8cd98cbea88f1681ee394347b8c (commit)


commit 044654c3fc20e8cd98cbea88f1681ee394347b8c
Author: Peter Amstutz <peter.amstutz at curii.com>
Date:   Tue Dec 6 12:15:11 2022 -0500

    19847: Add calculation for choosing keep disk cache size.
    
    Add KeepCacheTypeRequirement to control which cache to use, to
    facilitate performance comparison.
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz at curii.com>

diff --git a/sdk/cwl/arvados_cwl/__init__.py b/sdk/cwl/arvados_cwl/__init__.py
index 550ecba1c..9135ff674 100644
--- a/sdk/cwl/arvados_cwl/__init__.py
+++ b/sdk/cwl/arvados_cwl/__init__.py
@@ -280,6 +280,7 @@ def add_arv_hints():
         "http://commonwl.org/cwltool#CUDARequirement",
         "http://arvados.org/cwl#UsePreemptible",
         "http://arvados.org/cwl#OutputCollectionProperties",
+        "http://arvados.org/cwl#KeepCacheTypeRequirement",
     ])
 
 def exit_signal_handler(sigcode, frame):
diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml
index 54e0fc512..fc370eb81 100644
--- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml
+++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml
@@ -420,3 +420,39 @@ $graph:
       jsonldPredicate:
         mapSubject: propertyName
         mapPredicate: propertyValue
+
+
+- name: KeepCacheType
+  type: enum
+  symbols:
+    - ram_cache
+    - disk_cache
+  doc:
+    - |
+        ram_cache: Keep blocks will be cached in RAM only.
+    - |
+        disk_cache: Keep blocks will be cached to disk and
+        memory-mapped.  The disk cache leverages the kernel's virtual
+        memory system so "hot" data will generally still be kept in
+        RAM.
+
+- name: KeepCacheTypeRequirement
+  type: record
+  extends: cwl:ProcessRequirement
+  inVocab: false
+  doc: |
+    Choose the Keep cache strategy (RAM-only or disk-backed and memory-mapped).
+  fields:
+    - name: class
+      type: string
+      doc: "'arv:KeepCacheTypeRequirement'"
+      jsonldPredicate:
+        _id: "@type"
+        _type: "@vocab"
+    - name: keepCacheType
+      type: KeepCacheType?
+      doc: |
+        Whether Keep blocks loaded by arv-mount should be kept in RAM
+        only or written to disk and memory-mapped.  The disk cache
+        leverages the kernel's virtual memory system so "hot" data will
+        generally still be kept in RAM.
diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml
index b60d0ab1c..69c0ed6cf 100644
--- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml
+++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml
@@ -363,3 +363,39 @@ $graph:
       jsonldPredicate:
         mapSubject: propertyName
         mapPredicate: propertyValue
+
+
+- name: KeepCacheType
+  type: enum
+  symbols:
+    - ram_cache
+    - disk_cache
+  doc:
+    - |
+        ram_cache: Keep blocks will be cached in RAM only.
+    - |
+        disk_cache: Keep blocks will be cached to disk and
+        memory-mapped.  The disk cache leverages the kernel's virtual
+        memory system so "hot" data will generally still be kept in
+        RAM.
+
+- name: KeepCacheTypeRequirement
+  type: record
+  extends: cwl:ProcessRequirement
+  inVocab: false
+  doc: |
+    Choose the Keep cache strategy (RAM-only or disk-backed and memory-mapped).
+  fields:
+    - name: class
+      type: string
+      doc: "'arv:KeepCacheTypeRequirement'"
+      jsonldPredicate:
+        _id: "@type"
+        _type: "@vocab"
+    - name: keepCacheType
+      type: KeepCacheType?
+      doc: |
+        Whether Keep blocks loaded by arv-mount should be kept in RAM
+        only or written to disk and memory-mapped.  The disk cache
+        leverages the kernel's virtual memory system so "hot" data will
+        generally still be kept in RAM.
diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml
index 2769244a5..86cd06eff 100644
--- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml
+++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml
@@ -365,3 +365,39 @@ $graph:
       jsonldPredicate:
         mapSubject: propertyName
         mapPredicate: propertyValue
+
+
+- name: KeepCacheType
+  type: enum
+  symbols:
+    - ram_cache
+    - disk_cache
+  doc:
+    - |
+        ram_cache: Keep blocks will be cached in RAM only.
+    - |
+        disk_cache: Keep blocks will be cached to disk and
+        memory-mapped.  The disk cache leverages the kernel's virtual
+        memory system so "hot" data will generally still be kept in
+        RAM.
+
+- name: KeepCacheTypeRequirement
+  type: record
+  extends: cwl:ProcessRequirement
+  inVocab: false
+  doc: |
+    Choose the Keep cache strategy (RAM-only or disk-backed and memory-mapped).
+  fields:
+    - name: class
+      type: string
+      doc: "'arv:KeepCacheTypeRequirement'"
+      jsonldPredicate:
+        _id: "@type"
+        _type: "@vocab"
+    - name: keepCacheType
+      type: KeepCacheType?
+      doc: |
+        Whether Keep blocks loaded by arv-mount should be kept in RAM
+        only or written to disk and memory-mapped.  The disk cache
+        leverages the kernel's virtual memory system so "hot" data will
+        generally still be kept in RAM.
diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py
index 6fcf366e0..fde9db384 100644
--- a/sdk/cwl/arvados_cwl/arvcontainer.py
+++ b/sdk/cwl/arvados_cwl/arvcontainer.py
@@ -264,10 +264,15 @@ class ArvadosContainer(JobBase):
         if api_req:
             runtime_constraints["API"] = True
 
+        use_disk_cache = (self.arvrunner.api.config()["Containers"].get("DefaultKeepCacheDisk", 0) > 0)
+
         runtime_req, _ = self.get_requirement("http://arvados.org/cwl#RuntimeConstraints")
         if runtime_req:
+            if "keepCacheType" in runtime_req:
+                if cache_type == "ram_cache":
+                    use_disk_cache = False
             if "keep_cache" in runtime_req:
-                if self.arvrunner.api.config()["Containers"].get("DefaultKeepCacheDisk", 0) > 0:
+                if use_disk_cache:
                     # If DefaultKeepCacheDisk is non-zero it means we should use disk cache.
                     runtime_constraints["keep_cache_disk"] = math.ceil(runtime_req["keep_cache"] * 2**20)
                 else:
@@ -282,6 +287,13 @@ class ArvadosContainer(JobBase):
                         "writable": True
                     }
 
+        if use_disk_cache and "keep_cache_disk" not in runtime_constraints:
+            # Cache size wasn't explicitly set so calculate a default
+            # based on 2x RAM request or 1 GB per core, whichever is
+            # smaller.  This is to avoid requesting 100s of GB of disk
+            # cache when requesting a node with a huge amount of RAM.
+            runtime_constraints["keep_cache_disk"] = min(runtime_constraints["ram"] * 2, runtime_constraints["vcpus"] * (1024*1024*1024))
+
         partition_req, _ = self.get_requirement("http://arvados.org/cwl#PartitionRequirement")
         if partition_req:
             scheduling_parameters["partitions"] = aslist(partition_req["partition"])

-----------------------------------------------------------------------


hooks/post-receive
--