[ARVADOS] created: 1.3.0-2739-g34969e57b

Git user git at public.arvados.org
Mon Jun 29 20:55:55 UTC 2020


        at  34969e57b539593179c787cc832af821f2cd4c7e (commit)


commit 34969e57b539593179c787cc832af821f2cd4c7e
Author: Ward Vandewege <ward at curii.com>
Date:   Mon Jun 29 16:53:38 2020 -0400

    16573: first version of the deduplication report
    
    Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward at curii.com>

diff --git a/.licenseignore b/.licenseignore
index ad80dc3f4..81f6b7181 100644
--- a/.licenseignore
+++ b/.licenseignore
@@ -79,4 +79,6 @@ lib/dispatchcloud/test/sshkey_*
 *.asc
 sdk/java-v2/build.gradle
 sdk/java-v2/settings.gradle
-sdk/cwl/tests/wf/feddemo
\ No newline at end of file
+sdk/cwl/tests/wf/feddemo
+go.mod
+go.sum
diff --git a/cmd/arvados-client/cmd.go b/cmd/arvados-client/cmd.go
index 887bc62bb..e99e73c73 100644
--- a/cmd/arvados-client/cmd.go
+++ b/cmd/arvados-client/cmd.go
@@ -10,6 +10,7 @@ import (
 	"git.arvados.org/arvados.git/lib/cli"
 	"git.arvados.org/arvados.git/lib/cmd"
 	"git.arvados.org/arvados.git/lib/mount"
+	"git.arvados.org/arvados.git/lib/deduplicationreport"
 )
 
 var (
@@ -52,7 +53,8 @@ var (
 		"virtual_machine":          cli.APICall,
 		"workflow":                 cli.APICall,
 
-		"mount": mount.Command,
+		"mount":                    mount.Command,
+		"deduplication-report":     deduplicationreport.Command,
 	})
 )
 
diff --git a/go.mod b/go.mod
index cc5457975..1fde587e6 100644
--- a/go.mod
+++ b/go.mod
@@ -22,6 +22,7 @@ require (
 	github.com/docker/docker v1.4.2-0.20180109013817-94b8a116fbf1
 	github.com/docker/go-connections v0.3.0 // indirect
 	github.com/docker/go-units v0.3.3-0.20171221200356-d59758554a3d // indirect
+	github.com/dustin/go-humanize v1.0.0
 	github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568 // indirect
 	github.com/fsnotify/fsnotify v1.4.9
 	github.com/ghodss/yaml v1.0.0
diff --git a/go.sum b/go.sum
index 38153ce3e..c9b7f74e3 100644
--- a/go.sum
+++ b/go.sum
@@ -56,6 +56,8 @@ github.com/docker/go-connections v0.3.0 h1:3lOnM9cSzgGwx8VfK/NGOW5fLQ0GjIlCkaktF
 github.com/docker/go-connections v0.3.0/go.mod h1:Gbd7IOopHjR8Iph03tsViu4nIes5XhDvyHbTtUxmeec=
 github.com/docker/go-units v0.3.3-0.20171221200356-d59758554a3d h1:dVaNRYvaGV23AdNdsm+4y1mPN0tj3/1v6taqKMmM6Ko=
 github.com/docker/go-units v0.3.3-0.20171221200356-d59758554a3d/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
+github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo=
+github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
 github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568 h1:BHsljHzVlRcyQhjrss6TZTdY2VfCqZPbv5k3iBFa2ZQ=
 github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568/go.mod h1:xEzjJPgXI435gkrCt3MPfRiAkVrwSbHsst4LCFVfpJc=
 github.com/fsnotify/fsnotify v1.4.9 h1:hsms1Qyu0jgnwNXIxa+/V/PDsU6CfLf6CNO8H7IWoS4=
diff --git a/lib/deduplicationreport/command.go b/lib/deduplicationreport/command.go
new file mode 100644
index 000000000..cc679ba90
--- /dev/null
+++ b/lib/deduplicationreport/command.go
@@ -0,0 +1,51 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package deduplicationreport
+
+import (
+  "io"
+  //"log"
+  //"net/http"
+  _ "net/http/pprof"
+  //"os"
+
+  //"git.arvados.org/arvados.git/sdk/go/arvados"
+  //"git.arvados.org/arvados.git/sdk/go/arvadosclient"
+  //"git.arvados.org/arvados.git/sdk/go/keepclient"
+
+  "git.arvados.org/arvados.git/lib/config"
+  "git.arvados.org/arvados.git/sdk/go/ctxlog"
+  "github.com/sirupsen/logrus"
+)
+
+// Command is the handler value registered for the "deduplication-report"
+// subcommand; its RunCommand method does the work.
+var Command = command{}
+
+// command is a stateless type that exists only to carry RunCommand.
+type command struct{}
+
+// NoPrefixFormatter is a logrus formatter that emits only the text of a
+// log entry's message.
+type NoPrefixFormatter struct{}
+
+// Format returns the bytes of entry.Message and a nil error. Nothing is
+// added to the message: no level, no fields, and no trailing newline, so
+// callers that want line-oriented output must put "\n" in the message.
+func (f *NoPrefixFormatter) Format(entry *logrus.Entry) ([]byte, error) {
+	message := []byte(entry.Message)
+	return message, nil
+}
+
+// RunCommand implements the subcommand "deduplication-report <collection> <collection> ..."
+func (command) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
+  var err error
+  logger := ctxlog.New(stderr, "text", "info")
+  defer func() {
+    if err != nil {
+      logger.WithError(err).Error("fatal")
+    }
+  }()
+
+  logger.SetFormatter(new(NoPrefixFormatter))
+
+  loader := config.NewLoader(stdin, logger)
+  loader.SkipLegacy = true
+
+  exitcode := report(prog, args, loader, logger, stderr)
+
+  return exitcode
+}
diff --git a/lib/deduplicationreport/report.go b/lib/deduplicationreport/report.go
new file mode 100644
index 000000000..a7d45cd31
--- /dev/null
+++ b/lib/deduplicationreport/report.go
@@ -0,0 +1,216 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package deduplicationreport
+
+import (
+  "flag"
+  "fmt"
+  //"log"
+  "io"
+  "net/http"
+  _ "net/http/pprof"
+  //"os"
+  "strings"
+
+  "git.arvados.org/arvados.git/lib/config"
+  "git.arvados.org/arvados.git/sdk/go/arvados"
+  "git.arvados.org/arvados.git/sdk/go/arvadosclient"
+  "git.arvados.org/arvados.git/sdk/go/manifest"
+
+  "github.com/dustin/go-humanize"
+  "github.com/sirupsen/logrus"
+)
+
+// deDuplicate returns the distinct strings found in inputs, keeping the
+// order in which each one first appears. The result is nil when inputs is
+// empty.
+func deDuplicate(inputs []string) (trimmed []string) {
+	known := make(map[string]bool)
+	for _, item := range inputs {
+		if known[item] {
+			continue
+		}
+		known[item] = true
+		trimmed = append(trimmed, item)
+	}
+	return trimmed
+}
+
+// parseFlags parses the command line for the deduplication report.
+//
+// It registers the config loader's flags and a -log-level flag on a fresh
+// flag set whose output goes to stderr, parses args, removes duplicate
+// positional arguments, and applies the requested level to logger.
+//
+// Return values:
+//   - exitcode 0 and the de-duplicated inputs on success;
+//   - exitcode 0 and nil inputs when help was requested (the flag package
+//     has already printed the usage text in that case);
+//   - exitcode 2 when the flags cannot be parsed, when fewer than two
+//     distinct inputs remain, or when the log level is not recognized.
+func parseFlags(prog string, args []string, loader *config.Loader, logger *logrus.Logger, stderr io.Writer) (exitcode int, inputs []string) {
+	flags := flag.NewFlagSet("", flag.ContinueOnError)
+	flags.SetOutput(stderr)
+	flags.Usage = func() {
+		fmt.Fprintf(flags.Output(), `
+Usage:
+  %s [options ...] <collection-uuid> <collection-uuid> ...
+
+  %s [options ...] <collection-pdh>,<collection-uuid> \
+     <collection-pdh>,<collection-uuid> ...
+
+  This program analyzes the overlap in blocks used by 2 or more collections. It
+  prints a deduplication report that shows the nominal space used by the list
+  of collections, as well as the actual size and the amount of space that is
+  saved by Keep's deduplication.
+
+  The list of collections may be provided in two ways. A list of collection
+  uuids is sufficient. Alternatively, the PDH for each collection may also be
+  provided. This will greatly speed up operation when the list contains
+  multiple collections with the same PDH.
+
+  Exit status will be zero if there were no errors generating the report.
+
+Example:
+
+  Use the 'arv' and 'jq' commands to get the list of the 100
+  largest collections and generate the deduplication report:
+
+  arv collection list --order 'file_size_total desc' | \
+    jq -r '.items[] | [.portable_data_hash,.uuid] |@csv' | \
+    tail -n100 |sed -e 's/"//g'|tr '\n' ' ' | \
+    xargs %s
+
+Options:
+`, prog, prog, prog)
+		flags.PrintDefaults()
+	}
+	loader.SetupFlags(flags)
+	loglevel := flags.String("log-level", "info", "logging level (debug, info, ...)")
+
+	err := flags.Parse(args)
+	if err == flag.ErrHelp {
+		// Asking for help is not an error; there are no inputs to return.
+		return 0, nil
+	} else if err != nil {
+		return 2, nil
+	}
+
+	inputs = deDuplicate(flags.Args())
+	if len(inputs) < 2 {
+		// The report compares collections, so at least two distinct ones
+		// are required.
+		flags.Usage()
+		return 2, inputs
+	}
+
+	lvl, err := logrus.ParseLevel(*loglevel)
+	if err != nil {
+		// Say why we are bailing out instead of exiting silently.
+		fmt.Fprintf(stderr, "invalid -log-level value: %s\n", err)
+		return 2, inputs
+	}
+	logger.SetLevel(lvl)
+	return 0, inputs
+}
+
+// blockList parses the collection's manifest text and returns a map from
+// block digest (as a string) to block size. A digest that occurs more than
+// once in the manifest ends up as a single map entry.
+func blockList(collection arvados.Collection) (blocks map[string]int) {
+	blocks = map[string]int{}
+	parsed := manifest.Manifest{Text: collection.ManifestText}
+	for locator := range parsed.BlockIterWithDuplicates() {
+		digest := locator.Digest.String()
+		blocks[digest] = locator.Size
+	}
+	return blocks
+}
+
+// report parses the command line, retrieves every requested collection and
+// logs a per-collection summary followed by three totals: the nominal size
+// (sum of the collection sizes), the actual size (each distinct block
+// counted once) and the difference between the two.
+//
+// Each input is either "<uuid>" or "<pdh>,<uuid>". When a pdh is given and a
+// collection with that pdh has already been processed, the collection is not
+// fetched again; only its size is added to the nominal total.
+//
+// The exit code is 0 on success (including when only help was requested),
+// the non-zero code from parseFlags on a usage error, and 1 when the Arvados
+// client cannot be created, an input does not look like a collection uuid,
+// or a collection cannot be retrieved.
+func report(prog string, args []string, loader *config.Loader, logger *logrus.Logger, stderr io.Writer) (exitcode int) {
+	var inputs []string
+	exitcode, inputs = parseFlags(prog, args, loader, logger, stderr)
+	// parseFlags returns exit code 0 without any inputs when help was
+	// requested; there is nothing to report on in that case.
+	if exitcode != 0 || len(inputs) == 0 {
+		return exitcode
+	}
+
+	// This file blank-imports net/http/pprof, which registers its handlers
+	// on the default mux; passing a nil handler serves that mux. The error
+	// returned when the listener stops (or cannot start) is only logged.
+	go func() {
+		logger.Info(http.ListenAndServe("localhost:6070", nil))
+	}()
+
+	// Arvados Client setup
+	arv, err := arvadosclient.MakeArvadosClient()
+	if err != nil {
+		logger.Errorf("error creating Arvados object: %s\n", err)
+		return 1
+	}
+
+	// summary holds the figures logged for one collection.
+	type summary struct {
+		fileSizeTotal int64
+		fileCount     int64
+	}
+
+	// blocks maps collection uuid -> block digest -> block size.
+	blocks := make(map[string]map[string]int)
+	// byPDH remembers the summary of each collection for which the caller
+	// supplied a pdh, so that a repeated pdh does not trigger another fetch.
+	byPDH := make(map[string]summary)
+	var nominalSize int64
+
+	logger.Info("\n")
+	for _, input := range inputs {
+		var uuid, pdh string
+		if strings.Contains(input, ",") {
+			// The input is in the format pdh,uuid. This allows us to save
+			// time on duplicate pdh's.
+			parts := strings.Split(input, ",")
+			pdh, uuid = parts[0], parts[1]
+		} else {
+			// The input must be a plain uuid
+			uuid = input
+		}
+		if !strings.Contains(uuid, "-4zz18-") {
+			logger.Error("uuid must refer to collection object\n")
+			return 1
+		}
+
+		// Only consult the cache when a pdh was actually supplied. Plain
+		// uuid inputs all have an empty pdh and must each be fetched;
+		// otherwise every one after the first would be mistaken for a
+		// duplicate of the first.
+		col, seen := byPDH[pdh]
+		if pdh != "" && seen {
+			// We've processed the collection with the pdh already. Simply
+			// add its size to the totals and move on to the next one.
+			nominalSize += col.fileSizeTotal
+		} else {
+			var collection arvados.Collection
+			err = arv.Get("collections", uuid, nil, &collection)
+			if err != nil {
+				logger.Errorf("error getting collection: %s\n", err)
+				return 1
+			}
+			blocks[uuid] = blockList(collection)
+
+			col = summary{}
+			if collection.FileSizeTotal != 0 || collection.FileCount != 0 {
+				col.fileSizeTotal = collection.FileSizeTotal
+				col.fileCount = int64(collection.FileCount)
+			} else {
+				// Collections created with old Arvados versions do not
+				// always have the total file size and count cached in the
+				// collections object; fall back to summing the sizes of
+				// the blocks found in the manifest.
+				for _, size := range blocks[uuid] {
+					col.fileSizeTotal += int64(size)
+				}
+			}
+			nominalSize += col.fileSizeTotal
+			if pdh != "" {
+				byPDH[pdh] = col
+			}
+		}
+
+		if col.fileCount != 0 {
+			logger.Infof("Collection %s: pdh %s; nominal size %d (%s); file count %d\n", uuid, pdh, col.fileSizeTotal, humanize.IBytes(uint64(col.fileSizeTotal)), col.fileCount)
+		} else {
+			logger.Infof("Collection %s: pdh %s; nominal size %d (%s)\n", uuid, pdh, col.fileSizeTotal, humanize.IBytes(uint64(col.fileSizeTotal)))
+		}
+	}
+
+	// Count every distinct block digest once across all fetched collections.
+	var totalSize int64
+	counted := make(map[string]bool)
+	for _, sizes := range blocks {
+		for digest, size := range sizes {
+			if !counted[digest] {
+				counted[digest] = true
+				totalSize += int64(size)
+			}
+		}
+	}
+	logger.Info("\n")
+	logger.Infof("Nominal size of stored data: %15d bytes (%s)\n", nominalSize, humanize.IBytes(uint64(nominalSize)))
+	logger.Infof("Actual size of stored data:  %15d bytes (%s)\n", totalSize, humanize.IBytes(uint64(totalSize)))
+	logger.Infof("Saved by Keep deduplication: %15d bytes (%s)\n", nominalSize-totalSize, humanize.IBytes(uint64(nominalSize-totalSize)))
+
+	return 0
+}
+

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list