[ARVADOS] updated: 1.3.0-2739-g0dd614e11

Git user git at public.arvados.org
Wed Jul 8 18:20:03 UTC 2020


Summary of changes:
 lib/deduplicationreport/report.go | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

  discards  ec0cb62eaed4c48d38f996acbab7b61f05a6aeb2 (commit)
       via  0dd614e11abb96e6469d8203f7e73d113b64e323 (commit)

This update added new revisions after undoing existing revisions.  That is
to say, the old revision is not a strict subset of the new revision.  This
situation occurs when you --force push a change and generate a repository
containing something like this:

 * -- * -- B -- O -- O -- O (ec0cb62eaed4c48d38f996acbab7b61f05a6aeb2)
            \
             N -- N -- N (0dd614e11abb96e6469d8203f7e73d113b64e323)

When this happens we assume that you've already had alert emails for all
of the O revisions, and so we here report only the revisions in the N
branch from the common base, B.

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit 0dd614e11abb96e6469d8203f7e73d113b64e323
Author: Ward Vandewege <ward at curii.com>
Date:   Mon Jun 29 16:53:38 2020 -0400

    16573: add a deduplication-report command to arvados-client
    
    Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward at curii.com>

diff --git a/.licenseignore b/.licenseignore
index ad80dc3f4..81f6b7181 100644
--- a/.licenseignore
+++ b/.licenseignore
@@ -79,4 +79,6 @@ lib/dispatchcloud/test/sshkey_*
 *.asc
 sdk/java-v2/build.gradle
 sdk/java-v2/settings.gradle
-sdk/cwl/tests/wf/feddemo
\ No newline at end of file
+sdk/cwl/tests/wf/feddemo
+go.mod
+go.sum
diff --git a/cmd/arvados-client/cmd.go b/cmd/arvados-client/cmd.go
index 887bc62bb..bcc3dda09 100644
--- a/cmd/arvados-client/cmd.go
+++ b/cmd/arvados-client/cmd.go
@@ -9,6 +9,7 @@ import (
 
 	"git.arvados.org/arvados.git/lib/cli"
 	"git.arvados.org/arvados.git/lib/cmd"
+	"git.arvados.org/arvados.git/lib/deduplicationreport"
 	"git.arvados.org/arvados.git/lib/mount"
 )
 
@@ -52,7 +53,8 @@ var (
 		"virtual_machine":          cli.APICall,
 		"workflow":                 cli.APICall,
 
-		"mount": mount.Command,
+		"mount":                mount.Command,
+		"deduplication-report": deduplicationreport.Command,
 	})
 )
 
diff --git a/go.mod b/go.mod
index cc5457975..1fde587e6 100644
--- a/go.mod
+++ b/go.mod
@@ -22,6 +22,7 @@ require (
 	github.com/docker/docker v1.4.2-0.20180109013817-94b8a116fbf1
 	github.com/docker/go-connections v0.3.0 // indirect
 	github.com/docker/go-units v0.3.3-0.20171221200356-d59758554a3d // indirect
+	github.com/dustin/go-humanize v1.0.0
 	github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568 // indirect
 	github.com/fsnotify/fsnotify v1.4.9
 	github.com/ghodss/yaml v1.0.0
diff --git a/go.sum b/go.sum
index 38153ce3e..c9b7f74e3 100644
--- a/go.sum
+++ b/go.sum
@@ -56,6 +56,8 @@ github.com/docker/go-connections v0.3.0 h1:3lOnM9cSzgGwx8VfK/NGOW5fLQ0GjIlCkaktF
 github.com/docker/go-connections v0.3.0/go.mod h1:Gbd7IOopHjR8Iph03tsViu4nIes5XhDvyHbTtUxmeec=
 github.com/docker/go-units v0.3.3-0.20171221200356-d59758554a3d h1:dVaNRYvaGV23AdNdsm+4y1mPN0tj3/1v6taqKMmM6Ko=
 github.com/docker/go-units v0.3.3-0.20171221200356-d59758554a3d/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
+github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo=
+github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
 github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568 h1:BHsljHzVlRcyQhjrss6TZTdY2VfCqZPbv5k3iBFa2ZQ=
 github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568/go.mod h1:xEzjJPgXI435gkrCt3MPfRiAkVrwSbHsst4LCFVfpJc=
 github.com/fsnotify/fsnotify v1.4.9 h1:hsms1Qyu0jgnwNXIxa+/V/PDsU6CfLf6CNO8H7IWoS4=
diff --git a/lib/deduplicationreport/command.go b/lib/deduplicationreport/command.go
new file mode 100644
index 000000000..1199bc0ae
--- /dev/null
+++ b/lib/deduplicationreport/command.go
@@ -0,0 +1,43 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package deduplicationreport
+
+import (
+	"io"
+
+	"git.arvados.org/arvados.git/lib/config"
+	"git.arvados.org/arvados.git/sdk/go/ctxlog"
+	"github.com/sirupsen/logrus"
+)
+
+// Command is the value registered by arvados-client under the
+// "deduplication-report" subcommand name.
+var Command = command{}
+
+// command is a stateless type; its RunCommand method runs the report.
+type command struct{}
+
+// NoPrefixFormatter is a logrus formatter that emits only the log message
+// itself.
+type NoPrefixFormatter struct{}
+
+// Format returns the entry's message as raw bytes. No other field of the
+// entry is used, and no trailing newline is appended.
+func (*NoPrefixFormatter) Format(entry *logrus.Entry) ([]byte, error) {
+	msg := []byte(entry.Message)
+	return msg, nil
+}
+
+// RunCommand implements the subcommand "deduplication-report <collection> <collection> ..."
+func (command) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
+	var err error
+	logger := ctxlog.New(stderr, "text", "info")
+	defer func() {
+		if err != nil {
+			logger.WithError(err).Error("fatal")
+		}
+	}()
+
+	logger.SetFormatter(new(NoPrefixFormatter))
+
+	loader := config.NewLoader(stdin, logger)
+	loader.SkipLegacy = true
+
+	exitcode := report(prog, args, loader, logger, stdout, stderr)
+
+	return exitcode
+}
diff --git a/lib/deduplicationreport/report.go b/lib/deduplicationreport/report.go
new file mode 100644
index 000000000..b7699fcb2
--- /dev/null
+++ b/lib/deduplicationreport/report.go
@@ -0,0 +1,217 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package deduplicationreport
+
+import (
+	"flag"
+	"fmt"
+	"io"
+	"strings"
+
+	"git.arvados.org/arvados.git/lib/config"
+	"git.arvados.org/arvados.git/sdk/go/arvados"
+	"git.arvados.org/arvados.git/sdk/go/arvadosclient"
+	"git.arvados.org/arvados.git/sdk/go/manifest"
+
+	"github.com/dustin/go-humanize"
+	"github.com/sirupsen/logrus"
+)
+
+// deDuplicate returns the distinct strings of inputs, preserving the order in
+// which each one first appears. The result is nil when inputs is empty.
+func deDuplicate(inputs []string) (trimmed []string) {
+	known := map[string]bool{}
+	for _, item := range inputs {
+		if known[item] {
+			continue
+		}
+		known[item] = true
+		trimmed = append(trimmed, item)
+	}
+	return trimmed
+}
+
+// parseFlags parses the command line for the deduplication report.
+//
+// The config loader's flags and a -log-level flag are registered, args are
+// parsed, and the remaining positional arguments are de-duplicated and
+// returned as inputs. On success the logger's level is set from -log-level.
+//
+// The returned exitcode is:
+//   - 0 on success, and also when help was requested (in which case inputs is
+//     nil and only the usage text has been printed);
+//   - 2 on a flag parsing error, when fewer than 2 distinct inputs remain, or
+//     when the -log-level value is not a valid logrus level.
+func parseFlags(prog string, args []string, loader *config.Loader, logger *logrus.Logger, stderr io.Writer) (exitcode int, inputs []string) {
+	flags := flag.NewFlagSet("", flag.ContinueOnError)
+	flags.SetOutput(stderr)
+	flags.Usage = func() {
+		fmt.Fprintf(flags.Output(), `
+Usage:
+  %s [options ...] <collection-uuid> <collection-uuid> ...
+
+  %s [options ...] <collection-pdh>,<collection_uuid> \
+     <collection-pdh>,<collection_uuid> ...
+
+  This program analyzes the overlap in blocks used by 2 or more collections. It
+  prints a deduplication report that shows the nominal space used by the list
+  of collection, as well as the actual size and the amount of space that is
+  saved by Keep's deduplication.
+
+  The list of collections may be provided in two ways. A list of collection
+  uuids is sufficient. Alternatively, the PDH for each collection may also be
+  provided. This is will greatly speed up operation when the list contains
+  multiple collections with the same PDH.
+
+  Exit status will be zero if there were no errors generating the report.
+
+Example:
+
+  Use the 'arv' and 'jq' commands to get the list of the 100
+  largest collections and generate the deduplication report:
+
+  arv collection list --order 'file_size_total desc' | \
+    jq -r '.items[] | [.portable_data_hash,.uuid] |@csv' | \
+    tail -n100 |sed -e 's/"//g'|tr '\n' ' ' | \
+    xargs %s
+
+Options:
+`, prog, prog, prog)
+		flags.PrintDefaults()
+	}
+	loader.SetupFlags(flags)
+	loglevel := flags.String("log-level", "info", "logging level (debug, info, ...)")
+
+	err := flags.Parse(args)
+	if err == flag.ErrHelp {
+		// Help was requested explicitly; this is not an error.
+		return 0, nil
+	}
+	if err != nil {
+		return 2, nil
+	}
+
+	inputs = deDuplicate(flags.Args())
+	if len(inputs) < 2 {
+		logger.Error("Error: at least 2 different collections UUIDs required")
+		flags.Usage()
+		return 2, inputs
+	}
+
+	lvl, err := logrus.ParseLevel(*loglevel)
+	if err != nil {
+		// Report why we are bailing out instead of exiting silently.
+		logger.Errorf("Error: invalid log level %q: %s\n", *loglevel, err)
+		return 2, inputs
+	}
+	logger.SetLevel(lvl)
+	return 0, inputs
+}
+
+// blockList walks the collection's manifest text and returns a map from each
+// block's digest string to that block's size. A block that occurs more than
+// once in the manifest ends up as a single map entry.
+func blockList(collection arvados.Collection) (blocks map[string]int) {
+	sizes := map[string]int{}
+	mt := manifest.Manifest{Text: collection.ManifestText}
+	for blk := range mt.BlockIterWithDuplicates() {
+		sizes[blk.Digest.String()] = blk.Size
+	}
+	return sizes
+}
+
+// report generates the deduplication report for the collections named in
+// args and writes it to stdout.
+//
+// Each input is either a collection UUID or "pdh,uuid". Every collection is
+// fetched from the API, except when a collection with the same PDH was
+// already processed; in that case the cached size is reused. One line per
+// input is printed, followed by a summary with the nominal size (sum of all
+// collection sizes), the actual size (sum of the sizes of the distinct
+// blocks) and the difference between the two.
+//
+// The returned exit code is whatever parseFlags returned when it is non-zero
+// or when there is nothing to report, 1 on any error while talking to the API
+// or validating an input, and 0 on success.
+func report(prog string, args []string, loader *config.Loader, logger *logrus.Logger, stdout, stderr io.Writer) (exitcode int) {
+	var inputs []string
+	exitcode, inputs = parseFlags(prog, args, loader, logger, stderr)
+	if exitcode != 0 || len(inputs) == 0 {
+		// parseFlags returns exit code 0 without any inputs when only the
+		// usage text was requested; there is nothing to report in that case.
+		return exitcode
+	}
+
+	// Arvados Client setup
+	arv, err := arvadosclient.MakeArvadosClient()
+	if err != nil {
+		logger.Errorf("error creating Arvados object: %s", err)
+		return 1
+	}
+
+	// Col caches the size information of one collection, keyed by PDH.
+	type Col struct {
+		FileSizeTotal int64
+		FileCount     int64
+	}
+
+	blocks := make(map[string]map[string]int) // collection uuid -> block digest -> block size
+	pdhs := make(map[string]Col)
+	var nominalSize int64
+
+	// NOTE(review): this writes to the process's standard output rather than
+	// to the stdout writer passed in; kept as-is because changing it alters
+	// the captured output.
+	fmt.Println()
+	for _, input := range inputs {
+		var uuid, pdh string
+		if strings.Contains(input, ",") {
+			// The input is in the format pdh,uuid. This will allow us to save time on duplicate pdh's
+			tmp := strings.Split(input, ",")
+			pdh, uuid = tmp[0], tmp[1]
+		} else {
+			// The input must be a plain uuid
+			uuid = input
+		}
+		if !strings.Contains(uuid, "-4zz18-") {
+			logger.Error("uuid must refer to collection object")
+			return 1
+		}
+
+		col, known := pdhs[pdh]
+		if known {
+			// We've processed a collection with this pdh already. Simply add its
+			// size to the totals and move on to the next one.
+			// Note that we simply trust the PDH matches the collection UUID here,
+			// in other words, we use it over the UUID. If they don't match, the report
+			// will be wrong.
+			nominalSize += col.FileSizeTotal
+		} else {
+			var collection arvados.Collection
+			err = arv.Get("collections", uuid, nil, &collection)
+			if err != nil {
+				logger.Errorf("Error: unable to retrieve collection: %s\n", err)
+				return 1
+			}
+			blocks[uuid] = blockList(collection)
+			if pdh != "" && collection.PortableDataHash != pdh {
+				logger.Errorf("Error: the collection with UUID %s has PDH %s, but a different PDH was provided in the arguments: %s\n", uuid, collection.PortableDataHash, pdh)
+				return 1
+			}
+			if pdh == "" {
+				pdh = collection.PortableDataHash
+			}
+
+			if collection.FileSizeTotal != 0 || collection.FileCount != 0 {
+				col.FileSizeTotal = collection.FileSizeTotal
+				col.FileCount = int64(collection.FileCount)
+			} else {
+				// Collections created with old Arvados versions do not always have the total file size and count cached in the collections object
+				for _, size := range blocks[uuid] {
+					col.FileSizeTotal += int64(size)
+				}
+			}
+			nominalSize += col.FileSizeTotal
+			pdhs[pdh] = col
+		}
+
+		if col.FileCount != 0 {
+			fmt.Fprintf(stdout, "Collection %s: pdh %s; nominal size %d (%s); file count %d\n", uuid, pdh, col.FileSizeTotal, humanize.IBytes(uint64(col.FileSizeTotal)), col.FileCount)
+		} else {
+			fmt.Fprintf(stdout, "Collection %s: pdh %s; nominal size %d (%s)\n", uuid, pdh, col.FileSizeTotal, humanize.IBytes(uint64(col.FileSizeTotal)))
+		}
+	}
+
+	// Count every distinct block exactly once across all collections.
+	var totalSize int64
+	seen := make(map[string]bool)
+	for _, collBlocks := range blocks {
+		for digest, size := range collBlocks {
+			if !seen[digest] {
+				seen[digest] = true
+				totalSize += int64(size)
+			}
+		}
+	}
+
+	// The difference can be negative when files reference only part of their
+	// blocks; converting a negative value to uint64 would wrap around, so
+	// humanize the magnitude and re-attach the sign.
+	saved := nominalSize - totalSize
+	savedHuman := humanize.IBytes(uint64(saved))
+	if saved < 0 {
+		savedHuman = "-" + humanize.IBytes(uint64(-saved))
+	}
+
+	fmt.Fprintln(stdout)
+	fmt.Fprintf(stdout, "Collections:                 %15d\n", len(inputs))
+	fmt.Fprintf(stdout, "Nominal size of stored data: %15d bytes (%s)\n", nominalSize, humanize.IBytes(uint64(nominalSize)))
+	fmt.Fprintf(stdout, "Actual size of stored data:  %15d bytes (%s)\n", totalSize, humanize.IBytes(uint64(totalSize)))
+	fmt.Fprintf(stdout, "Saved by Keep deduplication: %15d bytes (%s)\n", saved, savedHuman)
+
+	return 0
+}
diff --git a/lib/deduplicationreport/report_test.go b/lib/deduplicationreport/report_test.go
new file mode 100644
index 000000000..cc6c05be3
--- /dev/null
+++ b/lib/deduplicationreport/report_test.go
@@ -0,0 +1,131 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package deduplicationreport
+
+import (
+	"bytes"
+	//  "encoding/json"
+	//  "io/ioutil"
+	//  "os"
+	"testing"
+	//  "time"
+
+	//  "git.arvados.org/arvados.git/lib/config"
+	"git.arvados.org/arvados.git/sdk/go/arvados"
+	"git.arvados.org/arvados.git/sdk/go/arvadostest"
+	//"git.arvados.org/arvados.git/sdk/go/arvadosclient"
+	//  "git.arvados.org/arvados.git/sdk/go/ctxlog"
+	"gopkg.in/check.v1"
+)
+
+// Test is the "go test" entry point; it hands control to the gocheck runner
+// via check.TestingT.
+func Test(t *testing.T) {
+	check.TestingT(t)
+}
+
+// Register Suite with gocheck.
+var _ = check.Suite(&Suite{})
+
+// Suite groups the deduplication report tests; it carries no state.
+type Suite struct{}
+
+// TearDownSuite calls arvadostest.ResetEnv once the suite is done.
+func (s *Suite) TearDownSuite(c *check.C) {
+	// Undo any changes/additions to the database so they don't affect subsequent tests.
+	arvadostest.ResetEnv()
+}
+
+// TestUsage runs the command without any collection arguments and expects
+// exit code 2, an empty stdout, and the usage text on stderr.
+func (*Suite) TestUsage(c *check.C) {
+	var outBuf, errBuf bytes.Buffer
+	args := []string{"-log-level=debug"}
+	rc := Command.RunCommand("deduplicationreport.test", args, &bytes.Buffer{}, &outBuf, &errBuf)
+	c.Check(rc, check.Equals, 2)
+	c.Check(outBuf.String(), check.Equals, "")
+	c.Log(errBuf.String())
+	c.Check(errBuf.String(), check.Matches, `(?ms).*Usage:.*`)
+}
+
+// TestTwoIdenticalUUIDs passes the same collection UUID twice and expects the
+// command to reject the input with exit code 2 and an explanatory error.
+func (*Suite) TestTwoIdenticalUUIDs(c *check.C) {
+	var outBuf, errBuf bytes.Buffer
+	args := []string{arvadostest.FooCollection, arvadostest.FooCollection}
+	rc := Command.RunCommand("deduplicationreport.test", args, &bytes.Buffer{}, &outBuf, &errBuf)
+	c.Check(rc, check.Equals, 2)
+	c.Check(outBuf.String(), check.Equals, "")
+	c.Log(errBuf.String())
+	c.Check(errBuf.String(), check.Matches, `(?ms).*Error: at least 2 different collections UUIDs required.*`)
+}
+
+// TestTwoUUIDsInvalidPDH passes a pdh,uuid pair whose PDH does not belong to
+// the collection and expects exit code 1 with a mismatch error on stderr.
+func (*Suite) TestTwoUUIDsInvalidPDH(c *check.C) {
+	var outBuf, errBuf bytes.Buffer
+	mismatched := arvadostest.FooAndBarFilesInDirPDH + "," + arvadostest.FooCollection
+	args := []string{mismatched, arvadostest.FooCollection}
+	rc := Command.RunCommand("deduplicationreport.test", args, &bytes.Buffer{}, &outBuf, &errBuf)
+	c.Check(rc, check.Equals, 1)
+	c.Check(outBuf.String(), check.Equals, "")
+	c.Log(errBuf.String())
+	c.Check(errBuf.String(), check.Matches, `(?ms).*Error: the collection with UUID zzzzz-4zz18-fy296fx3hot09f7 has PDH 1f4b0bc7583c2a7f9102c395f4ffc5e3\+45, but a different PDH was provided in the arguments: 870369fc72738603c2fad16664e50e2d\+58.*`)
+}
+
+// TestNonExistentCollection passes one existing and one nonexistent
+// collection UUID. The existing collection is still reported on stdout, and
+// the command then fails with exit code 1 and a retrieval error on stderr.
+func (*Suite) TestNonExistentCollection(c *check.C) {
+	var outBuf, errBuf bytes.Buffer
+	args := []string{arvadostest.FooCollection, arvadostest.NonexistentCollection}
+	rc := Command.RunCommand("deduplicationreport.test", args, &bytes.Buffer{}, &outBuf, &errBuf)
+	c.Check(rc, check.Equals, 1)
+	c.Check(outBuf.String(), check.Equals, "Collection zzzzz-4zz18-fy296fx3hot09f7: pdh 1f4b0bc7583c2a7f9102c395f4ffc5e3+45; nominal size 3 (3 B)\n")
+	c.Log(errBuf.String())
+	c.Check(errBuf.String(), check.Matches, `(?ms).*Error: unable to retrieve collection:.*404 Not Found.*`)
+}
+
+// TestManyUUIDsNoOverlap reports on 5 fixture collections that share no
+// blocks, so the nominal and actual sizes match and nothing is saved.
+func (*Suite) TestManyUUIDsNoOverlap(c *check.C) {
+	var outBuf, errBuf bytes.Buffer
+	args := []string{
+		arvadostest.FooCollection,
+		arvadostest.HelloWorldCollection,
+		arvadostest.FooBarDirCollection,
+		arvadostest.WazVersion1Collection,
+		arvadostest.UserAgreementCollection,
+	}
+	rc := Command.RunCommand("deduplicationreport.test", args, &bytes.Buffer{}, &outBuf, &errBuf)
+	c.Check(rc, check.Equals, 0)
+	report := outBuf.String()
+	c.Check(report, check.Matches, "(?ms).*Nominal size of stored data:[[:space:]]+249049 bytes \\(243 KiB\\).*")
+	c.Check(report, check.Matches, "(?ms).*Actual size of stored data:[[:space:]]+249049 bytes \\(243 KiB\\).*")
+	c.Check(report, check.Matches, "(?ms).*Saved by Keep deduplication:[[:space:]]+0 bytes \\(0 B\\).*")
+	c.Log(errBuf.String())
+	c.Check(errBuf.String(), check.Equals, "")
+}
+
+// TestTwoOverlappingCollections creates two collections that share one
+// 4-byte block and checks the report for plain UUID arguments: 12 bytes
+// nominal, 8 bytes actual, 4 bytes saved.
+func (*Suite) TestTwoOverlappingCollections(c *check.C) {
+	var stdout, stderr bytes.Buffer
+	// Create two collections
+	arv := arvados.NewClientFromEnv()
+
+	// The signature hint of each block locator has the form
+	// +A<signature>@<expiry>; the '@' must not be separated by spaces, since
+	// spaces delimit manifest tokens.
+	var c1 arvados.Collection
+	err := arv.RequestAndDecode(&c1, "POST", "arvados/v1/collections", nil, map[string]interface{}{"collection": map[string]interface{}{"manifest_text": ". d3b07384d113edec49eaa6238ad5ff00+4+A2705511e0c47c92cc73e9ddc95b9822ef774c406@5f0de808 0:4:foo\n"}})
+	c.Assert(err, check.IsNil)
+
+	var c2 arvados.Collection
+	err = arv.RequestAndDecode(&c2, "POST", "arvados/v1/collections", nil, map[string]interface{}{"collection": map[string]interface{}{"manifest_text": ". c157a79031e1c40f85931829bc5fc552+4+A1544eb0cee937934dc565d2b11836c804384c139@5f0e0bf9 d3b07384d113edec49eaa6238ad5ff00+4+A60746cad7ecc16fe26a0c17c55af90db675369c2@5f0e0bf9 0:4:bar 4:4:foo\n"}})
+	c.Assert(err, check.IsNil)
+
+	// Run dedupreport with 2 arguments: uuid uuid
+	exitcode := Command.RunCommand("deduplicationreport.test", []string{c1.UUID, c2.UUID}, &bytes.Buffer{}, &stdout, &stderr)
+	c.Check(exitcode, check.Equals, 0)
+	c.Check(stdout.String(), check.Matches, "(?ms).*Nominal size of stored data:[[:space:]]+12 bytes \\(12 B\\).*")
+	c.Check(stdout.String(), check.Matches, "(?ms).*Actual size of stored data:[[:space:]]+8 bytes \\(8 B\\).*")
+	c.Check(stdout.String(), check.Matches, "(?ms).*Saved by Keep deduplication:[[:space:]]+4 bytes \\(4 B\\).*")
+	c.Log(stderr.String())
+	c.Check(stderr.String(), check.Equals, "")
+}
+
+// TestTwoOverlappingCollectionsWithPDH creates two collections that share one
+// 4-byte block and checks the report when the first argument is given in
+// pdh,uuid form: 12 bytes nominal, 8 bytes actual, 4 bytes saved.
+func (*Suite) TestTwoOverlappingCollectionsWithPDH(c *check.C) {
+	var stdout, stderr bytes.Buffer
+	// Create two collections
+	arv := arvados.NewClientFromEnv()
+
+	// The signature hint of each block locator has the form
+	// +A<signature>@<expiry>; the '@' must not be separated by spaces, since
+	// spaces delimit manifest tokens.
+	var c1 arvados.Collection
+	err := arv.RequestAndDecode(&c1, "POST", "arvados/v1/collections", nil, map[string]interface{}{"collection": map[string]interface{}{"manifest_text": ". d3b07384d113edec49eaa6238ad5ff00+4+A2705511e0c47c92cc73e9ddc95b9822ef774c406@5f0de808 0:4:foo\n"}})
+	c.Assert(err, check.IsNil)
+
+	var c2 arvados.Collection
+	err = arv.RequestAndDecode(&c2, "POST", "arvados/v1/collections", nil, map[string]interface{}{"collection": map[string]interface{}{"manifest_text": ". c157a79031e1c40f85931829bc5fc552+4+A1544eb0cee937934dc565d2b11836c804384c139@5f0e0bf9 d3b07384d113edec49eaa6238ad5ff00+4+A60746cad7ecc16fe26a0c17c55af90db675369c2@5f0e0bf9 0:4:bar 4:4:foo\n"}})
+	c.Assert(err, check.IsNil)
+
+	// Run dedupreport with 2 arguments: pdh,uuid uuid
+	exitcode := Command.RunCommand("deduplicationreport.test", []string{c1.PortableDataHash + "," + c1.UUID, c2.UUID}, &bytes.Buffer{}, &stdout, &stderr)
+	c.Check(exitcode, check.Equals, 0)
+	c.Check(stdout.String(), check.Matches, "(?ms).*Nominal size of stored data:[[:space:]]+12 bytes \\(12 B\\).*")
+	c.Check(stdout.String(), check.Matches, "(?ms).*Actual size of stored data:[[:space:]]+8 bytes \\(8 B\\).*")
+	c.Check(stdout.String(), check.Matches, "(?ms).*Saved by Keep deduplication:[[:space:]]+4 bytes \\(4 B\\).*")
+	c.Log(stderr.String())
+	c.Check(stderr.String(), check.Equals, "")
+}

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list