[ARVADOS] updated: 1.3.0-2739-g0dd614e11
Git user
git at public.arvados.org
Wed Jul 8 18:20:03 UTC 2020
Summary of changes:
lib/deduplicationreport/report.go | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
discards ec0cb62eaed4c48d38f996acbab7b61f05a6aeb2 (commit)
via 0dd614e11abb96e6469d8203f7e73d113b64e323 (commit)
This update added new revisions after undoing existing revisions. That is
to say, the old revision is not a strict subset of the new revision. This
situation occurs when you --force push a change and generate a repository
containing something like this:
 * -- * -- B -- O -- O -- O (ec0cb62eaed4c48d38f996acbab7b61f05a6aeb2)
            \
             N -- N -- N (0dd614e11abb96e6469d8203f7e73d113b64e323)
When this happens we assume that you've already had alert emails for all
of the O revisions, and so we here report only the revisions in the N
branch from the common base, B.
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit 0dd614e11abb96e6469d8203f7e73d113b64e323
Author: Ward Vandewege <ward at curii.com>
Date: Mon Jun 29 16:53:38 2020 -0400
16573: add a deduplication-report command to arvados-client
Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward at curii.com>
diff --git a/.licenseignore b/.licenseignore
index ad80dc3f4..81f6b7181 100644
--- a/.licenseignore
+++ b/.licenseignore
@@ -79,4 +79,6 @@ lib/dispatchcloud/test/sshkey_*
*.asc
sdk/java-v2/build.gradle
sdk/java-v2/settings.gradle
-sdk/cwl/tests/wf/feddemo
\ No newline at end of file
+sdk/cwl/tests/wf/feddemo
+go.mod
+go.sum
diff --git a/cmd/arvados-client/cmd.go b/cmd/arvados-client/cmd.go
index 887bc62bb..bcc3dda09 100644
--- a/cmd/arvados-client/cmd.go
+++ b/cmd/arvados-client/cmd.go
@@ -9,6 +9,7 @@ import (
"git.arvados.org/arvados.git/lib/cli"
"git.arvados.org/arvados.git/lib/cmd"
+ "git.arvados.org/arvados.git/lib/deduplicationreport"
"git.arvados.org/arvados.git/lib/mount"
)
@@ -52,7 +53,8 @@ var (
"virtual_machine": cli.APICall,
"workflow": cli.APICall,
- "mount": mount.Command,
+ "mount": mount.Command,
+ "deduplication-report": deduplicationreport.Command,
})
)
diff --git a/go.mod b/go.mod
index cc5457975..1fde587e6 100644
--- a/go.mod
+++ b/go.mod
@@ -22,6 +22,7 @@ require (
github.com/docker/docker v1.4.2-0.20180109013817-94b8a116fbf1
github.com/docker/go-connections v0.3.0 // indirect
github.com/docker/go-units v0.3.3-0.20171221200356-d59758554a3d // indirect
+ github.com/dustin/go-humanize v1.0.0
github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568 // indirect
github.com/fsnotify/fsnotify v1.4.9
github.com/ghodss/yaml v1.0.0
diff --git a/go.sum b/go.sum
index 38153ce3e..c9b7f74e3 100644
--- a/go.sum
+++ b/go.sum
@@ -56,6 +56,8 @@ github.com/docker/go-connections v0.3.0 h1:3lOnM9cSzgGwx8VfK/NGOW5fLQ0GjIlCkaktF
github.com/docker/go-connections v0.3.0/go.mod h1:Gbd7IOopHjR8Iph03tsViu4nIes5XhDvyHbTtUxmeec=
github.com/docker/go-units v0.3.3-0.20171221200356-d59758554a3d h1:dVaNRYvaGV23AdNdsm+4y1mPN0tj3/1v6taqKMmM6Ko=
github.com/docker/go-units v0.3.3-0.20171221200356-d59758554a3d/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
+github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo=
+github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568 h1:BHsljHzVlRcyQhjrss6TZTdY2VfCqZPbv5k3iBFa2ZQ=
github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568/go.mod h1:xEzjJPgXI435gkrCt3MPfRiAkVrwSbHsst4LCFVfpJc=
github.com/fsnotify/fsnotify v1.4.9 h1:hsms1Qyu0jgnwNXIxa+/V/PDsU6CfLf6CNO8H7IWoS4=
diff --git a/lib/deduplicationreport/command.go b/lib/deduplicationreport/command.go
new file mode 100644
index 000000000..1199bc0ae
--- /dev/null
+++ b/lib/deduplicationreport/command.go
@@ -0,0 +1,43 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package deduplicationreport
+
+import (
+ "io"
+
+ "git.arvados.org/arvados.git/lib/config"
+ "git.arvados.org/arvados.git/sdk/go/ctxlog"
+ "github.com/sirupsen/logrus"
+)
+
+var Command command
+
+type command struct{}
+
+type NoPrefixFormatter struct{}
+
+func (f *NoPrefixFormatter) Format(entry *logrus.Entry) ([]byte, error) {
+ return []byte(entry.Message), nil
+}
+
+// RunCommand implements the subcommand "deduplication-report <collection> <collection> ..."
+func (command) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
+ var err error
+ logger := ctxlog.New(stderr, "text", "info")
+ defer func() {
+ if err != nil {
+ logger.WithError(err).Error("fatal")
+ }
+ }()
+
+ logger.SetFormatter(new(NoPrefixFormatter))
+
+ loader := config.NewLoader(stdin, logger)
+ loader.SkipLegacy = true
+
+ exitcode := report(prog, args, loader, logger, stdout, stderr)
+
+ return exitcode
+}
diff --git a/lib/deduplicationreport/report.go b/lib/deduplicationreport/report.go
new file mode 100644
index 000000000..b7699fcb2
--- /dev/null
+++ b/lib/deduplicationreport/report.go
@@ -0,0 +1,217 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package deduplicationreport
+
+import (
+ "flag"
+ "fmt"
+ "io"
+ "strings"
+
+ "git.arvados.org/arvados.git/lib/config"
+ "git.arvados.org/arvados.git/sdk/go/arvados"
+ "git.arvados.org/arvados.git/sdk/go/arvadosclient"
+ "git.arvados.org/arvados.git/sdk/go/manifest"
+
+ "github.com/dustin/go-humanize"
+ "github.com/sirupsen/logrus"
+)
+
+func deDuplicate(inputs []string) (trimmed []string) {
+ seen := make(map[string]bool)
+ for _, uuid := range inputs {
+ if _, ok := seen[uuid]; !ok {
+ seen[uuid] = true
+ trimmed = append(trimmed, uuid)
+ }
+ }
+ return
+}
+
+func parseFlags(prog string, args []string, loader *config.Loader, logger *logrus.Logger, stderr io.Writer) (exitcode int, inputs []string) {
+ flags := flag.NewFlagSet("", flag.ContinueOnError)
+ flags.SetOutput(stderr)
+ flags.Usage = func() {
+ fmt.Fprintf(flags.Output(), `
+Usage:
+ %s [options ...] <collection-uuid> <collection-uuid> ...
+
+ %s [options ...] <collection-pdh>,<collection_uuid> \
+ <collection-pdh>,<collection_uuid> ...
+
+ This program analyzes the overlap in blocks used by 2 or more collections. It
+ prints a deduplication report that shows the nominal space used by the list
+ of collection, as well as the actual size and the amount of space that is
+ saved by Keep's deduplication.
+
+ The list of collections may be provided in two ways. A list of collection
+ uuids is sufficient. Alternatively, the PDH for each collection may also be
+ provided. This is will greatly speed up operation when the list contains
+ multiple collections with the same PDH.
+
+ Exit status will be zero if there were no errors generating the report.
+
+Example:
+
+ Use the 'arv' and 'jq' commands to get the list of the 100
+ largest collections and generate the deduplication report:
+
+ arv collection list --order 'file_size_total desc' | \
+ jq -r '.items[] | [.portable_data_hash,.uuid] |@csv' | \
+ tail -n100 |sed -e 's/"//g'|tr '\n' ' ' | \
+ xargs %s
+
+Options:
+`, prog, prog, prog)
+ flags.PrintDefaults()
+ }
+ loader.SetupFlags(flags)
+ loglevel := flags.String("log-level", "info", "logging level (debug, info, ...)")
+ err := flags.Parse(args)
+ if err == flag.ErrHelp {
+ return 0, inputs
+ } else if err != nil {
+ return 2, inputs
+ }
+
+ inputs = flags.Args()
+
+ inputs = deDuplicate(inputs)
+
+ if len(inputs) < 2 {
+ logger.Error("Error: at least 2 different collections UUIDs required")
+ flags.Usage()
+ return 2, inputs
+ }
+
+ lvl, err := logrus.ParseLevel(*loglevel)
+ if err != nil {
+ return 2, inputs
+ }
+ logger.SetLevel(lvl)
+ return
+}
+
+func blockList(collection arvados.Collection) (blocks map[string]int) {
+ blocks = make(map[string]int)
+ m := manifest.Manifest{Text: collection.ManifestText}
+ blockChannel := m.BlockIterWithDuplicates()
+ for b := range blockChannel {
+ blocks[b.Digest.String()] = b.Size
+ }
+ return
+}
+
+func report(prog string, args []string, loader *config.Loader, logger *logrus.Logger, stdout, stderr io.Writer) (exitcode int) {
+
+ var inputs []string
+ exitcode, inputs = parseFlags(prog, args, loader, logger, stderr)
+ if exitcode != 0 {
+ return
+ }
+
+ // Arvados Client setup
+ arv, err := arvadosclient.MakeArvadosClient()
+ if err != nil {
+ logger.Errorf("error creating Arvados object: %s", err)
+ exitcode = 1
+ return
+ }
+
+ type Col struct {
+ FileSizeTotal int64
+ FileCount int64
+ }
+
+ blocks := make(map[string]map[string]int)
+ pdhs := make(map[string]Col)
+ var nominalSize int64
+
+ fmt.Println()
+ for _, input := range inputs {
+ var uuid string
+ var pdh string
+ if strings.Contains(input, ",") {
+ // The input is in the format pdh,uuid. This will allow us to save time on duplicate pdh's
+ tmp := strings.Split(input, ",")
+ pdh = tmp[0]
+ uuid = tmp[1]
+ } else {
+ // The input must be a plain uuid
+ uuid = input
+ }
+ if !strings.Contains(uuid, "-4zz18-") {
+ logger.Error("uuid must refer to collection object")
+ exitcode = 1
+ return
+ }
+ if _, ok := pdhs[pdh]; ok {
+ // We've processed a collection with this pdh already. Simply add its
+ // size to the totals and move on to the next one.
+ // Note that we simply trust the PDH matches the collection UUID here,
+ // in other words, we use it over the UUID. If they don't match, the report
+ // will be wrong.
+ nominalSize += pdhs[pdh].FileSizeTotal
+ } else {
+ var collection arvados.Collection
+ err = arv.Get("collections", uuid, nil, &collection)
+ if err != nil {
+ logger.Errorf("Error: unable to retrieve collection: %s\n", err)
+ exitcode = 1
+ return
+ }
+ blocks[uuid] = make(map[string]int)
+ blocks[uuid] = blockList(collection)
+ if pdh != "" && collection.PortableDataHash != pdh {
+ logger.Errorf("Error: the collection with UUID %s has PDH %s, but a different PDH was provided in the arguments: %s\n", uuid, collection.PortableDataHash, pdh)
+ exitcode = 1
+ return
+ }
+ if pdh == "" {
+ pdh = collection.PortableDataHash
+ }
+
+ col := Col{}
+ if collection.FileSizeTotal != 0 || collection.FileCount != 0 {
+ nominalSize += collection.FileSizeTotal
+ col.FileSizeTotal = collection.FileSizeTotal
+ col.FileCount = int64(collection.FileCount)
+ } else {
+ // Collections created with old Arvados versions do not always have the total file size and count cached in the collections object
+ var collSize int64
+ for _, size := range blocks[uuid] {
+ collSize += int64(size)
+ }
+ nominalSize += collSize
+ col.FileSizeTotal = collSize
+ }
+ pdhs[pdh] = col
+ }
+
+ if pdhs[pdh].FileCount != 0 {
+ fmt.Fprintf(stdout, "Collection %s: pdh %s; nominal size %d (%s); file count %d\n", uuid, pdh, pdhs[pdh].FileSizeTotal, humanize.IBytes(uint64(pdhs[pdh].FileSizeTotal)), pdhs[pdh].FileCount)
+ } else {
+ fmt.Fprintf(stdout, "Collection %s: pdh %s; nominal size %d (%s)\n", uuid, pdh, pdhs[pdh].FileSizeTotal, humanize.IBytes(uint64(pdhs[pdh].FileSizeTotal)))
+ }
+ }
+
+ var totalSize int64
+ seen := make(map[string]bool)
+ for _, v := range blocks {
+ for pdh, size := range v {
+ if _, ok := seen[pdh]; !ok {
+ seen[pdh] = true
+ totalSize += int64(size)
+ }
+ }
+ }
+ fmt.Fprintln(stdout)
+ fmt.Fprintf(stdout, "Collections: %15d\n", len(inputs))
+ fmt.Fprintf(stdout, "Nominal size of stored data: %15d bytes (%s)\n", nominalSize, humanize.IBytes(uint64(nominalSize)))
+ fmt.Fprintf(stdout, "Actual size of stored data: %15d bytes (%s)\n", totalSize, humanize.IBytes(uint64(totalSize)))
+ fmt.Fprintf(stdout, "Saved by Keep deduplication: %15d bytes (%s)\n", nominalSize-totalSize, humanize.IBytes(uint64(nominalSize-totalSize)))
+
+ return exitcode
+}
diff --git a/lib/deduplicationreport/report_test.go b/lib/deduplicationreport/report_test.go
new file mode 100644
index 000000000..cc6c05be3
--- /dev/null
+++ b/lib/deduplicationreport/report_test.go
@@ -0,0 +1,131 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package deduplicationreport
+
+import (
+ "bytes"
+ // "encoding/json"
+ // "io/ioutil"
+ // "os"
+ "testing"
+ // "time"
+
+ // "git.arvados.org/arvados.git/lib/config"
+ "git.arvados.org/arvados.git/sdk/go/arvados"
+ "git.arvados.org/arvados.git/sdk/go/arvadostest"
+ //"git.arvados.org/arvados.git/sdk/go/arvadosclient"
+ // "git.arvados.org/arvados.git/sdk/go/ctxlog"
+ "gopkg.in/check.v1"
+)
+
+func Test(t *testing.T) {
+ check.TestingT(t)
+}
+
+var _ = check.Suite(&Suite{})
+
+type Suite struct{}
+
+func (s *Suite) TearDownSuite(c *check.C) {
+ // Undo any changes/additions to the database so they don't affect subsequent tests.
+ arvadostest.ResetEnv()
+}
+
+func (*Suite) TestUsage(c *check.C) {
+ var stdout, stderr bytes.Buffer
+ exitcode := Command.RunCommand("deduplicationreport.test", []string{"-log-level=debug"}, &bytes.Buffer{}, &stdout, &stderr)
+ c.Check(exitcode, check.Equals, 2)
+ c.Check(stdout.String(), check.Equals, "")
+ c.Log(stderr.String())
+ c.Check(stderr.String(), check.Matches, `(?ms).*Usage:.*`)
+}
+
+func (*Suite) TestTwoIdenticalUUIDs(c *check.C) {
+ var stdout, stderr bytes.Buffer
+ // Run dedupreport with 2 identical uuids
+ exitcode := Command.RunCommand("deduplicationreport.test", []string{arvadostest.FooCollection, arvadostest.FooCollection}, &bytes.Buffer{}, &stdout, &stderr)
+ c.Check(exitcode, check.Equals, 2)
+ c.Check(stdout.String(), check.Equals, "")
+ c.Log(stderr.String())
+ c.Check(stderr.String(), check.Matches, `(?ms).*Error: at least 2 different collections UUIDs required.*`)
+}
+
+func (*Suite) TestTwoUUIDsInvalidPDH(c *check.C) {
+ var stdout, stderr bytes.Buffer
+ // Run dedupreport with pdh,uuid where pdh does not match
+ exitcode := Command.RunCommand("deduplicationreport.test", []string{arvadostest.FooAndBarFilesInDirPDH + "," + arvadostest.FooCollection, arvadostest.FooCollection}, &bytes.Buffer{}, &stdout, &stderr)
+ c.Check(exitcode, check.Equals, 1)
+ c.Check(stdout.String(), check.Equals, "")
+ c.Log(stderr.String())
+ c.Check(stderr.String(), check.Matches, `(?ms).*Error: the collection with UUID zzzzz-4zz18-fy296fx3hot09f7 has PDH 1f4b0bc7583c2a7f9102c395f4ffc5e3\+45, but a different PDH was provided in the arguments: 870369fc72738603c2fad16664e50e2d\+58.*`)
+}
+
+func (*Suite) TestNonExistentCollection(c *check.C) {
+ var stdout, stderr bytes.Buffer
+ // Run dedupreport with many UUIDs
+ exitcode := Command.RunCommand("deduplicationreport.test", []string{arvadostest.FooCollection, arvadostest.NonexistentCollection}, &bytes.Buffer{}, &stdout, &stderr)
+ c.Check(exitcode, check.Equals, 1)
+ c.Check(stdout.String(), check.Equals, "Collection zzzzz-4zz18-fy296fx3hot09f7: pdh 1f4b0bc7583c2a7f9102c395f4ffc5e3+45; nominal size 3 (3 B)\n")
+ c.Log(stderr.String())
+ c.Check(stderr.String(), check.Matches, `(?ms).*Error: unable to retrieve collection:.*404 Not Found.*`)
+}
+
+func (*Suite) TestManyUUIDsNoOverlap(c *check.C) {
+ var stdout, stderr bytes.Buffer
+ // Run dedupreport with 5 UUIDs
+ exitcode := Command.RunCommand("deduplicationreport.test", []string{arvadostest.FooCollection, arvadostest.HelloWorldCollection, arvadostest.FooBarDirCollection, arvadostest.WazVersion1Collection, arvadostest.UserAgreementCollection}, &bytes.Buffer{}, &stdout, &stderr)
+ c.Check(exitcode, check.Equals, 0)
+ c.Check(stdout.String(), check.Matches, "(?ms).*Nominal size of stored data:[[:space:]]+249049 bytes \\(243 KiB\\).*")
+ c.Check(stdout.String(), check.Matches, "(?ms).*Actual size of stored data:[[:space:]]+249049 bytes \\(243 KiB\\).*")
+ c.Check(stdout.String(), check.Matches, "(?ms).*Saved by Keep deduplication:[[:space:]]+0 bytes \\(0 B\\).*")
+ c.Log(stderr.String())
+ c.Check(stderr.String(), check.Equals, "")
+}
+
+func (*Suite) TestTwoOverlappingCollections(c *check.C) {
+ var stdout, stderr bytes.Buffer
+ // Create two collections
+ arv := arvados.NewClientFromEnv()
+
+ var c1 arvados.Collection
+ err := arv.RequestAndDecode(&c1, "POST", "arvados/v1/collections", nil, map[string]interface{}{"collection": map[string]interface{}{"manifest_text": ". d3b07384d113edec49eaa6238ad5ff00+4+A2705511e0c47c92cc73e9ddc95b9822ef774c406@5f0de808 0:4:foo\n"}})
+ c.Assert(err, check.Equals, nil)
+
+ var c2 arvados.Collection
+ err = arv.RequestAndDecode(&c2, "POST", "arvados/v1/collections", nil, map[string]interface{}{"collection": map[string]interface{}{"manifest_text": ". c157a79031e1c40f85931829bc5fc552+4+A1544eb0cee937934dc565d2b11836c804384c139@5f0e0bf9 d3b07384d113edec49eaa6238ad5ff00+4+A60746cad7ecc16fe26a0c17c55af90db675369c2@5f0e0bf9 0:4:bar 4:4:foo\n"}})
+ c.Assert(err, check.Equals, nil)
+
+ // Run dedupreport with 2 arguments: uuid uuid
+ exitcode := Command.RunCommand("deduplicationreport.test", []string{c1.UUID, c2.UUID}, &bytes.Buffer{}, &stdout, &stderr)
+ c.Check(exitcode, check.Equals, 0)
+ c.Check(stdout.String(), check.Matches, "(?ms).*Nominal size of stored data:[[:space:]]+12 bytes \\(12 B\\).*")
+ c.Check(stdout.String(), check.Matches, "(?ms).*Actual size of stored data:[[:space:]]+8 bytes \\(8 B\\).*")
+ c.Check(stdout.String(), check.Matches, "(?ms).*Saved by Keep deduplication:[[:space:]]+4 bytes \\(4 B\\).*")
+ c.Log(stderr.String())
+ c.Check(stderr.String(), check.Equals, "")
+}
+
+func (*Suite) TestTwoOverlappingCollectionsWithPDH(c *check.C) {
+ var stdout, stderr bytes.Buffer
+ // Create two collections
+ arv := arvados.NewClientFromEnv()
+
+ var c1 arvados.Collection
+ err := arv.RequestAndDecode(&c1, "POST", "arvados/v1/collections", nil, map[string]interface{}{"collection": map[string]interface{}{"manifest_text": ". d3b07384d113edec49eaa6238ad5ff00+4+A2705511e0c47c92cc73e9ddc95b9822ef774c406@5f0de808 0:4:foo\n"}})
+ c.Assert(err, check.Equals, nil)
+
+ var c2 arvados.Collection
+ err = arv.RequestAndDecode(&c2, "POST", "arvados/v1/collections", nil, map[string]interface{}{"collection": map[string]interface{}{"manifest_text": ". c157a79031e1c40f85931829bc5fc552+4+A1544eb0cee937934dc565d2b11836c804384c139@5f0e0bf9 d3b07384d113edec49eaa6238ad5ff00+4+A60746cad7ecc16fe26a0c17c55af90db675369c2@5f0e0bf9 0:4:bar 4:4:foo\n"}})
+ c.Assert(err, check.Equals, nil)
+
+ // Run dedupreport with 2 arguments: pdh,uuid uuid
+ exitcode := Command.RunCommand("deduplicationreport.test", []string{c1.PortableDataHash + "," + c1.UUID, c2.UUID}, &bytes.Buffer{}, &stdout, &stderr)
+ c.Check(exitcode, check.Equals, 0)
+ c.Check(stdout.String(), check.Matches, "(?ms).*Nominal size of stored data:[[:space:]]+12 bytes \\(12 B\\).*")
+ c.Check(stdout.String(), check.Matches, "(?ms).*Actual size of stored data:[[:space:]]+8 bytes \\(8 B\\).*")
+ c.Check(stdout.String(), check.Matches, "(?ms).*Saved by Keep deduplication:[[:space:]]+4 bytes \\(4 B\\).*")
+ c.Log(stderr.String())
+ c.Check(stderr.String(), check.Equals, "")
+}
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list