Skip to content
This repository was archived by the owner on Mar 27, 2024. It is now read-only.

Use top_level.txt when analyzing pip modules #291

Merged
merged 1 commit into from
Jan 24, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 35 additions & 9 deletions differs/pip_diff.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@ limitations under the License.
package differs

import (
"bufio"
"io/ioutil"
"os"
"path/filepath"
"regexp"
"strings"
Expand Down Expand Up @@ -92,18 +94,42 @@ func (a PipAnalyzer) getPackages(image pkgutil.Image) (map[string]map[string]uti
packageName := packageMatch[1]
version := packageMatch[2][:len(packageMatch[2])-1]

// Retrieves size for actual package/script corresponding to each dist-info metadata directory
// by taking the file entry alphabetically before it (for a package) or after it (for a script)
// First, try and use the "top_level.txt",
// Many egg packages contains a "top_level.txt" file describing the directories containing the
Copy link

@weakcamel weakcamel Jan 24, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

https://setuptools.readthedocs.io/en/latest/formats.html

The minimum project metadata that all eggs must have is a standard Python PKG-INFO file, named PKG-INFO and placed within the metadata directory appropriate to the format.
...
In addition to the PKG-INFO file, an egg’s metadata directory may also include files and directories representing various forms of optional standard metadata ...

And
https://www.python.org/dev/peps/pep-0427/#the-dist-info-directory

  1. Wheel .dist-info directories include at a minimum METADATA, WHEEL, and RECORD.
  2. METADATA is the package metadata, the same format as PKG-INFO as found at the root of sdists.

So from the sound of it, it's either one or the other.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice, these could definitely be useful for getting the package name. however I don't see anywhere in the METADATA files the list of dependencies. that said, I do see what looks like a total list of files in RECORD... this could be useful for wheels, but for eggs PKG-INFO still doesn't contain a list of dependencies. I think trying the top_level.txt is still the right way to go here.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, eggs are pretty oldschool/inconsistent/a bit horrid.
No worries, maybe not as bulletproof, but reading top_level.txt is still an improvement and as you say, if it gives you extra information needed - it sounds like a good pragmatic choice.

// required code. Combining the sizes of each of these directories should give the total size.
var size int64
if i-1 >= 0 && contents[i-1].Name() == packageName {
packagePath := filepath.Join(pythonPath, packageName)
size = pkgutil.GetSize(packagePath)
} else if i+1 < len(contents) && contents[i+1].Name() == packageName+".py" {
size = contents[i+1].Size()
topLevelReader, err := os.Open(filepath.Join(pythonPath, fileName, "top_level.txt"))
if err == nil {
scanner := bufio.NewScanner(topLevelReader)
scanner.Split(bufio.ScanLines)
for scanner.Scan() {
// check if directory exists first, then retrieve size
contentPath := filepath.Join(pythonPath, scanner.Text())
if _, err := os.Stat(contentPath); err == nil {
size = size + pkgutil.GetSize(contentPath)
} else if _, err := os.Stat(contentPath + ".py"); err == nil {
// sometimes the top level content is just a single python file; try this too
size = size + pkgutil.GetSize(contentPath+".py")
}
}
} else {
logrus.Errorf("Could not find Python package %s for corresponding metadata info", packageName)
continue
// if we didn't find a top_level.txt, we'll try the previous alphabetical directory entry heuristic
logrus.Infof("unable to use top_level.txt: falling back to previous alphabetical directory entry heuristic...")

// Retrieves size for actual package/script corresponding to each dist-info metadata directory
// by taking the file entry alphabetically before it (for a package) or after it (for a script)
// var size int64
if i-1 >= 0 && contents[i-1].Name() == packageName {
packagePath := filepath.Join(pythonPath, packageName)
size = pkgutil.GetSize(packagePath)
} else if i+1 < len(contents) && contents[i+1].Name() == packageName+".py" {
size = contents[i+1].Size()
} else {
logrus.Errorf("Could not find Python package %s for corresponding metadata info", packageName)
continue
}
}

currPackage := util.PackageInfo{Version: version, Size: size}
mapPath := strings.Replace(pythonPath, path, "", 1)
addToMap(packages, packageName, mapPath, currPackage)
Expand Down
15 changes: 0 additions & 15 deletions tests/integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,14 +141,6 @@ func TestDiffAndAnalysis(t *testing.T) {
differFlags: []string{"--type=apt", "--no-cache"},
expectedFile: "apt_diff_expected.json",
},
// {
// description: "rpm differ",
// subcommand: "diff",
// imageA: rpmBase,
// imageB: rpmModified,
// differFlags: []string{"--type=rpm"},
// expectedFile: "rpm_diff_expected.json",
// },
{
description: "node differ",
subcommand: "diff",
Expand Down Expand Up @@ -204,13 +196,6 @@ func TestDiffAndAnalysis(t *testing.T) {
differFlags: []string{"--type=apt", "--no-cache"},
expectedFile: "apt_analysis_expected.json",
},
// {
// description: "rpm analysis",
// subcommand: "analyze",
// imageA: rpmModified,
// differFlags: []string{"--type=rpm"},
// expectedFile: "rpm_analysis_expected.json",
// },
{
description: "file sorted analysis",
subcommand: "analyze",
Expand Down
4 changes: 2 additions & 2 deletions tests/pip_analysis_expected.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"Name": "configobj",
"Path": "/usr/lib/python2.7/dist-packages",
"Version": "5.0.6",
"Size": 89613
"Size": 136871
},
{
"Name": "mercurial",
Expand Down Expand Up @@ -37,7 +37,7 @@
"Name": "setuptools",
"Path": "/usr/local/lib/python3.6/site-packages",
"Version": "36.0.1",
"Size": 837337
"Size": 1282800
},
{
"Name": "six",
Expand Down