Skip to content
This repository was archived by the owner on Mar 27, 2024. It is now read-only.

Commit 5b5e627

Browse files
authored
Merge pull request #292 from nkubala/pip_metadata
Use PKG-INFO and METADATA to infer package names in pip analysis
2 parents 8fc5ae4 + 61bf169 commit 5b5e627

File tree

2 files changed

+105
-43
lines changed

2 files changed

+105
-43
lines changed

differs/pip_diff.go

+86-42
Original file line numberDiff line numberDiff line change
@@ -87,53 +87,97 @@ func (a PipAnalyzer) getPackages(image pkgutil.Image) (map[string]map[string]uti
8787
for i := 0; i < len(contents); i++ {
8888
c := contents[i]
8989
fileName := c.Name()
90-
// check if package
91-
packageDir := regexp.MustCompile("^([a-z|A-Z|0-9|_]+)-(([0-9]+?\\.){2,3})(dist-info|egg-info)$")
92-
packageMatch := packageDir.FindStringSubmatch(fileName)
93-
if len(packageMatch) != 0 {
94-
packageName := packageMatch[1]
95-
version := packageMatch[2][:len(packageMatch[2])-1]
96-
97-
// First, try and use the "top_level.txt",
98-
// Many egg packages contains a "top_level.txt" file describing the directories containing the
99-
// required code. Combining the sizes of each of these directories should give the total size.
100-
var size int64
101-
topLevelReader, err := os.Open(filepath.Join(pythonPath, fileName, "top_level.txt"))
102-
if err == nil {
103-
scanner := bufio.NewScanner(topLevelReader)
104-
scanner.Split(bufio.ScanLines)
105-
for scanner.Scan() {
106-
// check if directory exists first, then retrieve size
107-
contentPath := filepath.Join(pythonPath, scanner.Text())
108-
if _, err := os.Stat(contentPath); err == nil {
109-
size = size + pkgutil.GetSize(contentPath)
110-
} else if _, err := os.Stat(contentPath + ".py"); err == nil {
111-
// sometimes the top level content is just a single python file; try this too
112-
size = size + pkgutil.GetSize(contentPath+".py")
113-
}
90+
var metadata *os.File
91+
var err error
92+
if strings.HasSuffix(fileName, "egg-info") {
93+
// egg directory (*.egg-info): package metadata is stored in PKG-INFO
94+
metadata, err = os.Open(filepath.Join(pythonPath, fileName, "PKG-INFO"))
95+
if err != nil {
96+
logrus.Debugf("unable to open PKG-INFO for egg %s", fileName)
97+
}
98+
} else if strings.HasSuffix(fileName, "dist-info") {
99+
// wheel directory (*.dist-info): package metadata is stored in METADATA
100+
metadata, err = os.Open(filepath.Join(pythonPath, fileName, "METADATA"))
101+
if err != nil {
102+
logrus.Debugf("unable to open METADATA for wheel %s", fileName)
103+
}
104+
} else {
105+
// no match
106+
continue
107+
}
108+
109+
var line, packageName, version string
110+
if metadata == nil {
111+
// unable to open metadata file: try reading the package itself
112+
mPath := filepath.Join(pythonPath, fileName)
113+
metadata, err = os.Open(mPath)
114+
fInfo, _ := os.Stat(mPath)
115+
if err != nil || fInfo.IsDir() {
116+
// if this also doesn't work, the package doesn't have the correct metadata structure
117+
// try and parse the name using a regex anyway
118+
logrus.Debugf("failed to locate package metadata: attempting to infer package name")
119+
packageDir := regexp.MustCompile("^([a-z|A-Z|0-9|_]+)-(([0-9]+?\\.){2,3})(dist-info|egg-info)$")
120+
packageMatch := packageDir.FindStringSubmatch(fileName)
121+
if len(packageMatch) != 0 {
122+
packageName = packageMatch[1]
123+
version = packageMatch[2][:len(packageMatch[2])-1]
114124
}
115-
} else {
116-
// if we didn't find a top_level.txt, we'll try the previous alphabetical directory entry heuristic
117-
logrus.Infof("unable to use top_level.txt: falling back to previous alphabetical directory entry heuristic...")
118-
119-
// Retrieves size for actual package/script corresponding to each dist-info metadata directory
120-
// by taking the file entry alphabetically before it (for a package) or after it (for a script)
121-
// var size int64
122-
if i-1 >= 0 && contents[i-1].Name() == packageName {
123-
packagePath := filepath.Join(pythonPath, packageName)
124-
size = pkgutil.GetSize(packagePath)
125-
} else if i+1 < len(contents) && contents[i+1].Name() == packageName+".py" {
126-
size = contents[i+1].Size()
127-
} else {
128-
logrus.Errorf("Could not find Python package %s for corresponding metadata info", packageName)
129-
continue
125+
}
126+
}
127+
128+
if metadata != nil {
129+
scanner := bufio.NewScanner(metadata)
130+
scanner.Split(bufio.ScanLines)
131+
for scanner.Scan() {
132+
line = scanner.Text()
133+
if strings.HasPrefix(line, "Name") {
134+
packageName = strings.Split(line, ": ")[1]
135+
// next line is always the version
136+
scanner.Scan()
137+
version = strings.Split(scanner.Text(), ": ")[1]
138+
break
130139
}
131140
}
141+
}
132142

133-
currPackage := util.PackageInfo{Version: version, Size: size}
134-
mapPath := strings.Replace(pythonPath, path, "", 1)
135-
addToMap(packages, packageName, mapPath, currPackage)
143+
// First, try and use the "top_level.txt",
144+
// Many egg packages contain a "top_level.txt" file describing the directories containing the
145+
// required code. Combining the sizes of each of these directories should give the total size.
146+
var size int64
147+
topLevelReader, err := os.Open(filepath.Join(pythonPath, fileName, "top_level.txt"))
148+
if err == nil {
149+
scanner := bufio.NewScanner(topLevelReader)
150+
scanner.Split(bufio.ScanLines)
151+
for scanner.Scan() {
152+
// check if directory exists first, then retrieve size
153+
contentPath := filepath.Join(pythonPath, scanner.Text())
154+
if _, err := os.Stat(contentPath); err == nil {
155+
size = size + pkgutil.GetSize(contentPath)
156+
} else if _, err := os.Stat(contentPath + ".py"); err == nil {
157+
// sometimes the top level content is just a single python file; try this too
158+
size = size + pkgutil.GetSize(contentPath+".py")
159+
}
160+
}
161+
} else {
162+
logrus.Debugf("unable to use top_level.txt: falling back to alphabetical directory entry heuristic...")
163+
164+
// Retrieves size for actual package/script corresponding to each dist-info metadata directory
165+
// by examining the file entries directly before and after it
166+
if i-1 >= 0 && strings.Contains(contents[i-1].Name(), packageName) {
167+
packagePath := filepath.Join(pythonPath, contents[i-1].Name())
168+
size = pkgutil.GetSize(packagePath)
169+
} else if i+1 < len(contents) && strings.Contains(contents[i+1].Name(), packageName) {
170+
packagePath := filepath.Join(pythonPath, contents[i+1].Name())
171+
size = pkgutil.GetSize(packagePath)
172+
} else {
173+
logrus.Errorf("failed to locate python package for corresponding package metadata %s", packageName)
174+
continue
175+
}
136176
}
177+
178+
currPackage := util.PackageInfo{Version: version, Size: size}
179+
mapPath := strings.Replace(pythonPath, path, "", 1)
180+
addToMap(packages, packageName, mapPath, currPackage)
137181
}
138182
}
139183

tests/pip_analysis_expected.json

+19-1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,18 @@
33
"Image": "gcr.io/gcp-runtimes/pip-modified",
44
"AnalyzeType": "Pip",
55
"Analysis": [
6+
{
7+
"Name": "argparse",
8+
"Path": "/usr/lib/python2.7",
9+
"Version": "1.2.1",
10+
"Size": 89124
11+
},
12+
{
13+
"Name": "bzr",
14+
"Path": "/usr/lib/python2.7/dist-packages",
15+
"Version": "2.7.0dev1",
16+
"Size": 13063022
17+
},
618
{
719
"Name": "configobj",
820
"Path": "/usr/lib/python2.7/dist-packages",
@@ -56,7 +68,13 @@
5668
"Path": "/usr/local/lib/python3.6/site-packages",
5769
"Version": "0.29.0",
5870
"Size": 103509
71+
},
72+
{
73+
"Name": "wsgiref",
74+
"Path": "/usr/lib/python2.7",
75+
"Version": "0.1.2",
76+
"Size": 101007
5977
}
6078
]
6179
}
62-
]
80+
]

0 commit comments

Comments
 (0)