diff --git a/differs/pip_diff.go b/differs/pip_diff.go index ed5683db..dae84c26 100644 --- a/differs/pip_diff.go +++ b/differs/pip_diff.go @@ -87,53 +87,97 @@ func (a PipAnalyzer) getPackages(image pkgutil.Image) (map[string]map[string]uti for i := 0; i < len(contents); i++ { c := contents[i] fileName := c.Name() - // check if package - packageDir := regexp.MustCompile("^([a-z|A-Z|0-9|_]+)-(([0-9]+?\\.){2,3})(dist-info|egg-info)$") - packageMatch := packageDir.FindStringSubmatch(fileName) - if len(packageMatch) != 0 { - packageName := packageMatch[1] - version := packageMatch[2][:len(packageMatch[2])-1] - - // First, try and use the "top_level.txt", - // Many egg packages contains a "top_level.txt" file describing the directories containing the - // required code. Combining the sizes of each of these directories should give the total size. - var size int64 - topLevelReader, err := os.Open(filepath.Join(pythonPath, fileName, "top_level.txt")) - if err == nil { - scanner := bufio.NewScanner(topLevelReader) - scanner.Split(bufio.ScanLines) - for scanner.Scan() { - // check if directory exists first, then retrieve size - contentPath := filepath.Join(pythonPath, scanner.Text()) - if _, err := os.Stat(contentPath); err == nil { - size = size + pkgutil.GetSize(contentPath) - } else if _, err := os.Stat(contentPath + ".py"); err == nil { - // sometimes the top level content is just a single python file; try this too - size = size + pkgutil.GetSize(contentPath+".py") - } + var metadata *os.File + var err error + if strings.HasSuffix(fileName, "egg-info") { + // egg directory + metadata, err = os.Open(filepath.Join(pythonPath, fileName, "PKG-INFO")) + if err != nil { + logrus.Debugf("unable to open PKG-INFO for egg %s", fileName) + } + } else if strings.HasSuffix(fileName, "dist-info") { + // wheel directory + metadata, err = os.Open(filepath.Join(pythonPath, fileName, "METADATA")) + if err != nil { + logrus.Debugf("unable to open METADATA for wheel %s", fileName) + } + } 
else { + // no match + continue + } + + var line, packageName, version string + if metadata == nil { + // unable to open metadata file: try reading the package itself + mPath := filepath.Join(pythonPath, fileName) + metadata, err = os.Open(mPath) + fInfo, _ := os.Stat(mPath) + if err != nil || fInfo.IsDir() { + // if this also doesn't work, the package doesn't have the correct metadata structure + // try and parse the name using a regex anyway + logrus.Debugf("failed to locate package metadata: attempting to infer package name") + packageDir := regexp.MustCompile("^([a-z|A-Z|0-9|_]+)-(([0-9]+?\\.){2,3})(dist-info|egg-info)$") + packageMatch := packageDir.FindStringSubmatch(fileName) + if len(packageMatch) != 0 { + packageName = packageMatch[1] + version = packageMatch[2][:len(packageMatch[2])-1] } - } else { - // if we didn't find a top_level.txt, we'll try the previous alphabetical directory entry heuristic - logrus.Infof("unable to use top_level.txt: falling back to previous alphabetical directory entry heuristic...") - - // Retrieves size for actual package/script corresponding to each dist-info metadata directory - // by taking the file entry alphabetically before it (for a package) or after it (for a script) - // var size int64 - if i-1 >= 0 && contents[i-1].Name() == packageName { - packagePath := filepath.Join(pythonPath, packageName) - size = pkgutil.GetSize(packagePath) - } else if i+1 < len(contents) && contents[i+1].Name() == packageName+".py" { - size = contents[i+1].Size() - } else { - logrus.Errorf("Could not find Python package %s for corresponding metadata info", packageName) - continue + } + } + + if metadata != nil { + scanner := bufio.NewScanner(metadata) + scanner.Split(bufio.ScanLines) + for scanner.Scan() { + line = scanner.Text() + if strings.HasPrefix(line, "Name") { + packageName = strings.Split(line, ": ")[1] + // next line is always the version + scanner.Scan() + version = strings.Split(scanner.Text(), ": ")[1] + break } } + } - 
currPackage := util.PackageInfo{Version: version, Size: size} - mapPath := strings.Replace(pythonPath, path, "", 1) - addToMap(packages, packageName, mapPath, currPackage) + // First, try to use "top_level.txt". + // Many egg packages contain a "top_level.txt" file describing the directories containing the + // required code. Combining the sizes of each of these directories should give the total size. + var size int64 + topLevelReader, err := os.Open(filepath.Join(pythonPath, fileName, "top_level.txt")) + if err == nil { + scanner := bufio.NewScanner(topLevelReader) + scanner.Split(bufio.ScanLines) + for scanner.Scan() { + // check if directory exists first, then retrieve size + contentPath := filepath.Join(pythonPath, scanner.Text()) + if _, err := os.Stat(contentPath); err == nil { + size = size + pkgutil.GetSize(contentPath) + } else if _, err := os.Stat(contentPath + ".py"); err == nil { + // sometimes the top level content is just a single python file; try this too + size = size + pkgutil.GetSize(contentPath+".py") + } + } + } else { + logrus.Debugf("unable to use top_level.txt: falling back to alphabetical directory entry heuristic...") + + // Retrieves size for actual package/script corresponding to each dist-info metadata directory + // by examining the file entries directly before and after it + if i-1 >= 0 && strings.Contains(contents[i-1].Name(), packageName) { + packagePath := filepath.Join(pythonPath, contents[i-1].Name()) + size = pkgutil.GetSize(packagePath) + } else if i+1 < len(contents) && strings.Contains(contents[i+1].Name(), packageName) { + packagePath := filepath.Join(pythonPath, contents[i+1].Name()) + size = pkgutil.GetSize(packagePath) + } else { + logrus.Errorf("failed to locate python package for corresponding package metadata %s", packageName) + continue + } } + + currPackage := util.PackageInfo{Version: version, Size: size} + mapPath := strings.Replace(pythonPath, path, "", 1) + addToMap(packages, packageName, mapPath, 
currPackage) } } diff --git a/tests/pip_analysis_expected.json b/tests/pip_analysis_expected.json index 1e1f8b65..cbff9856 100644 --- a/tests/pip_analysis_expected.json +++ b/tests/pip_analysis_expected.json @@ -3,6 +3,18 @@ "Image": "gcr.io/gcp-runtimes/pip-modified", "AnalyzeType": "Pip", "Analysis": [ + { + "Name": "argparse", + "Path": "/usr/lib/python2.7", + "Version": "1.2.1", + "Size": 89124 + }, + { + "Name": "bzr", + "Path": "/usr/lib/python2.7/dist-packages", + "Version": "2.7.0dev1", + "Size": 13063022 + }, { "Name": "configobj", "Path": "/usr/lib/python2.7/dist-packages", @@ -56,7 +68,13 @@ "Path": "/usr/local/lib/python3.6/site-packages", "Version": "0.29.0", "Size": 103509 + }, + { + "Name": "wsgiref", + "Path": "/usr/lib/python2.7", + "Version": "0.1.2", + "Size": 101007 } ] } -] \ No newline at end of file +]