I had to package a nontrivial Python codebase, and I needed to put dependencies
in `setup.py`.
I could do `git grep -h import | sort -u`, then review the output by hand, but
I lacked the motivation for it. Much better to take a stab at solving the
general problem.
The result is at https://github.com/spanezz/python-devel-tools.
One fun part is scanning a directory tree, using `ast` to find `import`
statements scattered around the code:
class Scanner:
    """Collect the names of the modules imported by Python code in a tree.

    Every module name found is added to ``self.names``, exactly as written in
    the ``import`` / ``from ... import`` statement (dotted names included).
    """

    def __init__(self):
        # Module names seen so far, across all scanned files
        self.names: Set[str] = set()

    def scan_dir(self, root: str):
        """Scan all Python sources found under the directory ``root``.

        A file is scanned if its name ends in ``.py``, or if it is executable
        and its first line matches ``re_python_shebang``.
        """
        for dirpath, _dirnames, filenames, dir_fd in os.fwalk(root):
            for fn in filenames:
                pathname = os.path.join(dirpath, fn)

                if fn.endswith(".py"):
                    with dirfd_open(fn, dir_fd=dir_fd) as fd:
                        self.scan_file(fd, pathname)
                    # Already scanned: skip the shebang check, which would
                    # otherwise parse an executable .py script a second time
                    continue

                try:
                    st = os.stat(fn, dir_fd=dir_fd)
                except OSError as e:
                    # For example a dangling symlink: skip it instead of
                    # aborting the whole scan
                    log.warning("%s: cannot stat file", pathname, exc_info=e)
                    continue

                if not st.st_mode & (stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH):
                    continue

                # Executable without a .py extension: scan it only if it
                # starts with a Python shebang
                with dirfd_open(fn, dir_fd=dir_fd) as fd:
                    try:
                        lead = fd.readline()
                    except UnicodeDecodeError:
                        # Not decodable as text: most likely a binary
                        continue
                    if re_python_shebang.match(lead):
                        fd.seek(0)
                        self.scan_file(fd, pathname)

    def scan_file(self, fd: TextIO, pathname: str):
        """Parse the Python source read from ``fd`` and record its imports.

        ``pathname`` is used for logging and as the file name given to the
        parser. Files that cannot be read as text or parsed are logged and
        skipped.
        """
        log.info("Reading file %s", pathname)
        try:
            tree = ast.parse(fd.read(), pathname)
        except (SyntaxError, ValueError) as e:
            # ValueError covers UnicodeDecodeError raised by fd.read() when
            # undecodable bytes appear, and source containing null bytes
            log.warning("%s: file cannot be parsed", pathname, exc_info=e)
            return

        self.scan_tree(tree)

    def scan_tree(self, tree: ast.AST):
        """Record the module names of every import statement inside ``tree``.

        The whole tree is visited, so imports nested in functions, classes,
        ``if``/``else`` branches, ``try``/``except``/``else``/``finally``
        clauses and so on are all found.
        """
        for node in ast.walk(tree):
            if isinstance(node, ast.Import):
                self.names.update(alias.name for alias in node.names)
            elif isinstance(node, ast.ImportFrom):
                # module is None for "from . import name".
                # NOTE: other relative imports ("from .mod import name") are
                # recorded by their bare module name.
                if node.module is not None:
                    self.names.add(node.module)
Another fun part is grouping the imported module names by where in sys.path they have been found:
# Collect the names of all modules imported by the code under args.dir
scanner = Scanner()
scanner.scan_dir(args.dir)

# Let modules that live in the scanned tree be found as well
sys.path.append(args.dir)

# Group module names by the sys.path entry under which they are found.
# The "" key collects the names that could not be located.
by_sys_path: Dict[str, List[str]] = collections.defaultdict(list)
for name in sorted(scanner.names):
    try:
        spec = importlib.util.find_spec(name)
    except (ImportError, ValueError):
        # For dotted names find_spec imports the parent package first, and
        # raises if that fails: treat the module as not found
        spec = None

    if spec is None or spec.origin is None:
        by_sys_path[""].append(name)
        continue

    for sp in sys.path:
        # An empty entry (current directory) would be a prefix of everything
        if not sp:
            continue
        # Match on a whole path component, so that /usr/lib/python3 does not
        # claim modules found under /usr/lib/python3.9
        if spec.origin.startswith(os.path.join(sp, "")):
            by_sys_path[sp].append(name)
            break
    else:
        # Not under any sys.path entry (e.g. "built-in"): group by origin
        by_sys_path[spec.origin].append(name)

for sys_path, names in sorted(by_sys_path.items()):
    print(f"{sys_path or 'unidentified'}:")
    for name in names:
        print(f" {name}")
An example. It's kind of nice how it can at least tell apart stdlib modules so one doesn't need to read through those:
$ ./scan-imports …/himblick
unidentified:
changemonitor
chroot
cmdline
mediadir
player
server
settings
static
syncer
utils
…/himblick:
himblib.cmdline
himblib.host_setup
himblib.player
himblib.sd
/usr/lib/python3.9:
__future__
argparse
asyncio
collections
configparser
contextlib
datetime
io
json
logging
mimetypes
os
pathlib
re
secrets
shlex
shutil
signal
subprocess
tempfile
textwrap
typing
/usr/lib/python3/dist-packages:
asyncssh
parted
progressbar
pyinotify
setuptools
tornado
tornado.escape
tornado.httpserver
tornado.ioloop
tornado.netutil
tornado.web
tornado.websocket
yaml
built-in:
sys
time
Maybe such a tool already exists and works much better than this? From a quick search I didn't find it, and it was fun to (re)invent it.
Updates:
Jakub Wilk pointed me to an old `python-modules` script that finds Debian dependencies.
The AST scanning code should be refactored to use ast.NodeVisitor.