repo-snapshot/repo_snapshot.py
2026-03-30 18:47:42 +02:00

268 lines
7.8 KiB
Python
Executable File

#!/usr/bin/env python3
import argparse
import datetime as dt
import json
import os
import re
import subprocess
import sys
from typing import Dict, List, Tuple
# Directory names that are never descended into while walking the repository.
IGNORED_DIRS = {
    # version-control metadata
    ".git",
    # installed dependencies and build output
    "node_modules", "dist", "build",
    # virtual environments
    ".venv", "venv",
    # interpreter and tool caches
    "__pycache__", ".mypy_cache", ".pytest_cache",
}
# Lower-cased file extension (including the leading dot) -> display name of
# the language it is counted under.
LANGUAGE_BY_EXT = {
    # scripting / web
    ".py": "Python",
    ".js": "JavaScript", ".ts": "TypeScript",
    ".tsx": "TypeScript", ".jsx": "JavaScript",
    # compiled / JVM / .NET
    ".go": "Go", ".rs": "Rust",
    ".java": "Java", ".kt": "Kotlin",
    ".rb": "Ruby", ".php": "PHP", ".cs": "C#",
    # C family and Apple platforms
    ".c": "C", ".h": "C",
    ".cpp": "C++", ".hpp": "C++",
    ".swift": "Swift", ".m": "Objective-C",
    # shell, configuration and documentation
    ".sh": "Shell",
    ".yml": "YAML", ".yaml": "YAML",
    ".json": "JSON", ".toml": "TOML",
    ".md": "Markdown",
}
def parse_args() -> argparse.Namespace:
    """Parse the command line.

    Returns a namespace with ``path`` (optional positional, default ``"."``),
    ``max_files`` (int, default 2000) and ``max_depth`` (int, default 6).
    """
    cli = argparse.ArgumentParser(description="Summarize a repository into JSON.")
    cli.add_argument("path", nargs="?", default=".", help="Path to the repository.")
    # Both limits are plain integer options that differ only in name,
    # default and help text.
    int_options = (
        ("--max-files", 2000, "Max files to scan."),
        ("--max-depth", 6, "Max directory depth."),
    )
    for flag, default_value, help_text in int_options:
        cli.add_argument(flag, type=int, default=default_value, help=help_text)
    return cli.parse_args()
def is_ignored_dir(name: str) -> bool:
    """Return True when *name* is one of the entries in ``IGNORED_DIRS``."""
    ignored = name in IGNORED_DIRS
    return ignored
def detect_language(ext: str) -> str:
    """Map a file extension to a language label.

    Args:
        ext: Extension including its leading dot (e.g. ``".py"``), or ``""``
            for a file with no extension.

    Returns:
        The name from ``LANGUAGE_BY_EXT`` when the extension is known,
        otherwise the extension without its first character, upper-cased.
        ``"UNKNOWN"`` is returned when nothing is left to use as a label.
    """
    known = LANGUAGE_BY_EXT.get(ext)
    if known is not None:
        return known
    # os.path.splitext("name.") yields "." as the extension; dropping the
    # dot leaves an empty label, so treat that like "no extension".
    suffix = ext[1:]
    return suffix.upper() if suffix else "UNKNOWN"
def read_json(path: str):
    """Load a JSON document from *path*, best-effort.

    Args:
        path: File to read.

    Returns:
        The decoded JSON value, or ``None`` when the file cannot be opened,
        decoded or parsed. Failures are swallowed on purpose: a missing or
        broken manifest must not abort the snapshot.
    """
    try:
        # "utf-8-sig" reads plain UTF-8 and also drops a leading byte-order
        # mark, which json.load() would otherwise reject.
        with open(path, "r", encoding="utf-8-sig") as handle:
            return json.load(handle)
    except Exception:
        return None
def read_text(path: str) -> str:
    """Read a text file from *path*, best-effort.

    Args:
        path: File to read.

    Returns:
        The file contents, or ``""`` when the file cannot be opened or
        decoded. Failures are swallowed on purpose so an absent manifest
        simply yields no data.
    """
    try:
        # "utf-8-sig" reads plain UTF-8 and strips a leading byte-order mark
        # so it does not end up glued to the first line of the text.
        with open(path, "r", encoding="utf-8-sig") as handle:
            return handle.read()
    except Exception:
        return ""
def parse_requirements(text: str) -> List[str]:
    """Extract requirement lines from the text of a requirements file.

    Blank lines and lines starting with ``#`` are skipped. A trailing
    comment (whitespace followed by ``#``) is removed from a requirement
    line; a ``#`` with no whitespace before it, as in a ``#egg=name`` URL
    fragment, is kept. Every other non-empty line, including option lines
    such as ``-r other.txt``, is returned as written.

    Args:
        text: Full contents of the requirements file.

    Returns:
        The requirement strings in file order.
    """
    deps: List[str] = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        # Keep only the part before the first "<whitespace>#".
        requirement = re.split(r"\s+#", line, maxsplit=1)[0]
        if requirement:
            deps.append(requirement)
    return deps
def parse_go_mod(text: str) -> List[str]:
    """Extract the module paths named by ``require`` directives in a go.mod.

    Both forms are handled: the single-line ``require path version`` and the
    parenthesised ``require ( ... )`` block. Lines belonging to any other
    directive (``module``, ``go``, ``retract``, ``exclude``, ``replace``,
    ...) are ignored, as is the opening ``require (`` line itself.

    Args:
        text: Full contents of the go.mod file.

    Returns:
        Required module paths in file order, without versions.
    """
    deps: List[str] = []
    in_require_block = False
    for raw_line in text.splitlines():
        # Drop trailing comments such as "// indirect" before tokenising.
        tokens = raw_line.split("//", 1)[0].split()
        if not tokens:
            continue
        if in_require_block:
            if tokens[0].startswith(")"):
                in_require_block = False
            elif len(tokens) >= 2:
                # "<module path> <version>"
                deps.append(tokens[0])
            continue
        if tokens[0] == "require(":
            in_require_block = True
        elif tokens[0] == "require" and len(tokens) >= 2:
            if tokens[1] == "(":
                # Opening line of a block, not a dependency.
                in_require_block = True
            else:
                deps.append(tokens[1])
    return deps
def parse_pyproject(text: str) -> List[str]:
    """Collect dependency entries declared in a pyproject.toml document.

    Sources read, in this order:
      * ``project.dependencies`` (requirement strings)
      * every list under ``project.optional-dependencies``
      * keys of ``tool.poetry.dependencies`` and
        ``tool.poetry.dev-dependencies``
      * keys of every ``tool.poetry.group.<name>.dependencies`` table

    The ``python`` key in Poetry tables is skipped: it is the interpreter
    version constraint, not a package. Values of an unexpected type are
    ignored instead of raising.

    Args:
        text: Full contents of the pyproject.toml file.

    Returns:
        Non-empty dependency strings. ``[]`` when ``tomllib`` cannot be
        imported or the text is not valid TOML.
    """
    try:
        import tomllib  # type: ignore
    except Exception:
        return []
    try:
        data = tomllib.loads(text)
    except Exception:
        return []

    def table(parent: dict, key: str) -> dict:
        """Return parent[key] when it is a table, otherwise an empty dict."""
        value = parent.get(key)
        return value if isinstance(value, dict) else {}

    found: List[str] = []

    project = table(data, "project")
    declared = project.get("dependencies")
    if isinstance(declared, list):
        found.extend(declared)
    for extra in table(project, "optional-dependencies").values():
        if isinstance(extra, list):
            found.extend(extra)

    poetry = table(table(data, "tool"), "poetry")
    poetry_tables = [table(poetry, "dependencies"), table(poetry, "dev-dependencies")]
    for group in table(poetry, "group").values():
        if isinstance(group, dict):
            poetry_tables.append(table(group, "dependencies"))
    for deps_table in poetry_tables:
        found.extend(name for name in deps_table if name != "python")

    return [dep for dep in found if dep and isinstance(dep, str)]
def detect_tests(root: str, package_json: dict) -> List[str]:
    """List the signs of a test setup found directly under *root*.

    Args:
        root: Repository root directory.
        package_json: Parsed package.json, or a falsy value when absent.

    Returns:
        Tags in a fixed order: ``dir:<name>`` for each existing test
        directory, ``file:<name>`` for each existing test-runner config
        file, then ``npm_script:test`` when package.json has a ``test``
        script.
    """
    dir_tags = [
        f"dir:{candidate}"
        for candidate in ("tests", "test", "__tests__")
        if os.path.isdir(os.path.join(root, candidate))
    ]
    file_tags = [
        f"file:{candidate}"
        for candidate in ("pytest.ini", "tox.ini", "jest.config.js", "jest.config.ts")
        if os.path.isfile(os.path.join(root, candidate))
    ]
    found = dir_tags + file_tags

    manifest = package_json if package_json else {}
    npm_scripts = manifest.get("scripts") or {}
    if "test" in npm_scripts:
        found.append("npm_script:test")
    return found
def detect_entry_points(root: str, package_json: dict) -> List[str]:
    """List the entry points advertised by package.json and by *root*.

    Args:
        root: Repository root directory.
        package_json: Parsed package.json, or a falsy value when absent.

    Returns:
        Tags in this order: ``npm_main:<value>`` when ``main`` is present;
        one ``npm_bin:<name>=<value>`` per entry when ``bin`` is a mapping,
        or a single ``npm_bin:<value>`` when it is a string; then
        ``python:__main__.py`` when that file exists in *root*.
    """
    found: List[str] = []
    manifest = package_json or {}

    if "main" in manifest:
        found.append(f"npm_main:{manifest['main']}")

    bin_field = manifest.get("bin")
    if isinstance(bin_field, str):
        found.append(f"npm_bin:{bin_field}")
    elif isinstance(bin_field, dict):
        found.extend(f"npm_bin:{name}={val}" for name, val in bin_field.items())

    main_module = os.path.join(root, "__main__.py")
    if os.path.isfile(main_module):
        found.append("python:__main__.py")
    return found
def git_info(root: str) -> Dict[str, str]:
    """Return the current branch and commit of the repository at *root*.

    Args:
        root: Directory expected to be the top of a git work tree.

    Returns:
        A dict with ``"branch"`` (output of
        ``git rev-parse --abbrev-ref HEAD``) and ``"commit"`` (output of
        ``git rev-parse HEAD``). A key is omitted when its command fails or
        git cannot be run. ``{}`` when *root* has no ``.git`` entry.
    """
    # `.git` is a directory in an ordinary clone but a regular file in
    # linked worktrees and submodules, so test for existence, not isdir.
    if not os.path.exists(os.path.join(root, ".git")):
        return {}

    queries = (
        ("branch", ("rev-parse", "--abbrev-ref", "HEAD")),
        ("commit", ("rev-parse", "HEAD")),
    )
    info: Dict[str, str] = {}
    for key, git_args in queries:
        try:
            raw = subprocess.check_output(
                ["git", "-C", root, *git_args],
                stderr=subprocess.DEVNULL,
            )
            info[key] = raw.decode().strip()
        except Exception:
            # Best-effort: a missing git binary or a failing command only
            # leaves this key out of the result.
            continue
    return info
def walk_repo(root: str, max_files: int, max_depth: int) -> Tuple[int, int, int, Dict[str, int]]:
    """Walk *root* and tally files, directories, bytes and languages.

    Directories whose name is ignored (per ``is_ignored_dir``) are not
    entered, and nothing deeper than *max_depth* levels below *root* is
    visited (the root itself is depth 0).

    Args:
        root: Directory to walk.
        max_files: Maximum number of files to measure.
        max_depth: Maximum directory depth to visit.

    Returns:
        ``(file_count, dir_count, total_bytes, langs)`` where ``langs`` maps
        a language label to its file count. When the file cap is hit the
        walk stops immediately and ``file_count`` is ``max_files + 1``: the
        file that tripped the cap is counted but not measured, so a value
        above ``max_files`` signals a truncated scan.
    """
    file_count = 0
    dir_count = 0
    total_bytes = 0
    langs: Dict[str, int] = {}
    for current, dirs, files in os.walk(root):
        rel = os.path.relpath(current, root)
        depth = 0 if rel == "." else rel.count(os.sep) + 1
        if depth > max_depth:
            # With pruning below this only triggers for the root itself,
            # when max_depth is negative.
            dirs[:] = []
            continue
        if depth == max_depth:
            # Children would exceed the limit: prune them now so os.walk
            # never lists directories whose contents would be discarded.
            dirs[:] = []
        else:
            dirs[:] = [d for d in dirs if not is_ignored_dir(d)]
        dir_count += 1
        for name in files:
            file_count += 1
            if file_count > max_files:
                return file_count, dir_count, total_bytes, langs
            path = os.path.join(current, name)
            try:
                size = os.path.getsize(path)
            except Exception:
                # Unreadable entries (e.g. dangling symlinks) count as 0 bytes.
                size = 0
            total_bytes += size
            _, ext = os.path.splitext(name)
            lang = detect_language(ext.lower())
            langs[lang] = langs.get(lang, 0) + 1
    return file_count, dir_count, total_bytes, langs
def main() -> int:
    """Build the repository snapshot and print it to stdout as JSON.

    Reads the command line, gathers manifest data (package.json,
    pyproject.toml, requirements.txt, go.mod), walks the tree for
    statistics, and writes one indented JSON document followed by a newline.

    Returns:
        Process exit status, always 0.
    """
    args = parse_args()
    root = os.path.abspath(args.path)
    # Clamp the limits so at least one file and the root level are scanned.
    max_files = max(args.max_files, 1)
    max_depth = max(args.max_depth, 0)

    package_json = read_json(os.path.join(root, "package.json"))
    if not isinstance(package_json, dict):
        # Missing, unparsable, or a JSON value that is not an object.
        package_json = {}
    pyproject_text = read_text(os.path.join(root, "pyproject.toml"))
    requirements_text = read_text(os.path.join(root, "requirements.txt"))
    go_mod_text = read_text(os.path.join(root, "go.mod"))

    file_count, dir_count, total_bytes, langs = walk_repo(root, max_files, max_depth)

    def npm_names(field: str) -> List[str]:
        """Sorted package names of a package.json dependency table."""
        table = package_json.get(field)
        return sorted(table) if isinstance(table, dict) else []

    deps = {
        "npm": npm_names("dependencies"),
        "npm_dev": npm_names("devDependencies"),
        "python": sorted(parse_requirements(requirements_text)),
        "python_pyproject": sorted(parse_pyproject(pyproject_text)),
        "go": sorted(parse_go_mod(go_mod_text)),
    }
    tests = detect_tests(root, package_json)
    entry_points = detect_entry_points(root, package_json)

    output = {
        "root": root,
        "generated_at": dt.datetime.now(dt.timezone.utc).isoformat().replace("+00:00", "Z"),
        "stats": {
            "file_count": file_count,
            "dir_count": dir_count,
            "total_bytes": total_bytes,
            "max_files": max_files,
            "max_depth": max_depth,
        },
        # Most frequent language first; ties broken alphabetically.
        "languages": dict(sorted(langs.items(), key=lambda x: (-x[1], x[0]))),
        "dependencies": deps,
        "tests": tests,
        "entry_points": entry_points,
        "git": git_info(root),
    }
    json.dump(output, sys.stdout, indent=2, sort_keys=False)
    sys.stdout.write("\n")
    return 0
# Run only when executed as a script; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())