#!/usr/bin/env python3
"""Summarize a repository as JSON.

Walks a directory tree (bounded by a file budget and a depth limit), counts
files per language, collects declared dependencies from common manifest files
(package.json, requirements.txt, pyproject.toml, go.mod), detects test setups
and entry points, and reports the current git branch/commit.  The summary is
written to stdout as indented JSON.
"""

import argparse
import datetime as dt
import json
import os
import re
import subprocess
import sys
from typing import Any, Dict, List, Optional, Tuple

# Directory names that are never descended into while walking the tree.
IGNORED_DIRS = {
    ".git",
    "node_modules",
    "dist",
    "build",
    ".venv",
    "venv",
    "__pycache__",
    ".mypy_cache",
    ".pytest_cache",
}

# Lower-cased file extension (with leading dot) -> display name.
LANGUAGE_BY_EXT = {
    ".py": "Python",
    ".js": "JavaScript",
    ".ts": "TypeScript",
    ".tsx": "TypeScript",
    ".jsx": "JavaScript",
    ".go": "Go",
    ".rs": "Rust",
    ".java": "Java",
    ".kt": "Kotlin",
    ".rb": "Ruby",
    ".php": "PHP",
    ".cs": "C#",
    ".c": "C",
    ".h": "C",
    ".cpp": "C++",
    ".hpp": "C++",
    ".swift": "Swift",
    ".m": "Objective-C",
    ".sh": "Shell",
    ".yml": "YAML",
    ".yaml": "YAML",
    ".json": "JSON",
    ".toml": "TOML",
    ".md": "Markdown",
}

# "<module path> v<version>" as it appears inside a go.mod require block.
_GO_REQUIREMENT_RE = re.compile(r"^[a-zA-Z0-9_.\-/]+\s+v")

# In requirements.txt a '#' starts a comment only when preceded by whitespace
# (a bare '#' can be part of a URL fragment such as "#egg=name").
_INLINE_COMMENT_RE = re.compile(r"\s+#")


def parse_args() -> argparse.Namespace:
    """Parse the command-line arguments (path, --max-files, --max-depth)."""
    parser = argparse.ArgumentParser(description="Summarize a repository into JSON.")
    parser.add_argument("path", nargs="?", default=".", help="Path to the repository.")
    parser.add_argument("--max-files", type=int, default=2000, help="Max files to scan.")
    parser.add_argument("--max-depth", type=int, default=6, help="Max directory depth.")
    return parser.parse_args()


def is_ignored_dir(name: str) -> bool:
    """Return True when *name* is a directory name listed in IGNORED_DIRS."""
    return name in IGNORED_DIRS


def detect_language(ext: str) -> str:
    """Map a lower-cased extension (e.g. ".py") to a language name.

    Unknown extensions are reported as the upper-cased extension without the
    dot; a missing or bare "." extension is reported as "UNKNOWN".
    """
    known = LANGUAGE_BY_EXT.get(ext)
    if known is not None:
        return known
    # `or` also covers ext == "." which would otherwise yield an empty name.
    return ext[1:].upper() or "UNKNOWN"


def read_json(path: str) -> Any:
    """Return the parsed JSON content of *path*, or None if unreadable/invalid."""
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError):
        # ValueError covers both JSONDecodeError and UnicodeDecodeError.
        return None


def read_text(path: str) -> str:
    """Return the UTF-8 text of *path*, or "" if it cannot be read or decoded."""
    try:
        with open(path, "r", encoding="utf-8") as f:
            return f.read()
    except (OSError, ValueError):
        return ""


def parse_requirements(text: str) -> List[str]:
    """Extract requirement specifiers from requirements.txt content.

    Blank lines, comment lines and trailing inline comments are dropped.
    pip option lines (``-r``, ``-c``, ``--index-url`` ...) are skipped because
    they are not dependencies; editable installs (``-e`` / ``--editable``) are
    kept verbatim.
    """
    deps: List[str] = []
    for raw in text.splitlines():
        line = raw.strip()
        if not line or line.startswith("#"):
            continue
        line = _INLINE_COMMENT_RE.split(line, maxsplit=1)[0].strip()
        if line.startswith("-") and not line.startswith(("-e", "--editable")):
            continue
        deps.append(line)
    return deps


def parse_go_mod(text: str) -> List[str]:
    """Extract required module paths from go.mod content.

    Handles both single-line ``require path version`` directives and
    parenthesised ``require ( ... )`` blocks.  Entries of other directives
    (``replace``, ``exclude``, ``retract``) are not reported.
    """
    deps: List[str] = []
    in_require_block = False
    for raw in text.splitlines():
        # Drop trailing comments such as "// indirect".
        line = raw.split("//", 1)[0].strip()
        if not line:
            continue
        if in_require_block:
            if line.startswith(")"):
                in_require_block = False
            elif _GO_REQUIREMENT_RE.match(line):
                deps.append(line.split()[0])
            continue
        parts = line.split()
        if parts[0] == "require(":
            in_require_block = True
        elif parts[0] == "require" and len(parts) >= 2:
            if parts[1] == "(":
                in_require_block = True
            else:
                deps.append(parts[1])
    return deps


def _string_items(value: Any) -> List[str]:
    """Return the non-empty strings of *value* if it is a list, else []."""
    if not isinstance(value, list):
        return []
    return [item for item in value if isinstance(item, str) and item]


def _table_keys(value: Any) -> List[str]:
    """Return the non-empty string keys of *value* if it is a dict, else []."""
    if not isinstance(value, dict):
        return []
    return [key for key in value if isinstance(key, str) and key]


def parse_pyproject(text: str) -> List[str]:
    """Extract dependency names/specifiers from pyproject.toml content.

    Collects ``project.dependencies``, every list under
    ``project.optional-dependencies``, and the keys of Poetry's
    ``dependencies``, ``dev-dependencies`` and ``group.<name>.dependencies``
    tables.  Returns [] when ``tomllib`` is unavailable or the text is not
    valid TOML.
    """
    try:
        import tomllib  # type: ignore
    except ImportError:
        return []
    try:
        data = tomllib.loads(text)
    except ValueError:  # tomllib.TOMLDecodeError is a ValueError
        return []

    deps: List[str] = []

    project = data.get("project")
    if isinstance(project, dict):
        deps.extend(_string_items(project.get("dependencies")))
        optional = project.get("optional-dependencies")
        if isinstance(optional, dict):
            for group_deps in optional.values():
                deps.extend(_string_items(group_deps))

    tool = data.get("tool")
    poetry = tool.get("poetry") if isinstance(tool, dict) else None
    if isinstance(poetry, dict):
        # In Poetry's main table "python" is the interpreter constraint,
        # not a package.
        deps.extend(
            name for name in _table_keys(poetry.get("dependencies")) if name != "python"
        )
        deps.extend(_table_keys(poetry.get("dev-dependencies")))
        groups = poetry.get("group")
        if isinstance(groups, dict):
            for group in groups.values():
                if isinstance(group, dict):
                    deps.extend(_table_keys(group.get("dependencies")))

    return deps


def detect_tests(root: str, package_json: dict) -> List[str]:
    """List indicators that the repository at *root* has a test setup.

    Indicators are ``dir:<name>`` for well-known test directories,
    ``file:<name>`` for test-runner config files, and ``npm_script:test`` when
    package.json declares a ``test`` script.
    """
    indicators: List[str] = []
    for name in ["tests", "test", "__tests__"]:
        if os.path.isdir(os.path.join(root, name)):
            indicators.append(f"dir:{name}")
    for file_name in ["pytest.ini", "tox.ini", "jest.config.js", "jest.config.ts"]:
        if os.path.isfile(os.path.join(root, file_name)):
            indicators.append(f"file:{file_name}")
    scripts = package_json.get("scripts") if isinstance(package_json, dict) else None
    # Require a real mapping: `"test" in "some string"` would be a substring test.
    if isinstance(scripts, dict) and "test" in scripts:
        indicators.append("npm_script:test")
    return indicators


def detect_entry_points(root: str, package_json: dict) -> List[str]:
    """List entry points declared by package.json or a top-level __main__.py."""
    entry_points: List[str] = []
    if isinstance(package_json, dict) and package_json:
        if "main" in package_json:
            entry_points.append(f"npm_main:{package_json['main']}")
        bin_field = package_json.get("bin")
        if isinstance(bin_field, dict):
            for name, val in bin_field.items():
                entry_points.append(f"npm_bin:{name}={val}")
        elif isinstance(bin_field, str):
            entry_points.append(f"npm_bin:{bin_field}")
    if os.path.isfile(os.path.join(root, "__main__.py")):
        entry_points.append("python:__main__.py")
    return entry_points


def _git_output(root: str, *args: str) -> Optional[str]:
    """Run ``git -C root <args>`` and return stripped stdout, or None on failure."""
    try:
        raw = subprocess.check_output(
            ["git", "-C", root, *args],
            stderr=subprocess.DEVNULL,
        )
    except (OSError, subprocess.SubprocessError):
        # git missing, not a repository, no commits yet, ...
        return None
    return raw.decode("utf-8", errors="replace").strip()


def git_info(root: str) -> Dict[str, str]:
    """Return the ``branch`` and ``commit`` of the repository at *root*.

    Returns {} when *root* has no ``.git`` entry; individual keys are omitted
    when the corresponding git command fails.
    """
    # `.git` is a regular file (not a directory) in worktrees and submodules.
    if not os.path.exists(os.path.join(root, ".git")):
        return {}
    queries = (
        ("branch", ("rev-parse", "--abbrev-ref", "HEAD")),
        ("commit", ("rev-parse", "HEAD")),
    )
    info: Dict[str, str] = {}
    for key, git_args in queries:
        value = _git_output(root, *git_args)
        if value is not None:
            info[key] = value
    return info


def walk_repo(root: str, max_files: int, max_depth: int) -> Tuple[int, int, int, Dict[str, int]]:
    """Walk *root* and gather size and language statistics.

    Directories in IGNORED_DIRS and anything deeper than *max_depth* levels
    below *root* are skipped.  Scanning stops once *max_files* files have been
    counted.  Directories and files are visited in sorted order so a truncated
    scan is deterministic.

    Returns ``(file_count, dir_count, total_bytes, files_per_language)``.
    """
    file_count = 0
    dir_count = 0
    total_bytes = 0
    langs: Dict[str, int] = {}
    for current, dirs, files in os.walk(root):
        rel = os.path.relpath(current, root)
        depth = 0 if rel == "." else rel.count(os.sep) + 1
        if depth > max_depth:
            dirs[:] = []
            continue
        if depth == max_depth:
            # Children would exceed the limit; do not even descend into them.
            dirs[:] = []
        else:
            dirs[:] = sorted(d for d in dirs if not is_ignored_dir(d))
        dir_count += 1
        for name in sorted(files):
            # Check the budget *before* counting so the reported file_count
            # never exceeds max_files.
            if file_count >= max_files:
                return file_count, dir_count, total_bytes, langs
            file_count += 1
            path = os.path.join(current, name)
            try:
                size = os.path.getsize(path)
            except OSError:
                # Broken symlink, permission error, file removed mid-walk.
                size = 0
            total_bytes += size
            _, ext = os.path.splitext(name)
            lang = detect_language(ext.lower())
            langs[lang] = langs.get(lang, 0) + 1
    return file_count, dir_count, total_bytes, langs


def _npm_dep_names(package_json: Dict[str, Any], key: str) -> List[str]:
    """Return the sorted package names under *key*, or [] if it is not a table."""
    section = package_json.get(key)
    if not isinstance(section, dict):
        return []
    return sorted(section.keys())


def main() -> int:
    """Build the repository summary and write it to stdout as JSON.

    Returns the process exit code: 0 on success, 2 when the given path is not
    a directory.
    """
    args = parse_args()
    root = os.path.abspath(args.path)
    if not os.path.isdir(root):
        sys.stderr.write(f"error: not a directory: {root}\n")
        return 2
    max_files = max(args.max_files, 1)
    max_depth = max(args.max_depth, 0)

    package_json = read_json(os.path.join(root, "package.json"))
    if not isinstance(package_json, dict):
        # Missing, unreadable, or not a JSON object.
        package_json = {}
    pyproject_text = read_text(os.path.join(root, "pyproject.toml"))
    requirements_text = read_text(os.path.join(root, "requirements.txt"))
    go_mod_text = read_text(os.path.join(root, "go.mod"))

    file_count, dir_count, total_bytes, langs = walk_repo(root, max_files, max_depth)

    deps = {
        "npm": _npm_dep_names(package_json, "dependencies"),
        "npm_dev": _npm_dep_names(package_json, "devDependencies"),
        "python": sorted(parse_requirements(requirements_text)),
        "python_pyproject": sorted(parse_pyproject(pyproject_text)),
        "go": sorted(parse_go_mod(go_mod_text)),
    }
    tests = detect_tests(root, package_json)
    entry_points = detect_entry_points(root, package_json)

    output = {
        "root": root,
        "generated_at": dt.datetime.now(dt.timezone.utc).isoformat().replace("+00:00", "Z"),
        "stats": {
            "file_count": file_count,
            "dir_count": dir_count,
            "total_bytes": total_bytes,
            "max_files": max_files,
            "max_depth": max_depth,
        },
        # Most frequent language first; ties broken alphabetically.
        "languages": dict(sorted(langs.items(), key=lambda x: (-x[1], x[0]))),
        "dependencies": deps,
        "tests": tests,
        "entry_points": entry_points,
        "git": git_info(root),
    }
    json.dump(output, sys.stdout, indent=2, sort_keys=False)
    sys.stdout.write("\n")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())