from pathlib import Path
import pathspec
import re
from repository.models import File
def load_ignore_files(base: Path, filenames: list[str]) -> pathspec.PathSpec:
    """Build one ``PathSpec`` from several gitignore-style files.

    Args:
        base: Directory in which the ignore files are looked up.
        filenames: Names of ignore files relative to ``base``. Files that
            are missing (or are not regular files) are skipped silently.

    Returns:
        A ``PathSpec`` compiled with git wildmatch semantics from the
        concatenated lines of every ignore file found, in the order given.
    """
    patterns: list[str] = []
    for name in filenames:
        path = base / name
        # is_file() rather than exists(): a directory with this name would
        # otherwise make read_text() raise.
        if not path.is_file():
            continue
        # Decode explicitly as UTF-8 so the result does not depend on the
        # machine's locale (matches how file contents are read elsewhere
        # in this module).
        patterns.extend(path.read_text(encoding="utf-8").splitlines())
    return pathspec.PathSpec.from_lines(
        pathspec.patterns.GitWildMatchPattern,
        patterns,
    )
def strip_comments(content: str, file_extension: str) -> str:
    """Remove comments from ``content`` based on the file type.

    The removal is regex-based and deliberately simple; it does not parse
    the language, so comment markers inside string literals are treated as
    comments too.

    * ``.py``, ``.sh``, ``.yml``, ``.yaml``: whole-line ``#`` comments are
      blanked (inline trailing comments are kept).
    * ``.js``: whole-line ``//`` comments are blanked and ``/* ... */``
      blocks are removed.
    * ``.css``: ``/* ... */`` blocks are removed.
    * ``.html``, ``.htm``, ``.xml``, ``.xhtml``, ``.svg``: ``<!-- ... -->``
      blocks are removed.

    For every non-empty input, regardless of extension, trailing whitespace
    is stripped from each line and lines are re-joined with ``"\\n"`` (so a
    final newline is dropped).

    Args:
        content: The file's text.
        file_extension: Suffix including the leading dot (e.g. ``".py"``);
            compared case-insensitively.

    Returns:
        The processed text; empty/falsy ``content`` is returned unchanged.
    """
    if not content:
        return content

    extension = file_extension.lower()

    # NOTE: the indent is matched with [^\S\n]* (whitespace except newline).
    # A plain \s* can start on a preceding blank line and swallow it along
    # with the comment, which silently changes the number of lines.
    if extension in ('.py', '.sh', '.yml', '.yaml'):
        content = re.sub(r'^[^\S\n]*#.*$', '', content, flags=re.MULTILINE)
    elif extension in ['.js']:
        content = re.sub(r'^[^\S\n]*//.*$', '', content, flags=re.MULTILINE)
        content = re.sub(r'/\*[\s\S]*?\*/', '', content)
    elif extension == '.css':
        content = re.sub(r'/\*[\s\S]*?\*/', '', content)
    elif extension in ['.html', '.htm', '.xml', '.xhtml', '.svg']:
        content = re.sub(r'<!--[\s\S]*?-->', '', content)

    # Normalise trailing whitespace left behind by the removals.
    return '\n'.join(line.rstrip() for line in content.splitlines())
def load_single_file(file_path: Path, base: Path) -> None:
    """Create or update the ``File`` row for one filesystem entry.

    The row is keyed on the entry's POSIX-style path relative to ``base``.
    Directories are stored with empty content. Regular files are read as
    UTF-8 and passed through ``strip_comments``; files that cannot be
    decoded as UTF-8 are stored with empty content.

    Args:
        file_path: Path of the file or directory to store; must be located
            under ``base``.
        base: Root directory that stored paths are made relative to.
    """
    relative = file_path.relative_to(base).as_posix()
    directory_flag = file_path.is_dir()

    text = ""
    if not directory_flag:
        try:
            raw = file_path.read_text(encoding="utf-8")
        except UnicodeDecodeError:
            # Not valid UTF-8 (most likely binary): keep the row, drop the body.
            text = ""
        else:
            text = strip_comments(raw, file_path.suffix.lower())

    fields = {
        "name": file_path.name,
        "content": text,
        "is_directory": directory_flag,
    }
    File.objects.update_or_create(path=relative, defaults=fields)
    print(f"Loaded: {relative}")
def load_files() -> None:
    """Synchronise the ``File`` table with the contents of ``./src``.

    Walks ``src`` (resolved against the current working directory)
    recursively, skips every entry matched by the patterns in the repo-root
    ``.gitignore`` / ``.repoignore``, stores the rest via
    ``load_single_file``, and finally deletes ``File`` rows whose path was
    not seen during the walk.
    """
    repo_root = Path(".").resolve()
    base = repo_root / "src"
    ignore = load_ignore_files(
        repo_root,
        [".gitignore", ".repoignore"],
    )

    found_paths = set()
    for file_path in base.rglob("*"):
        rel = file_path.relative_to(base).as_posix()
        # Patterns are written relative to the repo root, hence the prefix.
        match_path = f"src/{rel}"
        if file_path.is_dir():
            # Directory-only patterns such as "build/" match a path only
            # when it ends with a slash; without it the ignored directory
            # itself would still be stored even though its children are
            # skipped.
            match_path += "/"
        if ignore.match_file(match_path):
            continue
        load_single_file(file_path, base)
        found_paths.add(rel)

    # Remove rows for entries that no longer exist or are now ignored.
    db_paths = set(File.objects.values_list('path', flat=True))
    paths_to_delete = db_paths - found_paths
    if paths_to_delete:
        File.objects.filter(path__in=paths_to_delete).delete()
        print(f"Deleted {len(paths_to_delete)} files: {paths_to_delete}")