# Mirror of https://github.com/rasbt/LLMs-from-scratch.git (synced 2025-11-22 21:16:55 +00:00)
# 159 lines, 4.6 KiB, Python
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt)
|
|
# Source for "Build a Reasoning Model (From Scratch)": https://mng.bz/lZ5B
|
|
# Code repository: https://github.com/rasbt/reasoning-from-scratch
|
|
|
|
# Verify that Python source files (and optionally notebooks) use double quotes for strings.
|
|
|
|
import argparse
|
|
import ast
|
|
import io
|
|
import json
|
|
import sys
|
|
import tokenize
|
|
from pathlib import Path
|
|
|
|
# Directory names that cause a path to be skipped by should_skip() when they
# appear as any component of that path.
EXCLUDED_DIRS = {
    ".git", ".hg", ".svn",
    ".mypy_cache", ".pytest_cache", ".ruff_cache",
    ".tox", ".venv", "__pycache__",
    "build", "dist", "node_modules",
}

# Lowercase string-literal prefix letters recognised by the quote checker.
PREFIX_CHARS = set("rufb")

# Quote delimiters, single and tripled.
SINGLE_QUOTE = "'"
DOUBLE_QUOTE = "\""
TRIPLE_SINGLE = 3 * SINGLE_QUOTE
TRIPLE_DOUBLE = 3 * DOUBLE_QUOTE
|
|
|
|
|
|
def should_skip(path):
    """Return True when any component of ``path.parts`` is in EXCLUDED_DIRS."""
    return not EXCLUDED_DIRS.isdisjoint(path.parts)
|
|
|
|
|
|
def collect_fstring_expr_string_positions(source):
    """
    Return set of (lineno, col_offset) for string literals that appear inside
    formatted expressions of f-strings. These should be exempt from the double
    quote check, since enforcing double quotes there is unnecessarily strict.

    Args:
        source: Python source code as a string.

    Returns:
        A set of ``(lineno, col_offset)`` tuples taken from the ``ast`` nodes;
        an empty set when ``source`` does not parse (``SyntaxError``).
    """
    try:
        tree = ast.parse(source)
    except SyntaxError:
        return set()

    # ``ast.Str`` is only produced by the parser before Python 3.8. Merely
    # touching the attribute warns on 3.12+ and raises AttributeError once the
    # class is removed (3.14), so look it up only where it is actually needed.
    legacy_str = ast.Str if sys.version_info < (3, 8) else None

    positions = set()

    class Collector(ast.NodeVisitor):
        def visit_JoinedStr(self, node):
            for value in node.values:
                if isinstance(value, ast.FormattedValue):
                    self._collect_from_expr(value.value)
            # Continue walking to catch nested f-strings within expressions
            self.generic_visit(node)

        def _collect_from_expr(self, node):
            is_string = isinstance(node, ast.Constant) and isinstance(node.value, str)
            if not is_string and legacy_str is not None:
                is_string = isinstance(node, legacy_str)  # Python <3.8 compatibility

            if is_string:
                positions.add((node.lineno, node.col_offset))
                return

            # Not a string literal itself: look for literals in sub-expressions.
            for child in ast.iter_child_nodes(node):
                self._collect_from_expr(child)

    Collector().visit(tree)
    return positions
|
|
|
|
|
|
def check_quotes_in_source(source, path):
    """
    Report single-quoted string literals found in ``source``.

    Triple-quoted strings (with or without a prefix) are ignored, and so are
    literals that sit inside the replacement fields of an f-string.

    Args:
        source: Python source code to scan.
        path: Label placed at the front of every violation message.

    Returns:
        A list of ``"{path}:{line}:{col}: uses single quotes"`` messages in
        token order; empty when nothing is flagged.

    Errors raised by ``tokenize`` for malformed source are not caught here and
    propagate to the caller.
    """
    violations = []
    ignored_positions = collect_fstring_expr_string_positions(source)

    # Since Python 3.12 (PEP 701) an f-string is no longer one STRING token:
    # its prefix and opening quote arrive as FSTRING_START and the literal is
    # closed by FSTRING_END. Older interpreters lack these constants.
    fstring_start = getattr(tokenize, "FSTRING_START", None)
    fstring_end = getattr(tokenize, "FSTRING_END", None)
    fstring_depth = 0  # number of f-strings currently open

    prefix_letters = "".join(PREFIX_CHARS)

    tokens = tokenize.generate_tokens(io.StringIO(source).readline)
    for tok_type, tok_str, start, _, _ in tokens:
        if fstring_end is not None and tok_type == fstring_end:
            fstring_depth -= 1
            continue

        is_fstring_start = fstring_start is not None and tok_type == fstring_start
        if not is_fstring_start and tok_type != tokenize.STRING:
            continue

        # Decide on exemption before counting this token's own f-string.
        inside_fstring = fstring_depth > 0
        if is_fstring_start:
            fstring_depth += 1
        if inside_fstring or start in ignored_positions:
            continue

        # Strip the whole prefix (it may have two letters, e.g. rb or fr) so
        # the text starts at the opening quote.
        opening = tok_str.lower().lstrip(prefix_letters)

        # ignore triple-quoted strings; checked after prefix removal so that
        # prefixed ones such as r'''...''' are ignored as well
        if opening.startswith((TRIPLE_DOUBLE, TRIPLE_SINGLE)):
            continue

        # report if not using double quotes
        if opening.startswith(SINGLE_QUOTE):
            line, col = start
            violations.append(f"{path}:{line}:{col}: uses single quotes")
    return violations
|
|
|
|
|
|
def check_file(path):
    """
    Check one file and return its list of violation messages.

    Paths with the ``.ipynb`` suffix go to check_notebook(); every other path
    is read as UTF-8 text and passed to check_quotes_in_source(). Any exception
    raised along the way is turned into a single "failed to check" message
    instead of propagating.
    """
    try:
        if path.suffix != ".ipynb":
            source = path.read_text(encoding="utf-8")
            return check_quotes_in_source(source, path)
        return check_notebook(path)
    except Exception as exc:
        return [f"{path}: failed to check ({exc})"]
|
|
|
|
|
|
def check_notebook(path):
    """
    Check the code cells of a Jupyter notebook for single-quoted strings.

    The notebook is loaded as UTF-8 JSON. For every entry of ``"cells"`` whose
    ``"cell_type"`` is ``"code"``, the ``"source"`` pieces are joined and
    passed to check_quotes_in_source(). Returns the combined violation list.
    """
    with open(path, encoding="utf-8") as handle:
        notebook = json.load(handle)

    found = []
    for cell in notebook.get("cells", []):
        if cell.get("cell_type") != "code":
            continue
        cell_source = "".join(cell.get("source", []))
        found.extend(check_quotes_in_source(cell_source, path))
    return found
|
|
|
|
|
|
def parse_args():
    """
    Parse the command-line options.

    Returns:
        The ``argparse`` namespace; ``include_notebooks`` is True only when
        ``--include-notebooks`` was given.
    """
    cli = argparse.ArgumentParser(description="Verify double-quoted string literals.")
    cli.add_argument(
        "--include-notebooks",
        help="Also scan Jupyter notebooks (.ipynb files) for single-quoted strings.",
        action="store_true",
    )
    namespace = cli.parse_args()
    return namespace
|
|
|
|
|
|
def main():
    """
    Scan the current working directory tree and print quote violations.

    All ``*.py`` files are checked, plus ``*.ipynb`` files when
    ``--include-notebooks`` is given. Files for which should_skip() is true
    are left out.

    Returns:
        1 when at least one violation was found, 0 otherwise (used as the
        process exit status).
    """
    args = parse_args()
    project_root = Path(".").resolve()
    py_files = sorted(project_root.rglob("*.py"))
    notebook_files = sorted(project_root.rglob("*.ipynb")) if args.include_notebooks else []

    violations = []
    for path in py_files + notebook_files:
        # Match excluded names only below the project root. The resolved
        # absolute path also contains the root's ancestors, so a checkout
        # living under e.g. ".../build/" would otherwise skip every file.
        if should_skip(path.relative_to(project_root)):
            continue
        violations.extend(check_file(path))

    if violations:
        print("\n".join(violations))
        print(f"\n{len(violations)} violations found.")
        return 1

    print("All files use double quotes correctly.")
    return 0
|
|
|
|
|
|
# Run the checker when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|