Doing some cleaning

This commit is contained in:
Jake Poznanski 2025-09-03 18:41:36 +00:00
parent 94d19c51c6
commit f20f1a0b54

View File

@@ -98,10 +98,9 @@ def check_single_page_pdf(pdf_path: Path) -> bool:
def find_document_pairs(input_dir: Path, verbose: bool = False) -> List[DocumentPair]: def find_document_pairs(input_dir: Path, verbose: bool = False) -> List[DocumentPair]:
"""Find all MD files with corresponding single-page PDF files.""" """Find all MD files with corresponding PDF files."""
pairs = [] pairs = []
skipped_no_pdf = 0 skipped_no_pdf = 0
skipped_multi_page = 0
for md_path in input_dir.rglob("*.md"): for md_path in input_dir.rglob("*.md"):
# Check for corresponding PDF # Check for corresponding PDF
@@ -112,18 +111,11 @@ def find_document_pairs(input_dir: Path, verbose: bool = False) -> List[Document
skipped_no_pdf += 1 skipped_no_pdf += 1
continue continue
# Check if PDF has exactly one page
if not check_single_page_pdf(pdf_path):
if verbose:
print(f"Warning: Skipping multi-page PDF {pdf_path}")
skipped_multi_page += 1
continue
relative_path = md_path.relative_to(input_dir) relative_path = md_path.relative_to(input_dir)
pairs.append(DocumentPair(md_path, pdf_path, relative_path)) pairs.append(DocumentPair(md_path, pdf_path, relative_path))
if skipped_no_pdf > 0 or skipped_multi_page > 0: if skipped_no_pdf > 0:
print(f"Skipped {skipped_no_pdf} files without PDFs and {skipped_multi_page} multi-page PDFs") print(f"Skipped {skipped_no_pdf} files without PDFs")
return pairs return pairs
@@ -159,12 +151,13 @@ def clean_document_with_chatgpt(
"You are an expert at cleaning and correcting OCR transcriptions. " "You are an expert at cleaning and correcting OCR transcriptions. "
"You will be given an OCR transcription and an image of the original PDF page. " "You will be given an OCR transcription and an image of the original PDF page. "
"Your task is to:\n" "Your task is to:\n"
"1. Fix OCR errors and typos\n" "1. Correct formatting issues.\n"
"2. Correct formatting issues\n" "2. Preserve the exact spelling of words from the original document.\n"
"3. Restore proper punctuation and capitalization\n" "3. Remove any original transcriber's marks and notes, usually indicated by [ and ] symbols.\n"
"4. Fix word breaks and line breaks\n" "4. Fix word breaks and line breaks\n"
"5. Ensure mathematical formulas and special characters are correct\n" "5. Ensure mathematical formulas and special characters are correct\n"
"6. Maintain the semantic structure of the document\n" "6. Maintain the semantic structure of the document\n"
"7. Remove any headers or footers that are not semantically relevant to the main document contents, ex page numbers\n"
"Return a cleaned version that accurately represents the original document." "Return a cleaned version that accurately represents the original document."
) )
} }
@@ -196,7 +189,7 @@ def clean_document_with_chatgpt(
messages=messages, # type: ignore messages=messages, # type: ignore
response_format=CleanedDocument, response_format=CleanedDocument,
temperature=0.2, # Lower temperature for more consistent cleaning temperature=0.2, # Lower temperature for more consistent cleaning
max_tokens=16384 max_tokens=32000,
) )
parsed_result = response.choices[0].message.parsed parsed_result = response.choices[0].message.parsed
@@ -224,6 +217,10 @@ def process_document(
return True, f"Skipped (already exists): {doc_pair.relative_path}" return True, f"Skipped (already exists): {doc_pair.relative_path}"
try: try:
# Check if PDF has exactly one page
if not check_single_page_pdf(doc_pair.pdf_path):
return False, f"Skipped multi-page PDF: {doc_pair.pdf_path}"
# Read the markdown content # Read the markdown content
md_content = doc_pair.md_path.read_text(encoding='utf-8') md_content = doc_pair.md_path.read_text(encoding='utf-8')
@@ -241,6 +238,18 @@ def process_document(
# Write cleaned text # Write cleaned text
output_path.write_text(cleaned_result.cleaned_text, encoding='utf-8') output_path.write_text(cleaned_result.cleaned_text, encoding='utf-8')
# Create soft link for the original MD file as .md.orig
orig_md_link_path = output_path.with_suffix('.md.orig')
if orig_md_link_path.exists() or orig_md_link_path.is_symlink():
orig_md_link_path.unlink()
orig_md_link_path.symlink_to(doc_pair.md_path.absolute())
# Create soft link for the PDF file
pdf_link_path = output_dir / doc_pair.relative_path.with_suffix('.pdf')
if pdf_link_path.exists() or pdf_link_path.is_symlink():
pdf_link_path.unlink()
pdf_link_path.symlink_to(doc_pair.pdf_path.absolute())
# Also write metadata # Also write metadata
metadata_path = output_path.with_suffix('.json') metadata_path = output_path.with_suffix('.json')
metadata = { metadata = {
@@ -280,10 +289,10 @@ def main():
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
# Find all document pairs (single-page PDFs only) # Find all document pairs
print(f"Scanning {input_dir} for single-page document pairs...") print(f"Scanning {input_dir} for document pairs...")
doc_pairs = find_document_pairs(input_dir, args.verbose) doc_pairs = find_document_pairs(input_dir, args.verbose)
print(f"Found {len(doc_pairs)} valid single-page document pairs.") print(f"Found {len(doc_pairs)} document pairs (will check page count during processing).")
if not doc_pairs: if not doc_pairs:
print("No document pairs found.") print("No document pairs found.")
@@ -301,6 +310,7 @@ def main():
# Process documents in batches # Process documents in batches
successful = 0 successful = 0
failed = 0 failed = 0
skipped_multi_page = 0
with ThreadPoolExecutor(max_workers=args.batch_size) as executor: with ThreadPoolExecutor(max_workers=args.batch_size) as executor:
futures = [] futures = []
@@ -324,7 +334,10 @@ def main():
if success: if success:
successful += 1 successful += 1
else: else:
failed += 1 if "multi-page" in message.lower():
skipped_multi_page += 1
else:
failed += 1
if args.verbose: if args.verbose:
tqdm.write(message) tqdm.write(message)
@@ -332,13 +345,15 @@ def main():
pbar.update(1) pbar.update(1)
pbar.set_postfix({ pbar.set_postfix({
'successful': successful, 'successful': successful,
'skipped': skipped_multi_page,
'failed': failed 'failed': failed
}) })
# Print summary # Print summary
print(f"\nProcessing complete:") print(f"\nProcessing complete:")
print(f" Successful: {successful}") print(f" Successful: {successful}")
print(f" Failed: {failed}") print(f" Skipped (multi-page): {skipped_multi_page}")
print(f" Failed (other errors): {failed}")
print(f" Output directory: {output_dir}") print(f" Output directory: {output_dir}")