Doing some cleaning

This commit is contained in:
Jake Poznanski 2025-09-03 18:41:36 +00:00
parent 94d19c51c6
commit f20f1a0b54

View File

@ -98,10 +98,9 @@ def check_single_page_pdf(pdf_path: Path) -> bool:
def find_document_pairs(input_dir: Path, verbose: bool = False) -> List[DocumentPair]:
"""Find all MD files with corresponding single-page PDF files."""
"""Find all MD files with corresponding PDF files."""
pairs = []
skipped_no_pdf = 0
skipped_multi_page = 0
for md_path in input_dir.rglob("*.md"):
# Check for corresponding PDF
@ -112,18 +111,11 @@ def find_document_pairs(input_dir: Path, verbose: bool = False) -> List[Document
skipped_no_pdf += 1
continue
# Check if PDF has exactly one page
if not check_single_page_pdf(pdf_path):
if verbose:
print(f"Warning: Skipping multi-page PDF {pdf_path}")
skipped_multi_page += 1
continue
relative_path = md_path.relative_to(input_dir)
pairs.append(DocumentPair(md_path, pdf_path, relative_path))
if skipped_no_pdf > 0 or skipped_multi_page > 0:
print(f"Skipped {skipped_no_pdf} files without PDFs and {skipped_multi_page} multi-page PDFs")
if skipped_no_pdf > 0:
print(f"Skipped {skipped_no_pdf} files without PDFs")
return pairs
@ -159,12 +151,13 @@ def clean_document_with_chatgpt(
"You are an expert at cleaning and correcting OCR transcriptions. "
"You will be given an OCR transcription and an image of the original PDF page. "
"Your task is to:\n"
"1. Fix OCR errors and typos\n"
"2. Correct formatting issues\n"
"3. Restore proper punctuation and capitalization\n"
"1. Correct formatting issues.\n"
"2. Preserve the exact spelling of words from the original document.\n"
"3. Remove any original transcriber's marks and notes, usually indicated by [ and ] symbols.\n"
"4. Fix word breaks and line breaks\n"
"5. Ensure mathematical formulas and special characters are correct\n"
"6. Maintain the semantic structure of the document\n"
"7. Remove any headers or footers that are not semantically relevant to the main document contents, ex page numbers\n"
"Return a cleaned version that accurately represents the original document."
)
}
@ -196,7 +189,7 @@ def clean_document_with_chatgpt(
messages=messages, # type: ignore
response_format=CleanedDocument,
temperature=0.2, # Lower temperature for more consistent cleaning
max_tokens=16384
max_tokens=32000,
)
parsed_result = response.choices[0].message.parsed
@ -224,6 +217,10 @@ def process_document(
return True, f"Skipped (already exists): {doc_pair.relative_path}"
try:
# Check if PDF has exactly one page
if not check_single_page_pdf(doc_pair.pdf_path):
return False, f"Skipped multi-page PDF: {doc_pair.pdf_path}"
# Read the markdown content
md_content = doc_pair.md_path.read_text(encoding='utf-8')
@ -241,6 +238,18 @@ def process_document(
# Write cleaned text
output_path.write_text(cleaned_result.cleaned_text, encoding='utf-8')
# Create soft link for the original MD file as .md.orig
orig_md_link_path = output_path.with_suffix('.md.orig')
if orig_md_link_path.exists() or orig_md_link_path.is_symlink():
orig_md_link_path.unlink()
orig_md_link_path.symlink_to(doc_pair.md_path.absolute())
# Create soft link for the PDF file
pdf_link_path = output_dir / doc_pair.relative_path.with_suffix('.pdf')
if pdf_link_path.exists() or pdf_link_path.is_symlink():
pdf_link_path.unlink()
pdf_link_path.symlink_to(doc_pair.pdf_path.absolute())
# Also write metadata
metadata_path = output_path.with_suffix('.json')
metadata = {
@ -280,10 +289,10 @@ def main():
output_dir.mkdir(parents=True, exist_ok=True)
# Find all document pairs (single-page PDFs only)
print(f"Scanning {input_dir} for single-page document pairs...")
# Find all document pairs
print(f"Scanning {input_dir} for document pairs...")
doc_pairs = find_document_pairs(input_dir, args.verbose)
print(f"Found {len(doc_pairs)} valid single-page document pairs.")
print(f"Found {len(doc_pairs)} document pairs (will check page count during processing).")
if not doc_pairs:
print("No document pairs found.")
@ -301,6 +310,7 @@ def main():
# Process documents in batches
successful = 0
failed = 0
skipped_multi_page = 0
with ThreadPoolExecutor(max_workers=args.batch_size) as executor:
futures = []
@ -324,7 +334,10 @@ def main():
if success:
successful += 1
else:
failed += 1
if "multi-page" in message.lower():
skipped_multi_page += 1
else:
failed += 1
if args.verbose:
tqdm.write(message)
@ -332,13 +345,15 @@ def main():
pbar.update(1)
pbar.set_postfix({
'successful': successful,
'skipped': skipped_multi_page,
'failed': failed
})
# Print summary
print(f"\nProcessing complete:")
print(f" Successful: {successful}")
print(f" Failed: {failed}")
print(f" Skipped (multi-page): {skipped_multi_page}")
print(f" Failed (other errors): {failed}")
print(f" Output directory: {output_dir}")