From f20f1a0b5488a8c99688b83bf98a218ad76b068a Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Wed, 3 Sep 2025 18:41:36 +0000 Subject: [PATCH] Doing some cleaning --- scripts/clean_olmocrmix.py | 55 ++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/scripts/clean_olmocrmix.py b/scripts/clean_olmocrmix.py index 17343c3..d403332 100755 --- a/scripts/clean_olmocrmix.py +++ b/scripts/clean_olmocrmix.py @@ -98,10 +98,9 @@ def check_single_page_pdf(pdf_path: Path) -> bool: def find_document_pairs(input_dir: Path, verbose: bool = False) -> List[DocumentPair]: - """Find all MD files with corresponding single-page PDF files.""" + """Find all MD files with corresponding PDF files.""" pairs = [] skipped_no_pdf = 0 - skipped_multi_page = 0 for md_path in input_dir.rglob("*.md"): # Check for corresponding PDF @@ -112,18 +111,11 @@ def find_document_pairs(input_dir: Path, verbose: bool = False) -> List[Document skipped_no_pdf += 1 continue - # Check if PDF has exactly one page - if not check_single_page_pdf(pdf_path): - if verbose: - print(f"Warning: Skipping multi-page PDF {pdf_path}") - skipped_multi_page += 1 - continue - relative_path = md_path.relative_to(input_dir) pairs.append(DocumentPair(md_path, pdf_path, relative_path)) - if skipped_no_pdf > 0 or skipped_multi_page > 0: - print(f"Skipped {skipped_no_pdf} files without PDFs and {skipped_multi_page} multi-page PDFs") + if skipped_no_pdf > 0: + print(f"Skipped {skipped_no_pdf} files without PDFs") return pairs @@ -159,12 +151,13 @@ def clean_document_with_chatgpt( "You are an expert at cleaning and correcting OCR transcriptions. " "You will be given an OCR transcription and an image of the original PDF page. " "Your task is to:\n" - "1. Fix OCR errors and typos\n" - "2. Correct formatting issues\n" - "3. Restore proper punctuation and capitalization\n" + "1. Correct formatting issues.\n" + "2. Preserve the exact spelling of words from the original document.\n" + "3. Remove any original transcriber's marks and notes, usually indicated by [ and ] symbols.\n" "4. Fix word breaks and line breaks\n" "5. Ensure mathematical formulas and special characters are correct\n" "6. Maintain the semantic structure of the document\n" + "7. Remove any headers or footers that are not semantically relevant to the main document contents, ex page numbers\n" "Return a cleaned version that accurately represents the original document." ) } @@ -196,7 +189,7 @@ def clean_document_with_chatgpt( messages=messages, # type: ignore response_format=CleanedDocument, temperature=0.2, # Lower temperature for more consistent cleaning - max_tokens=16384 + max_tokens=32000, ) parsed_result = response.choices[0].message.parsed @@ -224,6 +217,10 @@ def process_document( return True, f"Skipped (already exists): {doc_pair.relative_path}" try: + # Check if PDF has exactly one page + if not check_single_page_pdf(doc_pair.pdf_path): + return False, f"Skipped multi-page PDF: {doc_pair.pdf_path}" + # Read the markdown content md_content = doc_pair.md_path.read_text(encoding='utf-8') @@ -241,6 +238,18 @@ def process_document( # Write cleaned text output_path.write_text(cleaned_result.cleaned_text, encoding='utf-8') + # Create soft link for the original MD file as .md.orig + orig_md_link_path = output_path.with_suffix('.md.orig') + if orig_md_link_path.exists() or orig_md_link_path.is_symlink(): + orig_md_link_path.unlink() + orig_md_link_path.symlink_to(doc_pair.md_path.absolute()) + + # Create soft link for the PDF file + pdf_link_path = output_dir / doc_pair.relative_path.with_suffix('.pdf') + if pdf_link_path.exists() or pdf_link_path.is_symlink(): + pdf_link_path.unlink() + pdf_link_path.symlink_to(doc_pair.pdf_path.absolute()) + # Also write metadata metadata_path = output_path.with_suffix('.json') metadata = { @@ -280,10 +289,10 @@ def main(): output_dir.mkdir(parents=True, exist_ok=True) - # Find all document pairs (single-page PDFs only) - print(f"Scanning {input_dir} for single-page document pairs...") + # Find all document pairs + print(f"Scanning {input_dir} for document pairs...") doc_pairs = find_document_pairs(input_dir, args.verbose) - print(f"Found {len(doc_pairs)} valid single-page document pairs.") + print(f"Found {len(doc_pairs)} document pairs (will check page count during processing).") if not doc_pairs: print("No document pairs found.") @@ -301,6 +310,7 @@ def main(): # Process documents in batches successful = 0 failed = 0 + skipped_multi_page = 0 with ThreadPoolExecutor(max_workers=args.batch_size) as executor: futures = [] @@ -324,7 +334,10 @@ def main(): if success: successful += 1 else: - failed += 1 + if "multi-page" in message.lower(): + skipped_multi_page += 1 + else: + failed += 1 if args.verbose: tqdm.write(message) @@ -332,13 +345,15 @@ def main(): pbar.update(1) pbar.set_postfix({ 'successful': successful, + 'skipped': skipped_multi_page, 'failed': failed }) # Print summary print(f"\nProcessing complete:") print(f" Successful: {successful}") - print(f" Failed: {failed}") + print(f" Skipped (multi-page): {skipped_multi_page}") + print(f" Failed (other errors): {failed}") print(f" Output directory: {output_dir}")