Mirror of https://github.com/allenai/olmocr.git (synced 2025-10-11 16:22:29 +00:00)

Commit f20f1a0b54 ("Doing some cleaning"), parent 94d19c51c6.

The diff below moves the single-page PDF check out of pair discovery (find_document_pairs) and into per-document processing (process_document), revises the ChatGPT cleaning prompt, raises max_tokens, symlinks the source .md and .pdf next to each cleaned output, and reports multi-page skips separately from failures.
@@ -98,10 +98,9 @@ def check_single_page_pdf(pdf_path: Path) -> bool:


 def find_document_pairs(input_dir: Path, verbose: bool = False) -> List[DocumentPair]:
-    """Find all MD files with corresponding single-page PDF files."""
+    """Find all MD files with corresponding PDF files."""
     pairs = []
     skipped_no_pdf = 0
-    skipped_multi_page = 0

     for md_path in input_dir.rglob("*.md"):
         # Check for corresponding PDF
@@ -112,18 +111,11 @@ def find_document_pairs(input_dir: Path, verbose: bool = False) -> List[Document
             skipped_no_pdf += 1
             continue

-        # Check if PDF has exactly one page
-        if not check_single_page_pdf(pdf_path):
-            if verbose:
-                print(f"Warning: Skipping multi-page PDF {pdf_path}")
-            skipped_multi_page += 1
-            continue
-
         relative_path = md_path.relative_to(input_dir)
         pairs.append(DocumentPair(md_path, pdf_path, relative_path))

-    if skipped_no_pdf > 0 or skipped_multi_page > 0:
-        print(f"Skipped {skipped_no_pdf} files without PDFs and {skipped_multi_page} multi-page PDFs")
+    if skipped_no_pdf > 0:
+        print(f"Skipped {skipped_no_pdf} files without PDFs")

     return pairs

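With the page-count filter gone, find_document_pairs reduces to a pure name-pairing pass. A minimal sketch of the resulting function, assuming the elided `# Check for corresponding PDF` lines look up a sibling file with the same stem:

```python
from dataclasses import dataclass
from pathlib import Path
from typing import List


@dataclass
class DocumentPair:
    md_path: Path
    pdf_path: Path
    relative_path: Path


def find_document_pairs(input_dir: Path, verbose: bool = False) -> List[DocumentPair]:
    """Find all MD files with corresponding PDF files."""
    pairs = []
    skipped_no_pdf = 0

    for md_path in input_dir.rglob("*.md"):
        # Check for corresponding PDF (assumption: same name, .pdf suffix)
        pdf_path = md_path.with_suffix(".pdf")
        if not pdf_path.exists():
            skipped_no_pdf += 1
            continue

        relative_path = md_path.relative_to(input_dir)
        pairs.append(DocumentPair(md_path, pdf_path, relative_path))

    if skipped_no_pdf > 0:
        print(f"Skipped {skipped_no_pdf} files without PDFs")

    return pairs
```

Note that after this hunk the `verbose` flag is no longer used inside the function; its only consumer was the multi-page warning that just moved to process_document.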
@@ -159,12 +151,13 @@ def clean_document_with_chatgpt(
                 "You are an expert at cleaning and correcting OCR transcriptions. "
                 "You will be given an OCR transcription and an image of the original PDF page. "
                 "Your task is to:\n"
-                "1. Fix OCR errors and typos\n"
-                "2. Correct formatting issues\n"
-                "3. Restore proper punctuation and capitalization\n"
+                "1. Correct formatting issues.\n"
+                "2. Preserve the exact spelling of words from the original document.\n"
+                "3. Remove any original transcriber's marks and notes, usually indicated by [ and ] symbols.\n"
                 "4. Fix word breaks and line breaks\n"
                 "5. Ensure mathematical formulas and special characters are correct\n"
                 "6. Maintain the semantic structure of the document\n"
+                "7. Remove any headers or footers that are not semantically relevant to the main document contents, ex page numbers\n"
                 "Return a cleaned version that accurately represents the original document."
             )
         }
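The hunk shows only the system prompt; the user message that actually delivers the transcription and the page image is elided. Since the prompt promises "an image of the original PDF page", it presumably uses the OpenAI Chat Completions vision format, roughly like this sketch, in which `build_user_message` and the pre-rendered `page_png` bytes are hypothetical:

```python
import base64


def build_user_message(md_content: str, page_png: bytes) -> dict:
    # page_png: a rasterized render of the single PDF page (helper not shown in the diff).
    b64 = base64.b64encode(page_png).decode("ascii")
    return {
        "role": "user",
        "content": [
            {"type": "text", "text": f"OCR transcription:\n\n{md_content}"},
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{b64}"},
            },
        ],
    }
```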
@@ -196,7 +189,7 @@ def clean_document_with_chatgpt(
         messages=messages,  # type: ignore
         response_format=CleanedDocument,
         temperature=0.2,  # Lower temperature for more consistent cleaning
-        max_tokens=16384
+        max_tokens=32000,
     )

     parsed_result = response.choices[0].message.parsed
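`response_format=CleanedDocument` together with `.message.parsed` points at the OpenAI SDK's structured-outputs parse helper backed by a Pydantic model. A minimal sketch under those assumptions; `cleaned_text` is the one field the diff confirms (it is written to `output_path` below), and the model name here is a placeholder:

```python
from openai import OpenAI
from pydantic import BaseModel


class CleanedDocument(BaseModel):
    cleaned_text: str  # consumed later in this diff as cleaned_result.cleaned_text


client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment
messages = [
    {"role": "system", "content": "You are an expert at cleaning OCR transcriptions."},
    {"role": "user", "content": "Fix this: Teh quick brwn fox."},
]
response = client.beta.chat.completions.parse(
    model="gpt-4o",  # placeholder: the script's actual model is not shown in this hunk
    messages=messages,  # type: ignore
    response_format=CleanedDocument,
    temperature=0.2,
    max_tokens=16384,  # the diff raises this to 32000; output caps vary by model
)
parsed_result = response.choices[0].message.parsed  # CleanedDocument | None
```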
@@ -224,6 +217,10 @@ def process_document(
         return True, f"Skipped (already exists): {doc_pair.relative_path}"

     try:
+        # Check if PDF has exactly one page
+        if not check_single_page_pdf(doc_pair.pdf_path):
+            return False, f"Skipped multi-page PDF: {doc_pair.pdf_path}"
+
         # Read the markdown content
         md_content = doc_pair.md_path.read_text(encoding='utf-8')

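`check_single_page_pdf` itself is not shown in this diff. A plausible implementation, assuming pypdf (or any PDF library exposing a page count) is available:

```python
from pathlib import Path

from pypdf import PdfReader  # assumption: the script may use a different PDF library


def check_single_page_pdf(pdf_path: Path) -> bool:
    """Return True if the PDF opens cleanly and contains exactly one page."""
    try:
        return len(PdfReader(pdf_path).pages) == 1
    except Exception:
        # Treat unreadable PDFs as non-matching so they get skipped, not crash the worker.
        return False
```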
@@ -241,6 +238,18 @@ def process_document(
         # Write cleaned text
         output_path.write_text(cleaned_result.cleaned_text, encoding='utf-8')

+        # Create soft link for the original MD file as .md.orig
+        orig_md_link_path = output_path.with_suffix('.md.orig')
+        if orig_md_link_path.exists() or orig_md_link_path.is_symlink():
+            orig_md_link_path.unlink()
+        orig_md_link_path.symlink_to(doc_pair.md_path.absolute())
+
+        # Create soft link for the PDF file
+        pdf_link_path = output_dir / doc_pair.relative_path.with_suffix('.pdf')
+        if pdf_link_path.exists() or pdf_link_path.is_symlink():
+            pdf_link_path.unlink()
+        pdf_link_path.symlink_to(doc_pair.pdf_path.absolute())
+
         # Also write metadata
         metadata_path = output_path.with_suffix('.json')
         metadata = {
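The paired `exists() or is_symlink()` guard is what makes these links safe to recreate on re-runs: `Path.exists()` follows symlinks and returns False for a dangling link, so checking it alone would let `symlink_to()` raise `FileExistsError`. A self-contained demonstration (temporary paths are illustrative; symlink creation assumes a POSIX filesystem):

```python
import tempfile
from pathlib import Path

tmp = Path(tempfile.mkdtemp())
target = tmp / "target.md"
target.write_text("original")

link = tmp / "out.md.orig"
link.symlink_to(target)

target.unlink()           # make the link dangle
print(link.exists())      # False: exists() follows the link to the missing target
print(link.is_symlink())  # True: the link itself is still on disk

# Without the is_symlink() check, symlink_to() would raise FileExistsError here.
if link.exists() or link.is_symlink():
    link.unlink()
link.symlink_to(target)   # safe to recreate
```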
@@ -280,10 +289,10 @@ def main():

     output_dir.mkdir(parents=True, exist_ok=True)

-    # Find all document pairs (single-page PDFs only)
-    print(f"Scanning {input_dir} for single-page document pairs...")
+    # Find all document pairs
+    print(f"Scanning {input_dir} for document pairs...")
     doc_pairs = find_document_pairs(input_dir, args.verbose)
-    print(f"Found {len(doc_pairs)} valid single-page document pairs.")
+    print(f"Found {len(doc_pairs)} document pairs (will check page count during processing).")

     if not doc_pairs:
         print("No document pairs found.")
@@ -301,6 +310,7 @@ def main():
     # Process documents in batches
     successful = 0
     failed = 0
+    skipped_multi_page = 0

     with ThreadPoolExecutor(max_workers=args.batch_size) as executor:
         futures = []
@@ -324,7 +334,10 @@ def main():
                 if success:
                     successful += 1
                 else:
-                    failed += 1
+                    if "multi-page" in message.lower():
+                        skipped_multi_page += 1
+                    else:
+                        failed += 1

                 if args.verbose:
                     tqdm.write(message)
@@ -332,13 +345,15 @@ def main():
                 pbar.update(1)
                 pbar.set_postfix({
                     'successful': successful,
+                    'skipped': skipped_multi_page,
                     'failed': failed
                 })

     # Print summary
     print(f"\nProcessing complete:")
     print(f"  Successful: {successful}")
-    print(f"  Failed: {failed}")
+    print(f"  Skipped (multi-page): {skipped_multi_page}")
+    print(f"  Failed (other errors): {failed}")
     print(f"  Output directory: {output_dir}")

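The submit loop between the counter hunk and the result handling is elided from the diff, but the names involved (`futures`, `pbar`, `tqdm.write`, `set_postfix`) imply the standard submit/`as_completed` pattern. A self-contained sketch of the whole loop with a stub worker; the real `process_document` arguments beyond the pair are not shown and the skip/fail split uses the string match from the hunk above, which keys off the "Skipped multi-page PDF: ..." message that process_document returns:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Tuple

from tqdm import tqdm


def process_document(doc_pair: str) -> Tuple[bool, str]:
    # Stub standing in for the real worker; the (success, message) shape is from the diff.
    if doc_pair.endswith("multi.md"):
        return False, f"Skipped multi-page PDF: {doc_pair}"
    return True, f"Cleaned: {doc_pair}"


doc_pairs = ["a.md", "b.md", "multi.md"]  # stand-ins for DocumentPair objects
batch_size = 4                            # stands in for args.batch_size

successful = failed = skipped_multi_page = 0

with ThreadPoolExecutor(max_workers=batch_size) as executor:
    futures = [executor.submit(process_document, p) for p in doc_pairs]
    with tqdm(total=len(futures)) as pbar:
        for future in as_completed(futures):
            success, message = future.result()
            if success:
                successful += 1
            elif "multi-page" in message.lower():
                skipped_multi_page += 1
            else:
                failed += 1
            pbar.update(1)
            pbar.set_postfix({"successful": successful,
                              "skipped": skipped_multi_page,
                              "failed": failed})
```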