This commit is contained in:
Jake Poznanski 2025-04-30 21:18:59 +00:00
parent 8ef7e56c86
commit 472ee108d7
2 changed files with 66 additions and 77 deletions

View File

@@ -1,15 +1,16 @@
#!/usr/bin/env python3
import argparse
import json
import os
import argparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List, Any, Optional
from typing import Any, Dict
import openai
from tqdm import tqdm
from olmocr.data.renderpdf import render_pdf_to_base64png
def process_test_case(case: Dict[str, Any], client, pdf_dir: str, model: str = "gpt-4o") -> Dict[str, Any]:
"""
Send a request to GPT-4 asking if the before and after text appear in the same region.
@@ -41,20 +42,19 @@ def process_test_case(case: Dict[str, Any], client, pdf_dir: str, model: str = "
"content": [
{
"type": "text",
"text": (f"Does the text in the 'before' field and the 'after' field appear in the same region of the page? "
"text": (
f"Does the text in the 'before' field and the 'after' field appear in the same region of the page? "
f"Look at the PDF image and determine if these texts are located near each other or in completely "
f"different parts of the page. Different regions could be the captions for different images, or inside of different insets or tables. However, appearing the same column of text, or in the naturally flowing next column of text is close enough.\n\n"
f"Before: {before_text}\n\n"
f"After: {after_text}\n\n"
f"Respond with 'YES' if they appear in the same region or column, and 'NO' if they appear in "
f"different regions. Then explain your reasoning in 1-2 sentences.")
f"different regions. Then explain your reasoning in 1-2 sentences."
),
},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
],
},
{
"type": "image_url",
"image_url": {"url": f"data:image/png;base64,{image_base64}"}
}
]
}
]
# Call the API
@@ -77,8 +77,8 @@ def process_test_case(case: Dict[str, Any], client, pdf_dir: str, model: str = "
print(f"Error processing {case.get('id', 'unknown')}: {str(e)}")
return case_with_response
def process_jsonl_file(input_file: str, output_file: str, api_key: str, pdf_dir: str,
num_workers: int = 8, model: str = "gpt-4o") -> None:
def process_jsonl_file(input_file: str, output_file: str, api_key: str, pdf_dir: str, num_workers: int = 8, model: str = "gpt-4o") -> None:
"""
Process each line in the JSONL file by sending requests to GPT-4 in parallel.
@@ -91,7 +91,7 @@ def process_jsonl_file(input_file: str, output_file: str, api_key: str, pdf_dir:
model: The model to use
"""
# Read all test cases from the input file
with open(input_file, 'r') as f:
with open(input_file, "r") as f:
lines = f.readlines()
# Parse each line to get test cases
@@ -100,7 +100,6 @@ def process_jsonl_file(input_file: str, output_file: str, api_key: str, pdf_dir:
if line.strip():
test_cases.append(json.loads(line))
# Initialize OpenAI client
client = openai.OpenAI(api_key=api_key)
@@ -123,31 +122,24 @@ def process_jsonl_file(input_file: str, output_file: str, api_key: str, pdf_dir:
results.append(case)
# Filter for cases where GPT-4 responded with "NO"
no_responses = [result for result in results
if "gpt4_response" in result
and result["gpt4_response"].startswith("NO")]
no_responses = [result for result in results if "gpt4_response" in result and result["gpt4_response"].startswith("NO")]
# Write filtered results to output file
with open(output_file, 'w') as f:
with open(output_file, "w") as f:
for result in no_responses:
f.write(json.dumps(result) + '\n')
f.write(json.dumps(result) + "\n")
print(f"Processed {len(results)} test cases. Found {len(no_responses)} cases with 'NO' responses. Results written to {output_file}")
def main():
parser = argparse.ArgumentParser(description="Process multi_column.jsonl with GPT-4 to check text regions")
parser.add_argument("--input", default="/home/ubuntu/olmocr/olmOCR-bench/bench_data/multi_column.jsonl",
help="Path to input JSONL file")
parser.add_argument("--output", default="/home/ubuntu/olmocr/olmOCR-bench/bench_data/multi_column_gpt4_regions.jsonl",
help="Path to output JSONL file")
parser.add_argument("--pdf-dir", default="/home/ubuntu/olmocr/olmOCR-bench/bench_data/pdfs",
help="Directory containing the PDF files")
parser.add_argument("--workers", type=int, default=8,
help="Number of parallel workers")
parser.add_argument("--model", default="gpt-4.1",
help="OpenAI model to use")
parser.add_argument("--api-key",
help="OpenAI API key (if not provided, uses OPENAI_API_KEY env var)")
parser.add_argument("--input", default="/home/ubuntu/olmocr/olmOCR-bench/bench_data/multi_column.jsonl", help="Path to input JSONL file")
parser.add_argument("--output", default="/home/ubuntu/olmocr/olmOCR-bench/bench_data/multi_column_gpt4_regions.jsonl", help="Path to output JSONL file")
parser.add_argument("--pdf-dir", default="/home/ubuntu/olmocr/olmOCR-bench/bench_data/pdfs", help="Directory containing the PDF files")
parser.add_argument("--workers", type=int, default=8, help="Number of parallel workers")
parser.add_argument("--model", default="gpt-4.1", help="OpenAI model to use")
parser.add_argument("--api-key", help="OpenAI API key (if not provided, uses OPENAI_API_KEY env var)")
args = parser.parse_args()
@@ -160,14 +152,8 @@ def main():
if not os.path.isdir(args.pdf_dir):
raise ValueError(f"PDF directory {args.pdf_dir} does not exist")
process_jsonl_file(
input_file=args.input,
output_file=args.output,
api_key=api_key,
pdf_dir=args.pdf_dir,
num_workers=args.workers,
model=args.model
)
process_jsonl_file(input_file=args.input, output_file=args.output, api_key=api_key, pdf_dir=args.pdf_dir, num_workers=args.workers, model=args.model)
if __name__ == "__main__":
main()
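
Note: the hunks above are mostly a formatting pass (imports sorted, long call sites and string literals collapsed onto single lines) over a script that asks GPT-4 whether a 'before'/'after' text pair sits in the same region of a rendered PDF page. The executor loop that fans the test cases out to the worker pool is elided from these hunks; the following is a minimal sketch of that pattern under the assumption that it mirrors the surrounding code. The names process_one, run, and cases are hypothetical stand-ins, not the file's own.

import json
from concurrent.futures import ThreadPoolExecutor, as_completed

from tqdm import tqdm


def process_one(case: dict) -> dict:
    # Stand-in for process_test_case(); the real function renders the PDF page
    # and calls the OpenAI API, storing the reply under "gpt4_response".
    case["gpt4_response"] = "NO - example reasoning"
    return case


def run(cases: list, output_file: str, num_workers: int = 8) -> None:
    results = []
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        # Submit every test case, then collect results as they finish.
        futures = {executor.submit(process_one, case): case for case in cases}
        for future in tqdm(as_completed(futures), total=len(futures)):
            results.append(future.result())

    # Keep only the cases the model judged to be in different regions.
    no_responses = [r for r in results if r.get("gpt4_response", "").startswith("NO")]
    with open(output_file, "w") as f:
        for result in no_responses:
            f.write(json.dumps(result) + "\n")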

View File

@@ -75,6 +75,7 @@ class PIIClassification(BaseModel):
is_resume_cv: Optional[bool] = Field(..., description="True if the document is a page from a resume or cv")
contains_pii: Optional[bool] = Field(..., description="True if document contains PII")
class RichPIIClassification(BaseModel):
primary_language: str = Field(..., description="Primary language as a two-letter code")
document_type: str = Field(..., description="Basic summary of document type classification")
@@ -105,7 +106,14 @@ class RichPIIClassification(BaseModel):
return True
if self.contains_identifier_name or self.contains_identifier_email or self.contains_identifier_phone_number:
return self.contains_identifier_with_address or self.contains_identifier_with_biographical_info or self.contains_identifier_with_location_info or self.contains_identifier_with_employment_info or self.contains_identifier_with_education_info or self.contains_identifier_with_medical_info
return (
self.contains_identifier_with_address
or self.contains_identifier_with_biographical_info
or self.contains_identifier_with_location_info
or self.contains_identifier_with_employment_info
or self.contains_identifier_with_education_info
or self.contains_identifier_with_medical_info
)
else:
return False
@@ -114,8 +122,6 @@ async def _process_single_page(page_text: str) -> RichPIIClassification:
"""Helper function to process a single document or page."""
text = page_text
basic_prompt = "Given the text above, determine what type of document it is, and if it's a resume/CV. answer in JSON. The format of your json object should be {'primary_language': str, 'document_type': str, 'is_resume_cv': bool, 'contains_pii': bool}"
rich_prompt = """You are a document analyzer that identifies Personally Identifiable Information (PII) in documents.
Your task is to analyze the provided document image and determine:
1. Whether the document is intended for public release or dissemination (e.g., research paper, public report, etc.)
@@ -157,17 +163,14 @@ Only consider actual occurrences of the PII within the document shown."""
"content": [
{
"type": "text",
"text": (
f"{text}\n\n-----------\n"
f"{rich_prompt}"
),
"text": (f"{text}\n\n-----------\n" f"{rich_prompt}"),
}
],
}
],
"max_tokens": 100,
"temperature": 0.0,
"response_format": {"type": "json_schema", "json_schema": {"name": "PIIClassification", "schema": RichRIIClassification.model_json_schema()}},
"response_format": {"type": "json_schema", "json_schema": {"name": "PIIClassification", "schema": RichPIIClassification.model_json_schema()}},
}
url = f"http://localhost:{SERVER_PORT}/v1/chat/completions"