This commit is contained in:
Jake Poznanski 2025-04-30 21:18:59 +00:00
parent 8ef7e56c86
commit 472ee108d7
2 changed files with 66 additions and 77 deletions

View File

@ -1,26 +1,27 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import argparse
import json import json
import os import os
import argparse
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List, Any, Optional from typing import Any, Dict
import openai import openai
from tqdm import tqdm from tqdm import tqdm
from olmocr.data.renderpdf import render_pdf_to_base64png from olmocr.data.renderpdf import render_pdf_to_base64png
def process_test_case(case: Dict[str, Any], client, pdf_dir: str, model: str = "gpt-4o") -> Dict[str, Any]: def process_test_case(case: Dict[str, Any], client, pdf_dir: str, model: str = "gpt-4o") -> Dict[str, Any]:
""" """
Send a request to GPT-4 asking if the before and after text appear in the same region. Send a request to GPT-4 asking if the before and after text appear in the same region.
Include the PDF image in the prompt. Include the PDF image in the prompt.
Args: Args:
case: A test case from the JSONL file case: A test case from the JSONL file
client: The OpenAI client client: The OpenAI client
pdf_dir: Directory containing PDF files pdf_dir: Directory containing PDF files
model: The model to use model: The model to use
Returns: Returns:
The original case with the added response field The original case with the added response field
""" """
@ -28,11 +29,11 @@ def process_test_case(case: Dict[str, Any], client, pdf_dir: str, model: str = "
after_text = case["after"] after_text = case["after"]
pdf_path = os.path.join(pdf_dir, case["pdf"]) pdf_path = os.path.join(pdf_dir, case["pdf"])
page_num = case["page"] page_num = case["page"]
try: try:
# Render the PDF page to a base64-encoded PNG image # Render the PDF page to a base64-encoded PNG image
image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num) image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num)
# Create messages with both text and image # Create messages with both text and image
messages = [ messages = [
{"role": "system", "content": "You are an AI assistant analyzing text from PDFs."}, {"role": "system", "content": "You are an AI assistant analyzing text from PDFs."},
@ -40,23 +41,22 @@ def process_test_case(case: Dict[str, Any], client, pdf_dir: str, model: str = "
"role": "user", "role": "user",
"content": [ "content": [
{ {
"type": "text", "type": "text",
"text": (f"Does the text in the 'before' field and the 'after' field appear in the same region of the page? " "text": (
f"Look at the PDF image and determine if these texts are located near each other or in completely " f"Does the text in the 'before' field and the 'after' field appear in the same region of the page? "
f"different parts of the page. Different regions could be the captions for different images, or inside of different insets or tables. However, appearing the same column of text, or in the naturally flowing next column of text is close enough.\n\n" f"Look at the PDF image and determine if these texts are located near each other or in completely "
f"Before: {before_text}\n\n" f"different parts of the page. Different regions could be the captions for different images, or inside of different insets or tables. However, appearing the same column of text, or in the naturally flowing next column of text is close enough.\n\n"
f"After: {after_text}\n\n" f"Before: {before_text}\n\n"
f"Respond with 'YES' if they appear in the same region or column, and 'NO' if they appear in " f"After: {after_text}\n\n"
f"different regions. Then explain your reasoning in 1-2 sentences.") f"Respond with 'YES' if they appear in the same region or column, and 'NO' if they appear in "
f"different regions. Then explain your reasoning in 1-2 sentences."
),
}, },
{ {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
"type": "image_url", ],
"image_url": {"url": f"data:image/png;base64,{image_base64}"} },
}
]
}
] ]
# Call the API # Call the API
response = client.chat.completions.create( response = client.chat.completions.create(
model=model, model=model,
@ -64,7 +64,7 @@ def process_test_case(case: Dict[str, Any], client, pdf_dir: str, model: str = "
temperature=0.0, temperature=0.0,
max_tokens=300, max_tokens=300,
) )
# Add GPT-4's response to the case # Add GPT-4's response to the case
case_with_response = case.copy() case_with_response = case.copy()
case_with_response["gpt4_response"] = response.choices[0].message.content case_with_response["gpt4_response"] = response.choices[0].message.content
@ -77,11 +77,11 @@ def process_test_case(case: Dict[str, Any], client, pdf_dir: str, model: str = "
print(f"Error processing {case.get('id', 'unknown')}: {str(e)}") print(f"Error processing {case.get('id', 'unknown')}: {str(e)}")
return case_with_response return case_with_response
def process_jsonl_file(input_file: str, output_file: str, api_key: str, pdf_dir: str,
num_workers: int = 8, model: str = "gpt-4o") -> None: def process_jsonl_file(input_file: str, output_file: str, api_key: str, pdf_dir: str, num_workers: int = 8, model: str = "gpt-4o") -> None:
""" """
Process each line in the JSONL file by sending requests to GPT-4 in parallel. Process each line in the JSONL file by sending requests to GPT-4 in parallel.
Args: Args:
input_file: Path to the input JSONL file input_file: Path to the input JSONL file
output_file: Path to write the output JSONL file with responses output_file: Path to write the output JSONL file with responses
@ -91,25 +91,24 @@ def process_jsonl_file(input_file: str, output_file: str, api_key: str, pdf_dir:
model: The model to use model: The model to use
""" """
# Read all test cases from the input file # Read all test cases from the input file
with open(input_file, 'r') as f: with open(input_file, "r") as f:
lines = f.readlines() lines = f.readlines()
# Parse each line to get test cases # Parse each line to get test cases
test_cases = [] test_cases = []
for line in lines: for line in lines:
if line.strip(): if line.strip():
test_cases.append(json.loads(line)) test_cases.append(json.loads(line))
# Initialize OpenAI client # Initialize OpenAI client
client = openai.OpenAI(api_key=api_key) client = openai.OpenAI(api_key=api_key)
# Process test cases in parallel # Process test cases in parallel
results = [] results = []
with ThreadPoolExecutor(max_workers=num_workers) as executor: with ThreadPoolExecutor(max_workers=num_workers) as executor:
# Submit all tasks # Submit all tasks
future_to_case = {executor.submit(process_test_case, case, client, pdf_dir, model): case for case in test_cases} future_to_case = {executor.submit(process_test_case, case, client, pdf_dir, model): case for case in test_cases}
# Process results as they complete # Process results as they complete
for future in tqdm(as_completed(future_to_case), total=len(test_cases), desc="Processing test cases"): for future in tqdm(as_completed(future_to_case), total=len(test_cases), desc="Processing test cases"):
try: try:
@ -121,53 +120,40 @@ def process_jsonl_file(input_file: str, output_file: str, api_key: str, pdf_dir:
# Add failed case with error message # Add failed case with error message
case["gpt4_response"] = f"PROCESSING_ERROR: {str(e)}" case["gpt4_response"] = f"PROCESSING_ERROR: {str(e)}"
results.append(case) results.append(case)
# Filter for cases where GPT-4 responded with "NO" # Filter for cases where GPT-4 responded with "NO"
no_responses = [result for result in results no_responses = [result for result in results if "gpt4_response" in result and result["gpt4_response"].startswith("NO")]
if "gpt4_response" in result
and result["gpt4_response"].startswith("NO")]
# Write filtered results to output file # Write filtered results to output file
with open(output_file, 'w') as f: with open(output_file, "w") as f:
for result in no_responses: for result in no_responses:
f.write(json.dumps(result) + '\n') f.write(json.dumps(result) + "\n")
print(f"Processed {len(results)} test cases. Found {len(no_responses)} cases with 'NO' responses. Results written to {output_file}") print(f"Processed {len(results)} test cases. Found {len(no_responses)} cases with 'NO' responses. Results written to {output_file}")
def main():
    """CLI entry point: parse arguments, resolve the OpenAI API key, validate
    the PDF directory, and kick off the parallel JSONL processing run."""
    cli = argparse.ArgumentParser(description="Process multi_column.jsonl with GPT-4 to check text regions")
    cli.add_argument("--input", default="/home/ubuntu/olmocr/olmOCR-bench/bench_data/multi_column.jsonl", help="Path to input JSONL file")
    cli.add_argument("--output", default="/home/ubuntu/olmocr/olmOCR-bench/bench_data/multi_column_gpt4_regions.jsonl", help="Path to output JSONL file")
    cli.add_argument("--pdf-dir", default="/home/ubuntu/olmocr/olmOCR-bench/bench_data/pdfs", help="Directory containing the PDF files")
    cli.add_argument("--workers", type=int, default=8, help="Number of parallel workers")
    cli.add_argument("--model", default="gpt-4.1", help="OpenAI model to use")
    cli.add_argument("--api-key", help="OpenAI API key (if not provided, uses OPENAI_API_KEY env var)")
    opts = cli.parse_args()

    # CLI flag wins; fall back to the environment variable.
    resolved_key = opts.api_key or os.environ.get("OPENAI_API_KEY")
    if not resolved_key:
        raise ValueError("OpenAI API key must be provided either via --api-key or OPENAI_API_KEY environment variable")

    # Fail fast before spinning up any workers if the PDFs aren't there.
    if not os.path.isdir(opts.pdf_dir):
        raise ValueError(f"PDF directory {opts.pdf_dir} does not exist")

    process_jsonl_file(input_file=opts.input, output_file=opts.output, api_key=resolved_key, pdf_dir=opts.pdf_dir, num_workers=opts.workers, model=opts.model)


if __name__ == "__main__":
    main()

View File

@ -75,6 +75,7 @@ class PIIClassification(BaseModel):
is_resume_cv: Optional[bool] = Field(..., description="True if the document is a page from a resume or cv") is_resume_cv: Optional[bool] = Field(..., description="True if the document is a page from a resume or cv")
contains_pii: Optional[bool] = Field(..., description="True if document contains PII") contains_pii: Optional[bool] = Field(..., description="True if document contains PII")
class RichPIIClassification(BaseModel): class RichPIIClassification(BaseModel):
primary_language: str = Field(..., description="Primary language as a two-letter code") primary_language: str = Field(..., description="Primary language as a two-letter code")
document_type: str = Field(..., description="Basic summary of document type classification") document_type: str = Field(..., description="Basic summary of document type classification")
@ -100,12 +101,19 @@ class RichPIIClassification(BaseModel):
def contains_any_pii(self) -> bool: def contains_any_pii(self) -> bool:
if self.is_public_document: if self.is_public_document:
return False return False
if self.contains_pii_government_id or self.contains_pii_financial_info or self.contains_pii_biometric_data or self.contains_pii_login_info: if self.contains_pii_government_id or self.contains_pii_financial_info or self.contains_pii_biometric_data or self.contains_pii_login_info:
return True return True
if self.contains_identifier_name or self.contains_identifier_email or self.contains_identifier_phone_number: if self.contains_identifier_name or self.contains_identifier_email or self.contains_identifier_phone_number:
return self.contains_identifier_with_address or self.contains_identifier_with_biographical_info or self.contains_identifier_with_location_info or self.contains_identifier_with_employment_info or self.contains_identifier_with_education_info or self.contains_identifier_with_medical_info return (
self.contains_identifier_with_address
or self.contains_identifier_with_biographical_info
or self.contains_identifier_with_location_info
or self.contains_identifier_with_employment_info
or self.contains_identifier_with_education_info
or self.contains_identifier_with_medical_info
)
else: else:
return False return False
@ -114,8 +122,6 @@ async def _process_single_page(page_text: str) -> RichPIIClassification:
"""Helper function to process a single document or page.""" """Helper function to process a single document or page."""
text = page_text text = page_text
basic_prompt = "Given the text above, determine what type of document it is, and if it's a resume/CV. answer in JSON. The format of your json object should be {'primary_language': str, 'document_type': str, 'is_resume_cv': bool, 'contains_pii': bool}"
rich_prompt = """You are a document analyzer that identifies Personally Identifiable Information (PII) in documents. rich_prompt = """You are a document analyzer that identifies Personally Identifiable Information (PII) in documents.
Your task is to analyze the provided document image and determine: Your task is to analyze the provided document image and determine:
1. Whether the document is intended for public release or dissemination (e.g., research paper, public report, etc.) 1. Whether the document is intended for public release or dissemination (e.g., research paper, public report, etc.)
@ -157,17 +163,14 @@ Only consider actual occurrences of the PII within the document shown."""
"content": [ "content": [
{ {
"type": "text", "type": "text",
"text": ( "text": (f"{text}\n\n-----------\n" f"{rich_prompt}"),
f"{text}\n\n-----------\n"
f"{rich_prompt}"
),
} }
], ],
} }
], ],
"max_tokens": 100, "max_tokens": 100,
"temperature": 0.0, "temperature": 0.0,
"response_format": {"type": "json_schema", "json_schema": {"name": "PIIClassification", "schema": RichRIIClassification.model_json_schema()}}, "response_format": {"type": "json_schema", "json_schema": {"name": "PIIClassification", "schema": RichPIIClassification.model_json_schema()}},
} }
url = f"http://localhost:{SERVER_PORT}/v1/chat/completions" url = f"http://localhost:{SERVER_PORT}/v1/chat/completions"
@ -186,7 +189,7 @@ Only consider actual occurrences of the PII within the document shown."""
logger.warning(f"Server HTTP {status}: {body[:250]!r}") logger.warning(f"Server HTTP {status}: {body[:250]!r}")
metrics.add_metrics(server_errors=1) metrics.add_metrics(server_errors=1)
return RichPIIClassification(primary_language="en", document_type="unknown", is_public_document=False) return RichPIIClassification(primary_language="en", document_type="unknown", is_public_document=False)
# ---------- Parse base JSON -------------------------------------------- # ---------- Parse base JSON --------------------------------------------
try: try:
base = json.loads(body) base = json.loads(body)
@ -209,12 +212,12 @@ Only consider actual occurrences of the PII within the document shown."""
logger.warning(f"Missing fields in Server response: {e!s}") logger.warning(f"Missing fields in Server response: {e!s}")
metrics.add_metrics(server_errors=1) metrics.add_metrics(server_errors=1)
return RichPIIClassification(primary_language="en", document_type="unknown", is_public_document=False) return RichPIIClassification(primary_language="en", document_type="unknown", is_public_document=False)
if not isinstance(content, str): if not isinstance(content, str):
logger.warning("Server `content` is not a string; treating as error.") logger.warning("Server `content` is not a string; treating as error.")
metrics.add_metrics(server_errors=1) metrics.add_metrics(server_errors=1)
return RichPIIClassification(primary_language="en", document_type="unknown", is_public_document=False) return RichPIIClassification(primary_language="en", document_type="unknown", is_public_document=False)
try: try:
pii_classification: RichPIIClassification = RichPIIClassification.model_validate_json(content) pii_classification: RichPIIClassification = RichPIIClassification.model_validate_json(content)
return pii_classification return pii_classification
@ -222,7 +225,7 @@ Only consider actual occurrences of the PII within the document shown."""
logger.warning(f"Unable to parse pii classification object: {e!s}") logger.warning(f"Unable to parse pii classification object: {e!s}")
metrics.add_metrics(server_errors=1) metrics.add_metrics(server_errors=1)
return RichPIIClassification(primary_language="en", document_type="unknown", is_public_document=False) return RichPIIClassification(primary_language="en", document_type="unknown", is_public_document=False)
# Manual simple implementation of HTTP Post # Manual simple implementation of HTTP Post
# It feels strange perhaps, but httpx and aiohttp are very complex beasts # It feels strange perhaps, but httpx and aiohttp are very complex beasts