mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-11 16:22:29 +00:00
Lints
This commit is contained in:
parent
8ef7e56c86
commit
472ee108d7
@ -1,15 +1,16 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
import argparse
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import argparse
|
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from typing import Dict, List, Any, Optional
|
from typing import Any, Dict
|
||||||
|
|
||||||
import openai
|
import openai
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from olmocr.data.renderpdf import render_pdf_to_base64png
|
from olmocr.data.renderpdf import render_pdf_to_base64png
|
||||||
|
|
||||||
|
|
||||||
def process_test_case(case: Dict[str, Any], client, pdf_dir: str, model: str = "gpt-4o") -> Dict[str, Any]:
|
def process_test_case(case: Dict[str, Any], client, pdf_dir: str, model: str = "gpt-4o") -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Send a request to GPT-4 asking if the before and after text appear in the same region.
|
Send a request to GPT-4 asking if the before and after text appear in the same region.
|
||||||
@ -41,20 +42,19 @@ def process_test_case(case: Dict[str, Any], client, pdf_dir: str, model: str = "
|
|||||||
"content": [
|
"content": [
|
||||||
{
|
{
|
||||||
"type": "text",
|
"type": "text",
|
||||||
"text": (f"Does the text in the 'before' field and the 'after' field appear in the same region of the page? "
|
"text": (
|
||||||
f"Look at the PDF image and determine if these texts are located near each other or in completely "
|
f"Does the text in the 'before' field and the 'after' field appear in the same region of the page? "
|
||||||
f"different parts of the page. Different regions could be the captions for different images, or inside of different insets or tables. However, appearing the same column of text, or in the naturally flowing next column of text is close enough.\n\n"
|
f"Look at the PDF image and determine if these texts are located near each other or in completely "
|
||||||
f"Before: {before_text}\n\n"
|
f"different parts of the page. Different regions could be the captions for different images, or inside of different insets or tables. However, appearing the same column of text, or in the naturally flowing next column of text is close enough.\n\n"
|
||||||
f"After: {after_text}\n\n"
|
f"Before: {before_text}\n\n"
|
||||||
f"Respond with 'YES' if they appear in the same region or column, and 'NO' if they appear in "
|
f"After: {after_text}\n\n"
|
||||||
f"different regions. Then explain your reasoning in 1-2 sentences.")
|
f"Respond with 'YES' if they appear in the same region or column, and 'NO' if they appear in "
|
||||||
|
f"different regions. Then explain your reasoning in 1-2 sentences."
|
||||||
|
),
|
||||||
},
|
},
|
||||||
{
|
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
|
||||||
"type": "image_url",
|
],
|
||||||
"image_url": {"url": f"data:image/png;base64,{image_base64}"}
|
},
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
]
|
]
|
||||||
|
|
||||||
# Call the API
|
# Call the API
|
||||||
@ -77,8 +77,8 @@ def process_test_case(case: Dict[str, Any], client, pdf_dir: str, model: str = "
|
|||||||
print(f"Error processing {case.get('id', 'unknown')}: {str(e)}")
|
print(f"Error processing {case.get('id', 'unknown')}: {str(e)}")
|
||||||
return case_with_response
|
return case_with_response
|
||||||
|
|
||||||
def process_jsonl_file(input_file: str, output_file: str, api_key: str, pdf_dir: str,
|
|
||||||
num_workers: int = 8, model: str = "gpt-4o") -> None:
|
def process_jsonl_file(input_file: str, output_file: str, api_key: str, pdf_dir: str, num_workers: int = 8, model: str = "gpt-4o") -> None:
|
||||||
"""
|
"""
|
||||||
Process each line in the JSONL file by sending requests to GPT-4 in parallel.
|
Process each line in the JSONL file by sending requests to GPT-4 in parallel.
|
||||||
|
|
||||||
@ -91,7 +91,7 @@ def process_jsonl_file(input_file: str, output_file: str, api_key: str, pdf_dir:
|
|||||||
model: The model to use
|
model: The model to use
|
||||||
"""
|
"""
|
||||||
# Read all test cases from the input file
|
# Read all test cases from the input file
|
||||||
with open(input_file, 'r') as f:
|
with open(input_file, "r") as f:
|
||||||
lines = f.readlines()
|
lines = f.readlines()
|
||||||
|
|
||||||
# Parse each line to get test cases
|
# Parse each line to get test cases
|
||||||
@ -100,7 +100,6 @@ def process_jsonl_file(input_file: str, output_file: str, api_key: str, pdf_dir:
|
|||||||
if line.strip():
|
if line.strip():
|
||||||
test_cases.append(json.loads(line))
|
test_cases.append(json.loads(line))
|
||||||
|
|
||||||
|
|
||||||
# Initialize OpenAI client
|
# Initialize OpenAI client
|
||||||
client = openai.OpenAI(api_key=api_key)
|
client = openai.OpenAI(api_key=api_key)
|
||||||
|
|
||||||
@ -123,31 +122,24 @@ def process_jsonl_file(input_file: str, output_file: str, api_key: str, pdf_dir:
|
|||||||
results.append(case)
|
results.append(case)
|
||||||
|
|
||||||
# Filter for cases where GPT-4 responded with "NO"
|
# Filter for cases where GPT-4 responded with "NO"
|
||||||
no_responses = [result for result in results
|
no_responses = [result for result in results if "gpt4_response" in result and result["gpt4_response"].startswith("NO")]
|
||||||
if "gpt4_response" in result
|
|
||||||
and result["gpt4_response"].startswith("NO")]
|
|
||||||
|
|
||||||
# Write filtered results to output file
|
# Write filtered results to output file
|
||||||
with open(output_file, 'w') as f:
|
with open(output_file, "w") as f:
|
||||||
for result in no_responses:
|
for result in no_responses:
|
||||||
f.write(json.dumps(result) + '\n')
|
f.write(json.dumps(result) + "\n")
|
||||||
|
|
||||||
print(f"Processed {len(results)} test cases. Found {len(no_responses)} cases with 'NO' responses. Results written to {output_file}")
|
print(f"Processed {len(results)} test cases. Found {len(no_responses)} cases with 'NO' responses. Results written to {output_file}")
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(description="Process multi_column.jsonl with GPT-4 to check text regions")
|
parser = argparse.ArgumentParser(description="Process multi_column.jsonl with GPT-4 to check text regions")
|
||||||
parser.add_argument("--input", default="/home/ubuntu/olmocr/olmOCR-bench/bench_data/multi_column.jsonl",
|
parser.add_argument("--input", default="/home/ubuntu/olmocr/olmOCR-bench/bench_data/multi_column.jsonl", help="Path to input JSONL file")
|
||||||
help="Path to input JSONL file")
|
parser.add_argument("--output", default="/home/ubuntu/olmocr/olmOCR-bench/bench_data/multi_column_gpt4_regions.jsonl", help="Path to output JSONL file")
|
||||||
parser.add_argument("--output", default="/home/ubuntu/olmocr/olmOCR-bench/bench_data/multi_column_gpt4_regions.jsonl",
|
parser.add_argument("--pdf-dir", default="/home/ubuntu/olmocr/olmOCR-bench/bench_data/pdfs", help="Directory containing the PDF files")
|
||||||
help="Path to output JSONL file")
|
parser.add_argument("--workers", type=int, default=8, help="Number of parallel workers")
|
||||||
parser.add_argument("--pdf-dir", default="/home/ubuntu/olmocr/olmOCR-bench/bench_data/pdfs",
|
parser.add_argument("--model", default="gpt-4.1", help="OpenAI model to use")
|
||||||
help="Directory containing the PDF files")
|
parser.add_argument("--api-key", help="OpenAI API key (if not provided, uses OPENAI_API_KEY env var)")
|
||||||
parser.add_argument("--workers", type=int, default=8,
|
|
||||||
help="Number of parallel workers")
|
|
||||||
parser.add_argument("--model", default="gpt-4.1",
|
|
||||||
help="OpenAI model to use")
|
|
||||||
parser.add_argument("--api-key",
|
|
||||||
help="OpenAI API key (if not provided, uses OPENAI_API_KEY env var)")
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
@ -160,14 +152,8 @@ def main():
|
|||||||
if not os.path.isdir(args.pdf_dir):
|
if not os.path.isdir(args.pdf_dir):
|
||||||
raise ValueError(f"PDF directory {args.pdf_dir} does not exist")
|
raise ValueError(f"PDF directory {args.pdf_dir} does not exist")
|
||||||
|
|
||||||
process_jsonl_file(
|
process_jsonl_file(input_file=args.input, output_file=args.output, api_key=api_key, pdf_dir=args.pdf_dir, num_workers=args.workers, model=args.model)
|
||||||
input_file=args.input,
|
|
||||||
output_file=args.output,
|
|
||||||
api_key=api_key,
|
|
||||||
pdf_dir=args.pdf_dir,
|
|
||||||
num_workers=args.workers,
|
|
||||||
model=args.model
|
|
||||||
)
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
@ -75,6 +75,7 @@ class PIIClassification(BaseModel):
|
|||||||
is_resume_cv: Optional[bool] = Field(..., description="True if the document is a page from a resume or cv")
|
is_resume_cv: Optional[bool] = Field(..., description="True if the document is a page from a resume or cv")
|
||||||
contains_pii: Optional[bool] = Field(..., description="True if document contains PII")
|
contains_pii: Optional[bool] = Field(..., description="True if document contains PII")
|
||||||
|
|
||||||
|
|
||||||
class RichPIIClassification(BaseModel):
|
class RichPIIClassification(BaseModel):
|
||||||
primary_language: str = Field(..., description="Primary language as a two-letter code")
|
primary_language: str = Field(..., description="Primary language as a two-letter code")
|
||||||
document_type: str = Field(..., description="Basic summary of document type classification")
|
document_type: str = Field(..., description="Basic summary of document type classification")
|
||||||
@ -105,7 +106,14 @@ class RichPIIClassification(BaseModel):
|
|||||||
return True
|
return True
|
||||||
|
|
||||||
if self.contains_identifier_name or self.contains_identifier_email or self.contains_identifier_phone_number:
|
if self.contains_identifier_name or self.contains_identifier_email or self.contains_identifier_phone_number:
|
||||||
return self.contains_identifier_with_address or self.contains_identifier_with_biographical_info or self.contains_identifier_with_location_info or self.contains_identifier_with_employment_info or self.contains_identifier_with_education_info or self.contains_identifier_with_medical_info
|
return (
|
||||||
|
self.contains_identifier_with_address
|
||||||
|
or self.contains_identifier_with_biographical_info
|
||||||
|
or self.contains_identifier_with_location_info
|
||||||
|
or self.contains_identifier_with_employment_info
|
||||||
|
or self.contains_identifier_with_education_info
|
||||||
|
or self.contains_identifier_with_medical_info
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@ -114,8 +122,6 @@ async def _process_single_page(page_text: str) -> RichPIIClassification:
|
|||||||
"""Helper function to process a single document or page."""
|
"""Helper function to process a single document or page."""
|
||||||
text = page_text
|
text = page_text
|
||||||
|
|
||||||
basic_prompt = "Given the text above, determine what type of document it is, and if it's a resume/CV. answer in JSON. The format of your json object should be {'primary_language': str, 'document_type': str, 'is_resume_cv': bool, 'contains_pii': bool}"
|
|
||||||
|
|
||||||
rich_prompt = """You are a document analyzer that identifies Personally Identifiable Information (PII) in documents.
|
rich_prompt = """You are a document analyzer that identifies Personally Identifiable Information (PII) in documents.
|
||||||
Your task is to analyze the provided document image and determine:
|
Your task is to analyze the provided document image and determine:
|
||||||
1. Whether the document is intended for public release or dissemination (e.g., research paper, public report, etc.)
|
1. Whether the document is intended for public release or dissemination (e.g., research paper, public report, etc.)
|
||||||
@ -157,17 +163,14 @@ Only consider actual occurrences of the PII within the document shown."""
|
|||||||
"content": [
|
"content": [
|
||||||
{
|
{
|
||||||
"type": "text",
|
"type": "text",
|
||||||
"text": (
|
"text": (f"{text}\n\n-----------\n" f"{rich_prompt}"),
|
||||||
f"{text}\n\n-----------\n"
|
|
||||||
f"{rich_prompt}"
|
|
||||||
),
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"max_tokens": 100,
|
"max_tokens": 100,
|
||||||
"temperature": 0.0,
|
"temperature": 0.0,
|
||||||
"response_format": {"type": "json_schema", "json_schema": {"name": "PIIClassification", "schema": RichRIIClassification.model_json_schema()}},
|
"response_format": {"type": "json_schema", "json_schema": {"name": "PIIClassification", "schema": RichPIIClassification.model_json_schema()}},
|
||||||
}
|
}
|
||||||
|
|
||||||
url = f"http://localhost:{SERVER_PORT}/v1/chat/completions"
|
url = f"http://localhost:{SERVER_PORT}/v1/chat/completions"
|
||||||
|
Loading…
x
Reference in New Issue
Block a user