This commit is contained in:
Jake Poznanski 2025-04-30 21:18:59 +00:00
parent 8ef7e56c86
commit 472ee108d7
2 changed files with 66 additions and 77 deletions

View File

@ -1,26 +1,27 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import argparse
import json import json
import os import os
import argparse
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List, Any, Optional from typing import Any, Dict
import openai import openai
from tqdm import tqdm from tqdm import tqdm
from olmocr.data.renderpdf import render_pdf_to_base64png from olmocr.data.renderpdf import render_pdf_to_base64png
def process_test_case(case: Dict[str, Any], client, pdf_dir: str, model: str = "gpt-4o") -> Dict[str, Any]: def process_test_case(case: Dict[str, Any], client, pdf_dir: str, model: str = "gpt-4o") -> Dict[str, Any]:
""" """
Send a request to GPT-4 asking if the before and after text appear in the same region. Send a request to GPT-4 asking if the before and after text appear in the same region.
Include the PDF image in the prompt. Include the PDF image in the prompt.
Args: Args:
case: A test case from the JSONL file case: A test case from the JSONL file
client: The OpenAI client client: The OpenAI client
pdf_dir: Directory containing PDF files pdf_dir: Directory containing PDF files
model: The model to use model: The model to use
Returns: Returns:
The original case with the added response field The original case with the added response field
""" """
@ -28,11 +29,11 @@ def process_test_case(case: Dict[str, Any], client, pdf_dir: str, model: str = "
after_text = case["after"] after_text = case["after"]
pdf_path = os.path.join(pdf_dir, case["pdf"]) pdf_path = os.path.join(pdf_dir, case["pdf"])
page_num = case["page"] page_num = case["page"]
try: try:
# Render the PDF page to a base64-encoded PNG image # Render the PDF page to a base64-encoded PNG image
image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num) image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num)
# Create messages with both text and image # Create messages with both text and image
messages = [ messages = [
{"role": "system", "content": "You are an AI assistant analyzing text from PDFs."}, {"role": "system", "content": "You are an AI assistant analyzing text from PDFs."},
@ -40,23 +41,22 @@ def process_test_case(case: Dict[str, Any], client, pdf_dir: str, model: str = "
"role": "user", "role": "user",
"content": [ "content": [
{ {
"type": "text", "type": "text",
"text": (f"Does the text in the 'before' field and the 'after' field appear in the same region of the page? " "text": (
f"Look at the PDF image and determine if these texts are located near each other or in completely " f"Does the text in the 'before' field and the 'after' field appear in the same region of the page? "
f"different parts of the page. Different regions could be the captions for different images, or inside of different insets or tables. However, appearing the same column of text, or in the naturally flowing next column of text is close enough.\n\n" f"Look at the PDF image and determine if these texts are located near each other or in completely "
f"Before: {before_text}\n\n" f"different parts of the page. Different regions could be the captions for different images, or inside of different insets or tables. However, appearing the same column of text, or in the naturally flowing next column of text is close enough.\n\n"
f"After: {after_text}\n\n" f"Before: {before_text}\n\n"
f"Respond with 'YES' if they appear in the same region or column, and 'NO' if they appear in " f"After: {after_text}\n\n"
f"different regions. Then explain your reasoning in 1-2 sentences.") f"Respond with 'YES' if they appear in the same region or column, and 'NO' if they appear in "
f"different regions. Then explain your reasoning in 1-2 sentences."
),
}, },
{ {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
"type": "image_url", ],
"image_url": {"url": f"data:image/png;base64,{image_base64}"} },
}
]
}
] ]
# Call the API # Call the API
response = client.chat.completions.create( response = client.chat.completions.create(
model=model, model=model,
@ -64,7 +64,7 @@ def process_test_case(case: Dict[str, Any], client, pdf_dir: str, model: str = "
temperature=0.0, temperature=0.0,
max_tokens=300, max_tokens=300,
) )
# Add GPT-4's response to the case # Add GPT-4's response to the case
case_with_response = case.copy() case_with_response = case.copy()
case_with_response["gpt4_response"] = response.choices[0].message.content case_with_response["gpt4_response"] = response.choices[0].message.content
@ -77,11 +77,11 @@ def process_test_case(case: Dict[str, Any], client, pdf_dir: str, model: str = "
print(f"Error processing {case.get('id', 'unknown')}: {str(e)}") print(f"Error processing {case.get('id', 'unknown')}: {str(e)}")
return case_with_response return case_with_response
def process_jsonl_file(input_file: str, output_file: str, api_key: str, pdf_dir: str,
num_workers: int = 8, model: str = "gpt-4o") -> None: def process_jsonl_file(input_file: str, output_file: str, api_key: str, pdf_dir: str, num_workers: int = 8, model: str = "gpt-4o") -> None:
""" """
Process each line in the JSONL file by sending requests to GPT-4 in parallel. Process each line in the JSONL file by sending requests to GPT-4 in parallel.
Args: Args:
input_file: Path to the input JSONL file input_file: Path to the input JSONL file
output_file: Path to write the output JSONL file with responses output_file: Path to write the output JSONL file with responses
@ -91,25 +91,24 @@ def process_jsonl_file(input_file: str, output_file: str, api_key: str, pdf_dir:
model: The model to use model: The model to use
""" """
# Read all test cases from the input file # Read all test cases from the input file
with open(input_file, 'r') as f: with open(input_file, "r") as f:
lines = f.readlines() lines = f.readlines()
# Parse each line to get test cases # Parse each line to get test cases
test_cases = [] test_cases = []
for line in lines: for line in lines:
if line.strip(): if line.strip():
test_cases.append(json.loads(line)) test_cases.append(json.loads(line))
# Initialize OpenAI client # Initialize OpenAI client
client = openai.OpenAI(api_key=api_key) client = openai.OpenAI(api_key=api_key)
# Process test cases in parallel # Process test cases in parallel
results = [] results = []
with ThreadPoolExecutor(max_workers=num_workers) as executor: with ThreadPoolExecutor(max_workers=num_workers) as executor:
# Submit all tasks # Submit all tasks
future_to_case = {executor.submit(process_test_case, case, client, pdf_dir, model): case for case in test_cases} future_to_case = {executor.submit(process_test_case, case, client, pdf_dir, model): case for case in test_cases}
# Process results as they complete # Process results as they complete
for future in tqdm(as_completed(future_to_case), total=len(test_cases), desc="Processing test cases"): for future in tqdm(as_completed(future_to_case), total=len(test_cases), desc="Processing test cases"):
try: try:
@ -121,53 +120,40 @@ def process_jsonl_file(input_file: str, output_file: str, api_key: str, pdf_dir:
# Add failed case with error message # Add failed case with error message
case["gpt4_response"] = f"PROCESSING_ERROR: {str(e)}" case["gpt4_response"] = f"PROCESSING_ERROR: {str(e)}"
results.append(case) results.append(case)
# Filter for cases where GPT-4 responded with "NO" # Filter for cases where GPT-4 responded with "NO"
no_responses = [result for result in results no_responses = [result for result in results if "gpt4_response" in result and result["gpt4_response"].startswith("NO")]
if "gpt4_response" in result
and result["gpt4_response"].startswith("NO")]
# Write filtered results to output file # Write filtered results to output file
with open(output_file, 'w') as f: with open(output_file, "w") as f:
for result in no_responses: for result in no_responses:
f.write(json.dumps(result) + '\n') f.write(json.dumps(result) + "\n")
print(f"Processed {len(results)} test cases. Found {len(no_responses)} cases with 'NO' responses. Results written to {output_file}") print(f"Processed {len(results)} test cases. Found {len(no_responses)} cases with 'NO' responses. Results written to {output_file}")
def main():
    """CLI entry point: parse arguments, resolve the OpenAI API key, validate
    the PDF directory, and kick off the parallel JSONL processing run."""
    cli = argparse.ArgumentParser(description="Process multi_column.jsonl with GPT-4 to check text regions")
    cli.add_argument("--input", default="/home/ubuntu/olmocr/olmOCR-bench/bench_data/multi_column.jsonl", help="Path to input JSONL file")
    cli.add_argument("--output", default="/home/ubuntu/olmocr/olmOCR-bench/bench_data/multi_column_gpt4_regions.jsonl", help="Path to output JSONL file")
    cli.add_argument("--pdf-dir", default="/home/ubuntu/olmocr/olmOCR-bench/bench_data/pdfs", help="Directory containing the PDF files")
    cli.add_argument("--workers", type=int, default=8, help="Number of parallel workers")
    cli.add_argument("--model", default="gpt-4.1", help="OpenAI model to use")
    cli.add_argument("--api-key", help="OpenAI API key (if not provided, uses OPENAI_API_KEY env var)")
    opts = cli.parse_args()

    # CLI flag wins; fall back to the environment variable.
    resolved_key = opts.api_key or os.environ.get("OPENAI_API_KEY")
    if not resolved_key:
        raise ValueError("OpenAI API key must be provided either via --api-key or OPENAI_API_KEY environment variable")

    # Fail fast before spinning up any workers if the PDFs aren't there.
    if not os.path.isdir(opts.pdf_dir):
        raise ValueError(f"PDF directory {opts.pdf_dir} does not exist")

    process_jsonl_file(input_file=opts.input, output_file=opts.output, api_key=resolved_key, pdf_dir=opts.pdf_dir, num_workers=opts.workers, model=opts.model)


if __name__ == "__main__":
    main()

View File

@ -75,6 +75,7 @@ class PIIClassification(BaseModel):
is_resume_cv: Optional[bool] = Field(..., description="True if the document is a page from a resume or cv") is_resume_cv: Optional[bool] = Field(..., description="True if the document is a page from a resume or cv")
contains_pii: Optional[bool] = Field(..., description="True if document contains PII") contains_pii: Optional[bool] = Field(..., description="True if document contains PII")
class RichPIIClassification(BaseModel): class RichPIIClassification(BaseModel):
primary_language: str = Field(..., description="Primary language as a two-letter code") primary_language: str = Field(..., description="Primary language as a two-letter code")
document_type: str = Field(..., description="Basic summary of document type classification") document_type: str = Field(..., description="Basic summary of document type classification")
@ -100,12 +101,19 @@ class RichPIIClassification(BaseModel):
def contains_any_pii(self) -> bool: def contains_any_pii(self) -> bool:
if self.is_public_document: if self.is_public_document:
return False return False
if self.contains_pii_government_id or self.contains_pii_financial_info or self.contains_pii_biometric_data or self.contains_pii_login_info: if self.contains_pii_government_id or self.contains_pii_financial_info or self.contains_pii_biometric_data or self.contains_pii_login_info:
return True return True
if self.contains_identifier_name or self.contains_identifier_email or self.contains_identifier_phone_number: if self.contains_identifier_name or self.contains_identifier_email or self.contains_identifier_phone_number:
return self.contains_identifier_with_address or self.contains_identifier_with_biographical_info or self.contains_identifier_with_location_info or self.contains_identifier_with_employment_info or self.contains_identifier_with_education_info or self.contains_identifier_with_medical_info return (
self.contains_identifier_with_address
or self.contains_identifier_with_biographical_info
or self.contains_identifier_with_location_info
or self.contains_identifier_with_employment_info
or self.contains_identifier_with_education_info
or self.contains_identifier_with_medical_info
)
else: else:
return False return False
@ -114,8 +122,6 @@ async def _process_single_page(page_text: str) -> RichPIIClassification:
"""Helper function to process a single document or page.""" """Helper function to process a single document or page."""
text = page_text text = page_text
basic_prompt = "Given the text above, determine what type of document it is, and if it's a resume/CV. answer in JSON. The format of your json object should be {'primary_language': str, 'document_type': str, 'is_resume_cv': bool, 'contains_pii': bool}"
rich_prompt = """You are a document analyzer that identifies Personally Identifiable Information (PII) in documents. rich_prompt = """You are a document analyzer that identifies Personally Identifiable Information (PII) in documents.
Your task is to analyze the provided document image and determine: Your task is to analyze the provided document image and determine:
1. Whether the document is intended for public release or dissemination (e.g., research paper, public report, etc.) 1. Whether the document is intended for public release or dissemination (e.g., research paper, public report, etc.)
@ -157,17 +163,14 @@ Only consider actual occurrences of the PII within the document shown."""
"content": [ "content": [
{ {
"type": "text", "type": "text",
"text": ( "text": (f"{text}\n\n-----------\n" f"{rich_prompt}"),
f"{text}\n\n-----------\n"
f"{rich_prompt}"
),
} }
], ],
} }
], ],
"max_tokens": 100, "max_tokens": 100,
"temperature": 0.0, "temperature": 0.0,
"response_format": {"type": "json_schema", "json_schema": {"name": "PIIClassification", "schema": RichRIIClassification.model_json_schema()}}, "response_format": {"type": "json_schema", "json_schema": {"name": "PIIClassification", "schema": RichPIIClassification.model_json_schema()}},
} }
url = f"http://localhost:{SERVER_PORT}/v1/chat/completions" url = f"http://localhost:{SERVER_PORT}/v1/chat/completions"
@ -186,7 +189,7 @@ Only consider actual occurrences of the PII within the document shown."""
logger.warning(f"Server HTTP {status}: {body[:250]!r}") logger.warning(f"Server HTTP {status}: {body[:250]!r}")
metrics.add_metrics(server_errors=1) metrics.add_metrics(server_errors=1)
return RichPIIClassification(primary_language="en", document_type="unknown", is_public_document=False) return RichPIIClassification(primary_language="en", document_type="unknown", is_public_document=False)
# ---------- Parse base JSON -------------------------------------------- # ---------- Parse base JSON --------------------------------------------
try: try:
base = json.loads(body) base = json.loads(body)
@ -209,12 +212,12 @@ Only consider actual occurrences of the PII within the document shown."""
logger.warning(f"Missing fields in Server response: {e!s}") logger.warning(f"Missing fields in Server response: {e!s}")
metrics.add_metrics(server_errors=1) metrics.add_metrics(server_errors=1)
return RichPIIClassification(primary_language="en", document_type="unknown", is_public_document=False) return RichPIIClassification(primary_language="en", document_type="unknown", is_public_document=False)
if not isinstance(content, str): if not isinstance(content, str):
logger.warning("Server `content` is not a string; treating as error.") logger.warning("Server `content` is not a string; treating as error.")
metrics.add_metrics(server_errors=1) metrics.add_metrics(server_errors=1)
return RichPIIClassification(primary_language="en", document_type="unknown", is_public_document=False) return RichPIIClassification(primary_language="en", document_type="unknown", is_public_document=False)
try: try:
pii_classification: RichPIIClassification = RichPIIClassification.model_validate_json(content) pii_classification: RichPIIClassification = RichPIIClassification.model_validate_json(content)
return pii_classification return pii_classification
@ -222,7 +225,7 @@ Only consider actual occurrences of the PII within the document shown."""
logger.warning(f"Unable to parse pii classification object: {e!s}") logger.warning(f"Unable to parse pii classification object: {e!s}")
metrics.add_metrics(server_errors=1) metrics.add_metrics(server_errors=1)
return RichPIIClassification(primary_language="en", document_type="unknown", is_public_document=False) return RichPIIClassification(primary_language="en", document_type="unknown", is_public_document=False)
# Manual simple implementation of HTTP Post # Manual simple implementation of HTTP Post
# It feels strange perhaps, but httpx and aiohttp are very complex beasts # It feels strange perhaps, but httpx and aiohttp are very complex beasts