This commit is contained in:
Jake Poznanski 2025-04-30 21:18:59 +00:00
parent 8ef7e56c86
commit 472ee108d7
2 changed files with 66 additions and 77 deletions

View File

@@ -1,15 +1,16 @@
#!/usr/bin/env python3
import argparse
import json
import os
import argparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List, Any, Optional
from typing import Any, Dict
import openai
from tqdm import tqdm
from olmocr.data.renderpdf import render_pdf_to_base64png
def process_test_case(case: Dict[str, Any], client, pdf_dir: str, model: str = "gpt-4o") -> Dict[str, Any]:
"""
Send a request to GPT-4 asking if the before and after text appear in the same region.
@@ -41,20 +42,19 @@ def process_test_case(case: Dict[str, Any], client, pdf_dir: str, model: str = "
"content": [
{
"type": "text",
"text": (f"Does the text in the 'before' field and the 'after' field appear in the same region of the page? "
"text": (
f"Does the text in the 'before' field and the 'after' field appear in the same region of the page? "
f"Look at the PDF image and determine if these texts are located near each other or in completely "
f"different parts of the page. Different regions could be the captions for different images, or inside of different insets or tables. However, appearing the same column of text, or in the naturally flowing next column of text is close enough.\n\n"
f"Before: {before_text}\n\n"
f"After: {after_text}\n\n"
f"Respond with 'YES' if they appear in the same region or column, and 'NO' if they appear in "
f"different regions. Then explain your reasoning in 1-2 sentences.")
f"different regions. Then explain your reasoning in 1-2 sentences."
),
},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
],
},
{
"type": "image_url",
"image_url": {"url": f"data:image/png;base64,{image_base64}"}
}
]
}
]
# Call the API
@@ -77,8 +77,8 @@ def process_test_case(case: Dict[str, Any], client, pdf_dir: str, model: str = "
print(f"Error processing {case.get('id', 'unknown')}: {str(e)}")
return case_with_response
def process_jsonl_file(input_file: str, output_file: str, api_key: str, pdf_dir: str,
num_workers: int = 8, model: str = "gpt-4o") -> None:
def process_jsonl_file(input_file: str, output_file: str, api_key: str, pdf_dir: str, num_workers: int = 8, model: str = "gpt-4o") -> None:
"""
Process each line in the JSONL file by sending requests to GPT-4 in parallel.
@@ -91,7 +91,7 @@ def process_jsonl_file(input_file: str, output_file: str, api_key: str, pdf_dir:
model: The model to use
"""
# Read all test cases from the input file
with open(input_file, 'r') as f:
with open(input_file, "r") as f:
lines = f.readlines()
# Parse each line to get test cases
@@ -100,7 +100,6 @@ def process_jsonl_file(input_file: str, output_file: str, api_key: str, pdf_dir:
if line.strip():
test_cases.append(json.loads(line))
# Initialize OpenAI client
client = openai.OpenAI(api_key=api_key)
@@ -123,31 +122,24 @@ def process_jsonl_file(input_file: str, output_file: str, api_key: str, pdf_dir:
results.append(case)
# Filter for cases where GPT-4 responded with "NO"
no_responses = [result for result in results
if "gpt4_response" in result
and result["gpt4_response"].startswith("NO")]
no_responses = [result for result in results if "gpt4_response" in result and result["gpt4_response"].startswith("NO")]
# Write filtered results to output file
with open(output_file, 'w') as f:
with open(output_file, "w") as f:
for result in no_responses:
f.write(json.dumps(result) + '\n')
f.write(json.dumps(result) + "\n")
print(f"Processed {len(results)} test cases. Found {len(no_responses)} cases with 'NO' responses. Results written to {output_file}")
def main():
parser = argparse.ArgumentParser(description="Process multi_column.jsonl with GPT-4 to check text regions")
parser.add_argument("--input", default="/home/ubuntu/olmocr/olmOCR-bench/bench_data/multi_column.jsonl",
help="Path to input JSONL file")
parser.add_argument("--output", default="/home/ubuntu/olmocr/olmOCR-bench/bench_data/multi_column_gpt4_regions.jsonl",
help="Path to output JSONL file")
parser.add_argument("--pdf-dir", default="/home/ubuntu/olmocr/olmOCR-bench/bench_data/pdfs",
help="Directory containing the PDF files")
parser.add_argument("--workers", type=int, default=8,
help="Number of parallel workers")
parser.add_argument("--model", default="gpt-4.1",
help="OpenAI model to use")
parser.add_argument("--api-key",
help="OpenAI API key (if not provided, uses OPENAI_API_KEY env var)")
parser.add_argument("--input", default="/home/ubuntu/olmocr/olmOCR-bench/bench_data/multi_column.jsonl", help="Path to input JSONL file")
parser.add_argument("--output", default="/home/ubuntu/olmocr/olmOCR-bench/bench_data/multi_column_gpt4_regions.jsonl", help="Path to output JSONL file")
parser.add_argument("--pdf-dir", default="/home/ubuntu/olmocr/olmOCR-bench/bench_data/pdfs", help="Directory containing the PDF files")
parser.add_argument("--workers", type=int, default=8, help="Number of parallel workers")
parser.add_argument("--model", default="gpt-4.1", help="OpenAI model to use")
parser.add_argument("--api-key", help="OpenAI API key (if not provided, uses OPENAI_API_KEY env var)")
args = parser.parse_args()
@@ -160,14 +152,8 @@ def main():
if not os.path.isdir(args.pdf_dir):
raise ValueError(f"PDF directory {args.pdf_dir} does not exist")
process_jsonl_file(
input_file=args.input,
output_file=args.output,
api_key=api_key,
pdf_dir=args.pdf_dir,
num_workers=args.workers,
model=args.model
)
process_jsonl_file(input_file=args.input, output_file=args.output, api_key=api_key, pdf_dir=args.pdf_dir, num_workers=args.workers, model=args.model)
if __name__ == "__main__":
main()
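
Note: the hunks above are mostly a formatting pass (imports sorted, long call sites and string literals collapsed onto single lines) over a script that asks GPT-4 whether a 'before'/'after' text pair sits in the same region of a rendered PDF page. The executor loop that fans the test cases out to the worker pool is elided from these hunks; the following is a minimal sketch of that pattern under the assumption that it mirrors the surrounding code. The names process_one, run, and cases are hypothetical stand-ins, not the file's own.

import json
from concurrent.futures import ThreadPoolExecutor, as_completed

from tqdm import tqdm


def process_one(case: dict) -> dict:
    # Stand-in for process_test_case(); the real function renders the PDF page
    # and calls the OpenAI API, storing the reply under "gpt4_response".
    case["gpt4_response"] = "NO - example reasoning"
    return case


def run(cases: list, output_file: str, num_workers: int = 8) -> None:
    results = []
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        # Submit every test case, then collect results as they finish.
        futures = {executor.submit(process_one, case): case for case in cases}
        for future in tqdm(as_completed(futures), total=len(futures)):
            results.append(future.result())

    # Keep only the cases the model judged to be in different regions.
    no_responses = [r for r in results if r.get("gpt4_response", "").startswith("NO")]
    with open(output_file, "w") as f:
        for result in no_responses:
            f.write(json.dumps(result) + "\n")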

View File

@@ -75,6 +75,7 @@ class PIIClassification(BaseModel):
is_resume_cv: Optional[bool] = Field(..., description="True if the document is a page from a resume or cv")
contains_pii: Optional[bool] = Field(..., description="True if document contains PII")
class RichPIIClassification(BaseModel):
primary_language: str = Field(..., description="Primary language as a two-letter code")
document_type: str = Field(..., description="Basic summary of document type classification")
@@ -105,7 +106,14 @@ class RichPIIClassification(BaseModel):
return True
if self.contains_identifier_name or self.contains_identifier_email or self.contains_identifier_phone_number:
return self.contains_identifier_with_address or self.contains_identifier_with_biographical_info or self.contains_identifier_with_location_info or self.contains_identifier_with_employment_info or self.contains_identifier_with_education_info or self.contains_identifier_with_medical_info
return (
self.contains_identifier_with_address
or self.contains_identifier_with_biographical_info
or self.contains_identifier_with_location_info
or self.contains_identifier_with_employment_info
or self.contains_identifier_with_education_info
or self.contains_identifier_with_medical_info
)
else:
return False
@@ -114,8 +122,6 @@ async def _process_single_page(page_text: str) -> RichPIIClassification:
"""Helper function to process a single document or page."""
text = page_text
basic_prompt = "Given the text above, determine what type of document it is, and if it's a resume/CV. answer in JSON. The format of your json object should be {'primary_language': str, 'document_type': str, 'is_resume_cv': bool, 'contains_pii': bool}"
rich_prompt = """You are a document analyzer that identifies Personally Identifiable Information (PII) in documents.
Your task is to analyze the provided document image and determine:
1. Whether the document is intended for public release or dissemination (e.g., research paper, public report, etc.)
@@ -157,17 +163,14 @@ Only consider actual occurrences of the PII within the document shown."""
"content": [
{
"type": "text",
"text": (
f"{text}\n\n-----------\n"
f"{rich_prompt}"
),
"text": (f"{text}\n\n-----------\n" f"{rich_prompt}"),
}
],
}
],
"max_tokens": 100,
"temperature": 0.0,
"response_format": {"type": "json_schema", "json_schema": {"name": "PIIClassification", "schema": RichRIIClassification.model_json_schema()}},
"response_format": {"type": "json_schema", "json_schema": {"name": "PIIClassification", "schema": RichPIIClassification.model_json_schema()}},
}
url = f"http://localhost:{SERVER_PORT}/v1/chat/completions"