#!/usr/bin/env python3
"""
Rich Autoscan Dolma Documents with ChatGPT Vision
This script combines the functionality of autoscan_dolmadocs.py and rich_tagging_pipeline.py:
1. Uses ChatGPT Vision API to analyze PDF pages for PII
2. Creates attribute folders mirroring the document structure
3. Uses a ThreadPoolExecutor directly instead of a work-queue system, which simplifies processing
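
Example usage (bucket, profile, and output directory names are placeholders):

    python olmocr/scripts/rich_autoscan_dolmadocs.py s3://my-bucket/my-workspace \
        --pdf_profile my-pdf-profile --openai_model gpt-4.1 --output_dir dolma_samples

The OpenAI key is read from --openai_api_key or the OPENAI_API_KEY environment variable.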
"""
import argparse
import gzip
import json
import logging
import os
import random
import tempfile
from concurrent.futures import ThreadPoolExecutor
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import boto3
import pydantic
import zstandard as zstd
from openai import OpenAI
from tqdm import tqdm
from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.s3_utils import get_s3_bytes, parse_s3_path
# Initialize logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.propagate = False
file_handler = logging.FileHandler("rich-autoscan-debug.log", mode="a")
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
# Add handlers to the logger
logger.addHandler(file_handler)
logger.addHandler(console_handler)
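# Functional Enum of two-letter (ISO 639-1) language codes the model can report for a page;
# "other" is the catch-all for anything outside this list.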
LanguageCode = Enum(
"LanguageCode",
{
"en": "English",
"zh": "Chinese",
"hi": "Hindi",
"es": "Spanish",
"fr": "French",
"ar": "Arabic",
"bn": "Bengali",
"ru": "Russian",
"pt": "Portuguese",
"ur": "Urdu",
"id": "Indonesian",
"de": "German",
"ja": "Japanese",
"sw": "Swahili",
"mr": "Marathi",
"te": "Telugu",
"tr": "Turkish",
"vi": "Vietnamese",
"ta": "Tamil",
"ko": "Korean",
"other": "Other",
},
)
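# Response schema for the structured-outputs request: each boolean below becomes one
# [start_pos, end_pos, value] attribute span in process_single_page().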
class PIIAnnotation(pydantic.BaseModel):
"""Structured model for PII annotations returned by ChatGPT"""
document_description: str
language_code: LanguageCode
cannot_read: bool
inappropriate_content: bool
is_public_document: bool
# PII identifiers
contains_names: bool
contains_email_addresses: bool
contains_phone_numbers: bool
# PII that must co-occur with identifiers
contains_addresses: bool
contains_biographical_info: bool # DOB, gender, etc.
contains_location_info: bool
contains_employment_info: bool
contains_education_info: bool
contains_medical_info: bool
# Always sensitive PII
contains_government_ids: bool
contains_financial_info: bool
contains_biometric_data: bool
contains_login_info: bool
other_pii: str
@property
def has_pii(self) -> bool:
"""Check if the document contains any PII"""
pii_fields = [
self.contains_names,
self.contains_email_addresses,
self.contains_phone_numbers,
self.contains_addresses,
self.contains_biographical_info,
self.contains_location_info,
self.contains_employment_info,
self.contains_education_info,
self.contains_medical_info,
self.contains_government_ids,
self.contains_financial_info,
self.contains_biometric_data,
self.contains_login_info,
]
return any(pii_fields) or bool(self.other_pii.strip())
def get_pii_types(self) -> List[str]:
"""Get a list of all PII types found in the document"""
pii_types = []
if self.contains_names:
pii_types.append("names")
if self.contains_email_addresses:
pii_types.append("email")
if self.contains_phone_numbers:
pii_types.append("phone")
if self.contains_addresses:
pii_types.append("addresses")
if self.contains_biographical_info:
pii_types.append("biographical")
if self.contains_location_info:
pii_types.append("location")
if self.contains_employment_info:
pii_types.append("employment")
if self.contains_education_info:
pii_types.append("education")
if self.contains_medical_info:
pii_types.append("medical")
if self.contains_government_ids:
pii_types.append("government-id")
if self.contains_financial_info:
pii_types.append("financial")
if self.contains_biometric_data:
pii_types.append("biometric")
if self.contains_login_info:
pii_types.append("login-info")
if self.other_pii.strip():
pii_types.append("other")
return pii_types
def parse_args():
parser = argparse.ArgumentParser(description="Rich Autoscan for OLMO OCR workspace using ChatGPT Vision")
parser.add_argument("workspace", help="OLMO OCR workspace path (s3://bucket/workspace)")
parser.add_argument("--workspace_profile", help="AWS profile for accessing workspace (documents and attributes)")
parser.add_argument("--pdf_profile", help="AWS profile for accessing PDFs (can be different from workspace profile)")
parser.add_argument("--output_dir", default="dolma_samples", help="Directory to save output files")
parser.add_argument("--max_workers", type=int, default=16, help="Maximum number of worker threads")
parser.add_argument("--openai_api_key", help="OpenAI API key (or set OPENAI_API_KEY env var)")
parser.add_argument("--openai_model", default="gpt-4.1", help="OpenAI model to use")
parser.add_argument("--attribute_name", default="chatgpt_pii_vision", help="Path to use for attribute naming")
parser.add_argument("--batch_size", type=int, default=1000, help="Number of documents to process in each batch")
return parser.parse_args()
def list_result_files(s3_client, workspace_path):
"""List all JSONL files in the workspace documents directory."""
bucket, prefix = parse_s3_path(workspace_path)
documents_prefix = os.path.join(prefix, "documents").rstrip("/") + "/"
all_files = []
paginator = s3_client.get_paginator("list_objects_v2")
logger.info(f"Listing files from s3://{bucket}/{documents_prefix}")
for page in paginator.paginate(Bucket=bucket, Prefix=documents_prefix):
if "Contents" in page:
all_files.extend(
[
f"s3://{bucket}/{obj['Key']}"
for obj in page["Contents"]
if (
obj["Key"].endswith(".jsonl")
or obj["Key"].endswith(".json")
or obj["Key"].endswith(".jsonl.gz")
or obj["Key"].endswith(".jsonl.zst")
or obj["Key"].endswith(".jsonl.ztd")
or obj["Key"].endswith(".jsonl.zstd")
)
]
)
if len(all_files) % 100 == 0:
logger.info(f"Found {len(all_files)} files so far...")
logger.info(f"Total files found: {len(all_files)}")
return all_files
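# load_document_file accepts plain .jsonl/.json files as well as gzip (.gz) and
# zstandard (.zst/.ztd/.zstd) compressed variants, from S3 or the local filesystem.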
def load_document_file(s3_client, file_path):
"""Load a single document file and return its contents."""
try:
# Fetch raw bytes (S3 or local)
if file_path.startswith("s3://"):
raw = get_s3_bytes(s3_client, file_path)
else:
with open(file_path, "rb") as f:
raw = f.read()
# Decompress if needed
if file_path.endswith(".gz"):
file_bytes = gzip.decompress(raw)
elif file_path.endswith(".zst") or file_path.endswith(".ztd") or file_path.endswith(".zstd"):
dctx = zstd.ZstdDecompressor()
file_bytes = dctx.decompress(raw, max_output_size=1_000_000_000)
else:
file_bytes = raw
# Return the decoded lines
return file_bytes.decode("utf-8").strip().split("\n")
except Exception as e:
logger.error(f"Error loading file {file_path}: {e}")
return []
def get_document_info_from_line(line, file_path, line_index):
"""Extract document information from a single line."""
try:
doc = json.loads(line)
# A Dolma document has "text", "metadata", and "attributes" fields
if "text" not in doc or "metadata" not in doc or "attributes" not in doc:
logger.warning(f"Document in {file_path} line {line_index} is not a valid Dolma document")
return None
# Get the original PDF path from metadata
pdf_path = doc["metadata"].get("Source-File")
if not pdf_path:
return None
# Get page spans from attributes
page_spans = doc["attributes"].get("pdf_page_numbers", [])
if not page_spans:
return None
# Just use the first page for each document
if page_spans:
page_span = page_spans[0] # Just get the first page
if len(page_span) >= 3:
# Page spans are [start_pos, end_pos, page_num]
page_num = page_span[2]
# Extract text for this page
start_pos, end_pos = page_span[0], page_span[1]
page_text = doc["text"][start_pos:end_pos].strip()
# Return the information
return {
"pdf_path": pdf_path,
"page_num": page_num,
"page_text": page_text,
"start_pos": start_pos,
"end_pos": end_pos,
"doc_id": doc["id"],
"source_file": file_path,
"line_index": line_index,
}
return None
except json.JSONDecodeError:
logger.warning(f"Invalid JSON in {file_path} line {line_index}")
return None
except Exception as e:
logger.warning(f"Error processing document in {file_path} line {line_index}: {e}")
return None
def get_all_pages(s3_client, document_files):
"""Get all pages from the document files for processing, preserving file and line order."""
file_contents = {}
# First, collect all file paths and their document info
for file_path in tqdm(document_files, desc="Loading document files"):
lines = load_document_file(s3_client, file_path)
if not lines:
logger.warning(f"Empty or invalid file: {file_path}")
continue
# Parse each line for document info
documents = []
for i, line in enumerate(lines):
doc_info = get_document_info_from_line(line, file_path, i)
# Always add an entry for each line, even if None, to preserve line alignment
documents.append(doc_info)
# Store all documents for this file
file_contents[file_path] = documents
logger.info(f"Loaded {len(documents)} documents from {file_path}")
logger.info(f"Loaded documents from {len(file_contents)} files")
return file_contents
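# Renders a single PDF page to an image and asks the OpenAI model to fill in a
# PIIAnnotation via the structured-outputs (beta.chat.completions.parse) API.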
def chatgpt_analyze_page(pdf_path: str, page_num: int, pdf_s3_client, openai_api_key: str, openai_model: str) -> Optional[PIIAnnotation]:
"""Analyze a page using the ChatGPT vision model with structured outputs."""
try:
# Download PDF to temp file and render to image
bucket, key = parse_s3_path(pdf_path)
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
pdf_data = pdf_s3_client.get_object(Bucket=bucket, Key=key)["Body"].read()
temp_file.write(pdf_data)
temp_file_path = temp_file.name
# Render PDF to base64 image
base64_image = render_pdf_to_base64png(temp_file_path, page_num, target_longest_image_dim=2048)
# Clean up temp file
os.unlink(temp_file_path)
# Create OpenAI client
client = OpenAI(api_key=openai_api_key)
# Prepare the user message with all instructions (prompt kept identical to autoscan_dolmadocs.py)
user_message = """
You are a document analyzer that identifies Personally Identifiable Information (PII) in documents.
Your task is to analyze the provided document image and determine:
1. Whether the document is intended for public release or dissemination (e.g., research paper, public report, etc.)
2. If the document contains any PII
For PII identification, follow these specific guidelines:
IDENTIFIERS FOR PII:
The following are considered identifiers that can make information PII:
- Names (full names, first names, last names, nicknames)
- Email addresses
- Phone numbers
PII THAT MUST CO-OCCUR WITH AN IDENTIFIER:
The following types of information should ONLY be marked as PII if they occur ALONGSIDE an identifier (commonly, a person's name):
- Addresses (street address, postal code, etc.)
- Biographical Information (date of birth, place of birth, gender, sexual orientation, race, ethnicity, citizenship/immigration status, religion)
- Location Information (geolocations, specific coordinates)
- Employment Information (job titles, workplace names, employment history)
- Education Information (school names, degrees, transcripts)
- Medical Information (health records, diagnoses, genetic or neural data)
PII THAT OCCURS EVEN WITHOUT AN IDENTIFIER:
The following should ALWAYS be marked as PII even if they do not occur alongside an identifier:
- Government IDs (Social Security Numbers, passport numbers, driver's license numbers, tax IDs)
- Financial Information (credit card numbers, bank account/routing numbers)
- Biometric Data (fingerprints, retina scans, facial recognition data, voice signatures)
- Login information (ONLY mark as PII when a username, password, and login location are present together)
If the document is a form, then only consider fields which are filled out with specific values as potential PII.
If this page does not itself contain PII, but references documents (such as curriculum vitae, personal statements) that typically contain PII, then do not mark it as PII.
Only consider actual occurrences of the PII within the document shown.
"""
# Use the chat completions API with the custom schema
completion = client.beta.chat.completions.parse(
model=openai_model,
messages=[
{
"role": "user",
"content": [{"type": "text", "text": user_message}, {"type": "image_url", "image_url": {"url": f"data:image/webp;base64,{base64_image}"}}],
}
],
response_format=PIIAnnotation,
max_tokens=1000,
)
return completion.choices[0].message.parsed
except Exception as e:
logger.error(f"Error analyzing page {pdf_path} (page {page_num}): {e}")
return None
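# Worker body run inside the ThreadPoolExecutor: analyzes one document's first page and
# returns a dict keyed by doc id and line_index so results can be re-aligned to the
# source file's line order when attributes are written.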
def process_single_page(args, doc_info, pdf_s3_client=None):
"""Process a single document and generate attribute data."""
# Skip if document info is None
if doc_info is None:
return None
# Extract info from the document info
pdf_path = doc_info["pdf_path"]
page_num = doc_info["page_num"]
start_pos = doc_info["start_pos"]
end_pos = doc_info["end_pos"]
doc_id = doc_info["doc_id"]
source_file = doc_info["source_file"]
line_index = doc_info["line_index"]
# Use provided PDF S3 client if given, otherwise create one
if pdf_s3_client is None:
if args.pdf_profile:
pdf_session = boto3.Session(profile_name=args.pdf_profile)
pdf_s3_client = pdf_session.client("s3")
else:
pdf_s3_client = boto3.client("s3")
# Get OpenAI API key
openai_api_key = args.openai_api_key or os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
raise ValueError("OpenAI API key must be provided via --openai_api_key or OPENAI_API_KEY environment variable")
# Analyze page with ChatGPT
annotation = chatgpt_analyze_page(pdf_path, page_num, pdf_s3_client, openai_api_key, args.openai_model)
if not annotation:
logger.warning(f"No annotation for {pdf_path} page {page_num}")
return {
"id": doc_id,
"line_index": line_index,
"attributes": None,
"source_file": source_file,
}
# Generate attribute key names using model name
model_prefix = args.openai_model.replace("/", "_").replace("-", "_").replace(".", "_")
language_key_name = f"{model_prefix}_language"
contains_pii_key_name = f"{model_prefix}_contains_pii"
# Initialize result attributes with all PIIAnnotation fields
result_attributes = {
contains_pii_key_name: [[start_pos, end_pos, annotation.has_pii]],
language_key_name: [[start_pos, end_pos, annotation.language_code.name]],
f"{model_prefix}_is_public_document": [[start_pos, end_pos, annotation.is_public_document]],
f"{model_prefix}_contains_names": [[start_pos, end_pos, annotation.contains_names]],
f"{model_prefix}_contains_email_addresses": [[start_pos, end_pos, annotation.contains_email_addresses]],
f"{model_prefix}_contains_phone_numbers": [[start_pos, end_pos, annotation.contains_phone_numbers]],
f"{model_prefix}_contains_addresses": [[start_pos, end_pos, annotation.contains_addresses]],
f"{model_prefix}_contains_biographical_info": [[start_pos, end_pos, annotation.contains_biographical_info]],
f"{model_prefix}_contains_location_info": [[start_pos, end_pos, annotation.contains_location_info]],
f"{model_prefix}_contains_employment_info": [[start_pos, end_pos, annotation.contains_employment_info]],
f"{model_prefix}_contains_education_info": [[start_pos, end_pos, annotation.contains_education_info]],
f"{model_prefix}_contains_medical_info": [[start_pos, end_pos, annotation.contains_medical_info]],
f"{model_prefix}_contains_government_ids": [[start_pos, end_pos, annotation.contains_government_ids]],
f"{model_prefix}_contains_financial_info": [[start_pos, end_pos, annotation.contains_financial_info]],
f"{model_prefix}_contains_biometric_data": [[start_pos, end_pos, annotation.contains_biometric_data]],
f"{model_prefix}_contains_login_info": [[start_pos, end_pos, annotation.contains_login_info]],
f"{model_prefix}_other_pii": [[start_pos, end_pos, annotation.other_pii]],
}
# Return document ID, line index, and attributes
return {
"id": doc_id,
"line_index": line_index,
"attributes": result_attributes,
"source_file": source_file,
}
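# Writes one attributes file per source documents file, mirroring the documents/ layout
# under attributes/<attribute_name>/ and preserving the source file's compression.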
def write_attribute_file(args, processed_docs, file_documents, workspace_s3):
"""Write attribute results to the appropriate files, preserving exact line order."""
# Group results by source file and organize by line index
results_by_file = {}
for result in processed_docs:
if result is None:
continue
source_file = result["source_file"]
if source_file not in results_by_file:
results_by_file[source_file] = {}
# Store by line index to preserve order
results_by_file[source_file][result["line_index"]] = {"id": result["id"], "attributes": result["attributes"]}
# Process each source file
for source_file, file_results_dict in results_by_file.items():
try:
# 1. Build the relative path that mirrors documents/…
if source_file.startswith("s3://"):
_, key = parse_s3_path(source_file)
_, docs_prefix = parse_s3_path(args.workspace)
rel_path = key[len(os.path.join(docs_prefix, "documents/")) :]
else:
docs_root = os.path.join(args.workspace, "documents")
rel_path = os.path.relpath(source_file, docs_root)
# 2. Create ordered attribute entries in exact same order as source file
file_entries = []
# Get the original documents to ensure we have ALL lines in order
original_docs = file_documents[source_file]
# Create attribute entries for every line
for i, doc_info in enumerate(original_docs):
if i in file_results_dict and file_results_dict[i]["attributes"] is not None:
# We have a processed result for this line
file_entries.append(file_results_dict[i])
elif doc_info is not None:
# We have document info but no processed attributes (processing failed)
# Create an empty attributes entry with the correct ID
file_entries.append({"id": doc_info["doc_id"], "attributes": {}})
else:
# This line in the source file was invalid or not a document
# Create a placeholder with a generated ID
placeholder_id = f"placeholder_{source_file}_{i}"
file_entries.append({"id": placeholder_id, "attributes": {}})
# 3. Create output JSONL
out_rel = os.path.join("attributes", args.attribute_name, rel_path)
out_jsonl = "\n".join(json.dumps(entry) for entry in file_entries) + "\n"
# 4. Preserve compression type
if rel_path.endswith(".gz"):
payload = gzip.compress(out_jsonl.encode("utf-8"))
elif rel_path.endswith((".zst", ".ztd", ".zstd")):
payload = zstd.ZstdCompressor().compress(out_jsonl.encode("utf-8"))
else:
payload = out_jsonl.encode("utf-8")
# 5. Write to args.workspace (local or S3)
if args.workspace.startswith("s3://"):
bucket, prefix = parse_s3_path(args.workspace)
key = os.path.join(prefix, out_rel)
workspace_s3.put_object(Bucket=bucket, Key=key, Body=payload)
logger.info(f"Wrote {len(file_entries)} attribute entries to s3://{bucket}/{key}")
else:
out_path = os.path.join(args.workspace, out_rel)
os.makedirs(os.path.dirname(out_path), exist_ok=True)
with open(out_path, "wb") as fh:
fh.write(payload)
logger.info(f"Wrote {len(file_entries)} attribute entries to {out_path}")
except Exception as e:
logger.error(f"Error writing attributes for {source_file}: {e}")
continue
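# Dumps the raw per-batch results to a JSON file for offline inspection; any Enum
# values are serialized via their .value.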
def save_results(results, output_dir):
"""Save the full results to a JSON file for analysis."""
output_path = Path(output_dir) / "rich_autoscan_results.json"
# Convert results to serializable format
serializable_results = []
for result in results:
if result is None:
continue
serializable_results.append(result)
with open(output_path, "w") as f:
json.dump(serializable_results, f, indent=2, default=lambda o: o.value if isinstance(o, Enum) else o)
print(f"Results saved to {output_path}")
def main():
args = parse_args()
# Set up S3 clients with appropriate profiles
if args.workspace_profile:
workspace_session = boto3.Session(profile_name=args.workspace_profile)
workspace_s3 = workspace_session.client("s3")
logger.info(f"Using AWS profile '{args.workspace_profile}' for workspace access")
else:
workspace_s3 = boto3.client("s3")
logger.info("Using default AWS credentials for workspace access")
if args.pdf_profile:
pdf_session = boto3.Session(profile_name=args.pdf_profile)
pdf_s3 = pdf_session.client("s3")
logger.info(f"Using AWS profile '{args.pdf_profile}' for PDF access")
else:
# If no PDF profile specified, use the workspace profile or default
if args.workspace_profile:
pdf_s3 = workspace_s3
logger.info(f"Using workspace profile '{args.workspace_profile}' for PDF access")
else:
pdf_s3 = boto3.client("s3")
logger.info("Using default AWS credentials for PDF access")
# Create output directory
output_dir = Path(args.output_dir)
output_dir.mkdir(exist_ok=True, parents=True)
# List all document files
logger.info(f"Listing document files in {args.workspace}/documents...")
document_files = list_result_files(workspace_s3, args.workspace)
logger.info(f"Found {len(document_files)} document files")
# Load all document files and their contents, organized by file
logger.info("Loading all document files...")
file_documents = get_all_pages(workspace_s3, document_files)
# Process each file individually
for file_index, (file_path, documents) in enumerate(file_documents.items()):
logger.info(f"Processing file {file_index+1}/{len(file_documents)}: {file_path}")
# Only process documents that have valid information
valid_docs = []
for doc in documents:
if doc is not None:
valid_docs.append(doc)
# Skip if no valid documents
if not valid_docs:
logger.warning(f"No valid documents in {file_path}")
continue
# Process in batches to manage memory and API rate limits
total_docs = len(valid_docs)
logger.info(f"Found {total_docs} valid documents to process in {file_path}")
# Process in batches (process by document, but maintain file coherence)
all_results = []
for i in range(0, total_docs, args.batch_size):
batch = valid_docs[i : i + args.batch_size]
batch_num = i // args.batch_size + 1
total_batches = (total_docs + args.batch_size - 1) // args.batch_size
logger.info(f"Processing batch {batch_num}/{total_batches} of {file_path} ({len(batch)} documents)...")
results = []
with ThreadPoolExecutor(max_workers=args.max_workers) as executor:
futures = []
# Process documents in parallel but within the same file
for doc_info in batch:
futures.append(executor.submit(process_single_page, args, doc_info, pdf_s3))
for future in tqdm(futures, desc=f"Processing batch {batch_num} of {file_path}"):
try:
result = future.result()
results.append(result)
except Exception as e:
logger.error(f"Error processing document: {e}")
# Save results for this batch
batch_output_dir = os.path.join(args.output_dir, f"file_{file_index+1}_batch_{batch_num}")
os.makedirs(batch_output_dir, exist_ok=True)
save_results(results, batch_output_dir)
# Collect all results for this file
all_results.extend(results)
logger.info(f"Completed batch {batch_num}/{total_batches} of {file_path}")
# Write attributes for the entire file, maintaining line order
write_attribute_file(args, all_results, file_documents, workspace_s3)
logger.info(f"Completed processing file {file_index+1}/{len(file_documents)}: {file_path}")
logger.info(f"Processing complete - processed {len(file_documents)} files")
if __name__ == "__main__":
main()