Mirror of https://github.com/allenai/olmocr.git, synced 2025-06-27 04:00:02 +00:00
Removing unused file
This commit is contained in:
parent 24a2f9b0a4
commit 67e9ec873f
@@ -1,86 +0,0 @@
import json
from concurrent.futures import ProcessPoolExecutor, as_completed

import boto3
from tqdm import tqdm

# Configuration
BUCKET = "ai2-llm"
PREFIX = "pretraining-data/sources/soldni-open-access-books/v0/pipeline/results"
OUTPUT_FILENAME = "all_completed_files.txt"

def process_file(key: str):
    """
    Process a single S3 file given by its key.
    Reads a jsonl file from S3, decodes each line,
    extracts the 'Source-File' from the 'metadata' field,
    and returns a list of these source file strings.
    """
    # Create a new S3 client in the worker process (each process needs its own client)
    s3 = boto3.client("s3")
    extracted_lines = []
    try:
        response = s3.get_object(Bucket=BUCKET, Key=key)
        for raw_line in response["Body"].iter_lines():
            try:
                # Decode the line from bytes to text
                line_str = raw_line.decode("utf-8")
            except UnicodeDecodeError as e:
                print(f"Skipping a line in {key} due to decode error: {e}")
                continue
            try:
                data = json.loads(line_str)
            except json.JSONDecodeError as e:
                print(f"Skipping a malformed json line in {key}: {e}")
                continue
            # Extract 'Source-File' from metadata if present
            metadata = data.get("metadata", {})
            source_file = metadata.get("Source-File")
            if source_file:
                extracted_lines.append(source_file)
    except Exception as e:
        print(f"Error processing file {key}: {e}")
    return extracted_lines
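# The code above assumes JSONL records shaped roughly like
#   {"text": "...", "metadata": {"Source-File": "s3://bucket/path/document.pdf"}}
# (illustrative values only, not taken from the actual data). Records whose
# metadata has no 'Source-File' entry are skipped without raising an error.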
def main():
    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")
    page_iterator = paginator.paginate(Bucket=BUCKET, Prefix=PREFIX)

    # Gather all S3 object keys under the specified prefix
    keys = []
    for page in page_iterator:
        if "Contents" not in page:
            continue
        for obj in page["Contents"]:
            keys.append(obj["Key"])

    print(f"Found {len(keys)} files to process.")

    # Open the output file for writing
    with open(OUTPUT_FILENAME, "w", encoding="utf-8") as output_file:
        # Create a process pool to process files concurrently.
        # Adjust max_workers based on your environment and workload.
        with ProcessPoolExecutor() as executor:
            # Submit all processing jobs and map each future to its key
            future_to_key = {executor.submit(process_file, key): key for key in keys}
            # Use tqdm to wrap the as_completed iterator for progress display
            for future in tqdm(as_completed(future_to_key), total=len(future_to_key), desc="Processing files"):
                try:
                    source_files = future.result()
                    # Write each extracted line to the output file as soon as the future completes
                    for source in source_files:
                        output_file.write(source + "\n")
                    # Optionally flush after each completed task
                    output_file.flush()
                except Exception as e:
                    key = future_to_key[future]
                    print(f"Exception occurred for file {key}: {e}")

    print(f"Finished writing the source file names to {OUTPUT_FILENAME}")


if __name__ == "__main__":
    main()
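For context, a minimal post-processing sketch, assuming the goal is a deduplicated list of completed source files: the script writes one output line per matching JSONL record, so the same Source-File can appear more than once. The function name dedupe_completed and the default path below are illustrative only and are not part of the removed file.

def dedupe_completed(path: str = "all_completed_files.txt") -> list[str]:
    # dict.fromkeys keeps first-seen order while dropping duplicate entries
    with open(path, encoding="utf-8") as f:
        return list(dict.fromkeys(line.strip() for line in f if line.strip()))

# Usage sketch: unique = dedupe_completed(); print(len(unique), "unique source files")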