mirror of
https://github.com/allenai/olmocr.git
synced 2025-12-27 07:05:05 +00:00
Tinyhosting automatically
This commit is contained in:
parent
02cd002488
commit
cd9e370c92
@ -6,6 +6,7 @@ import random
|
||||
import re
|
||||
import sqlite3
|
||||
import tempfile
|
||||
import tinyhost
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
@ -488,7 +489,12 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
||||
<body>
|
||||
<div class="container">
|
||||
<header>
|
||||
<h1>OLMOCR Random Samples</h1>
|
||||
<p>
|
||||
<strong>Instructions: </strong>Please review each document below and mark if it contains PII (Personally identifiable information). If you cannot read it (e.g., the document is not in English, or is otherwise unreadable), mark it as such.
|
||||
If the document contains disturbing or graphic content, please mark that. Finally, if there is PII, type in a brief description and press Enter. Once you mark all 30 documents, the completion code will
|
||||
be presented.
|
||||
</p>
|
||||
|
||||
<div style="display: flex; font-family: Arial, sans-serif; font-size: 14px; max-width: 1000px; margin: 0 auto;">
|
||||
<div style="flex: 1; padding: 15px; background-color: #f5f7f9; border-radius: 8px; margin-right: 10px;">
|
||||
<h3 style="color: #2c3e50; margin-top: 0; border-bottom: 1px solid #ddd; padding-bottom: 8px;">PII Direct Identifiers</h3>
|
||||
@ -526,6 +532,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</header>
|
||||
|
||||
<div class="info-bar">
|
||||
@ -644,10 +651,6 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
||||
<div class="progress-fill" id="progress-fill"></div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<footer>
|
||||
<p>Generated by OLMOCR Sampling Tool</p>
|
||||
</footer>
|
||||
</div>
|
||||
<script>
|
||||
// Using externally injected async functions: fetchDatastore() and putDatastore()
|
||||
@ -868,6 +871,8 @@ def main():
|
||||
print(f"Found {len(result_files)} result files")
|
||||
|
||||
# Use ThreadPoolExecutor to parallelize the generation of sample sets
|
||||
output_files = []
|
||||
|
||||
if args.repeats > 1:
|
||||
print(f"Using ThreadPoolExecutor with {min(args.max_workers, args.repeats)} workers")
|
||||
with ThreadPoolExecutor(max_workers=min(args.max_workers, args.repeats)) as executor:
|
||||
@ -880,12 +885,22 @@ def main():
|
||||
for future in futures:
|
||||
try:
|
||||
output_filename = future.result()
|
||||
output_files.append(output_filename)
|
||||
print(f"Completed generation of {output_filename}")
|
||||
except Exception as e:
|
||||
print(f"Error generating sample set: {e}")
|
||||
else:
|
||||
# If only one repeat, just run it directly
|
||||
generate_sample_set(args, 0, s3_client, pdf_s3_client, result_files)
|
||||
output_filename = generate_sample_set(args, 0, s3_client, pdf_s3_client, result_files)
|
||||
output_files.append(output_filename)
|
||||
|
||||
# Now upload each resulting file into tinyhost
|
||||
print("Generated all files, uploading tinyhost links now")
|
||||
links = []
|
||||
for output_filename in output_files:
|
||||
link = tinyhost.tinyhost([str(output_filename)])
|
||||
links.append(link)
|
||||
print(link)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user