Tinyhosting automatically

This commit is contained in:
Jake Poznanski 2025-04-04 16:29:58 +00:00
parent 02cd002488
commit cd9e370c92

View File

@ -6,6 +6,7 @@ import random
import re
import sqlite3
import tempfile
import tinyhost
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Optional
@ -488,7 +489,12 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
<body>
<div class="container">
<header>
<h1>OLMOCR Random Samples</h1>
<p>
<strong>Instructions: </strong>Please review each document below and mark if it contains PII (Personally identifiable information). If you cannot read it (ex. the document is not in English, or is otherwise unreadable), mark it as such.
If the document contains disturbing or graphic content, please mark that. Finally, if there is PII, type in a brief description and press Enter. Once you mark all 30 documents, the completion code will
be presented.
</p>
<div style="display: flex; font-family: Arial, sans-serif; font-size: 14px; max-width: 1000px; margin: 0 auto;">
<div style="flex: 1; padding: 15px; background-color: #f5f7f9; border-radius: 8px; margin-right: 10px;">
<h3 style="color: #2c3e50; margin-top: 0; border-bottom: 1px solid #ddd; padding-bottom: 8px;">PII Direct Identifiers</h3>
@ -526,6 +532,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
</ul>
</div>
</div>
</header>
<div class="info-bar">
@ -644,10 +651,6 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
<div class="progress-fill" id="progress-fill"></div>
</div>
</div>
<footer>
<p>Generated by OLMOCR Sampling Tool</p>
</footer>
</div>
<script>
// Using externally injected async functions: fetchDatastore() and putDatastore()
@ -868,6 +871,8 @@ def main():
print(f"Found {len(result_files)} result files")
# Use ThreadPoolExecutor to parallelize the generation of sample sets
output_files = []
if args.repeats > 1:
print(f"Using ThreadPoolExecutor with {min(args.max_workers, args.repeats)} workers")
with ThreadPoolExecutor(max_workers=min(args.max_workers, args.repeats)) as executor:
@ -880,12 +885,22 @@ def main():
for future in futures:
try:
output_filename = future.result()
output_files.append(output_filename)
print(f"Completed generation of {output_filename}")
except Exception as e:
print(f"Error generating sample set: {e}")
else:
# If only one repeat, just run it directly
generate_sample_set(args, 0, s3_client, pdf_s3_client, result_files)
output_filename = generate_sample_set(args, 0, s3_client, pdf_s3_client, result_files)
output_files.append(output_filename)
# Now upload each resulting file into tinyhost
print("Generated all files, uploading tinyhost links now")
links = []
for output_filename in output_files:
link = tinyhost.tinyhost([str(output_filename)])
links.append(link)
print(link)
if __name__ == "__main__":