diff --git a/olmocr/bench/synth/mine_html_templates.py b/olmocr/bench/synth/mine_html_templates.py index e27a55b..798de33 100644 --- a/olmocr/bench/synth/mine_html_templates.py +++ b/olmocr/bench/synth/mine_html_templates.py @@ -144,7 +144,7 @@ async def render_pdf_with_playwright(html_content, output_pdf_path, png_width, p bool: True if rendering was successful with exactly one page, False otherwise """ scale_factors = [1.0, 0.9, 0.8, 0.7] # Try these scale factors in order - + for scale in scale_factors: try: async with async_playwright() as p: @@ -162,7 +162,7 @@ async def render_pdf_with_playwright(html_content, output_pdf_path, png_width, p ) await browser.close() - + # Check if the output PDF has exactly one page try: reader = pypdf.PdfReader(output_pdf_path) @@ -175,11 +175,11 @@ async def render_pdf_with_playwright(html_content, output_pdf_path, png_width, p except Exception as pdf_check_error: print(f"Error checking PDF page count: {pdf_check_error}") return False - + except Exception as e: print(f"Error rendering PDF with Playwright at scale {scale}: {str(e)}") # Try the next scale factor - + print("Failed to render PDF as a single page with any scale factor") return False @@ -209,35 +209,37 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int) -> L def create_absence_tests_from_elements(parent_element, element_type): # Find all text-containing elements within the parent text_elements = [] - + # First get direct text nodes within spans, divs, p, and heading tags - for tag in parent_element.find_all(['span', 'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']): + for tag in parent_element.find_all(["span", "div", "p", "h1", "h2", "h3", "h4", "h5", "h6"]): text = tag.get_text().strip() if text: text_elements.append(text) - + # If no elements found, use the parent's text as a fallback if not text_elements: parent_text = parent_element.get_text().strip() if parent_text: text_elements.append(parent_text) - + # Create tests for each text element for text in text_elements: if len(text) > 3: # Only create tests for meaningful text - tests.append({ - "pdf": pdf_filename, - "page": page_num, - "id": f"{pdf_id}_{element_type}_{uuid.uuid4().hex[:8]}", - "type": TestType.ABSENT.value, - "text": text, - "max_diffs": 5 - }) - + tests.append( + { + "pdf": pdf_filename, + "page": page_num, + "id": f"{pdf_id}_{element_type}_{uuid.uuid4().hex[:8]}", + "type": TestType.ABSENT.value, + "text": text, + "max_diffs": 5, + } + ) + # Create TextAbsenceTests for headers for header in headers: create_absence_tests_from_elements(header, "header") - + # Create TextAbsenceTests for footers for footer in footers: create_absence_tests_from_elements(footer, "footer") @@ -377,7 +379,7 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int) -> L if len(paragraphs) >= 2: # Extract all text from the main content all_text = " ".join([p.get_text().strip() for p in paragraphs]) - + # Use syntok to segment the text into sentences sentences = [] for paragraph in process(all_text): @@ -386,7 +388,7 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int) -> L sentence_text = " ".join([token.value for token in sentence]).strip() if sentence_text and len(sentence_text) > 10 and len(sentence_text) < 100: sentences.append(sentence_text) - + # Create TextOrderTests from pairs of sentences that are at least 3 sentences apart # to ensure they're from different parts of the document if len(sentences) >= 5: @@ -395,19 +397,21 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int) -> L # Get two random indices with sufficient distance between them i = random.randint(0, len(sentences) - 4) j = random.randint(i + 3, min(i + 10, len(sentences) - 1)) - + first_sentence = sentences[i] second_sentence = sentences[j] - - tests.append({ - "pdf": pdf_filename, - "page": page_num, - "id": f"{pdf_id}_order_{uuid.uuid4().hex[:8]}", - "type": TestType.ORDER.value, - "before": first_sentence, - "after": second_sentence, - "max_diffs": 10, - }) + + tests.append( + { + "pdf": pdf_filename, + "page": page_num, + "id": f"{pdf_id}_order_{uuid.uuid4().hex[:8]}", + "type": TestType.ORDER.value, + "before": first_sentence, + "after": second_sentence, + "max_diffs": 10, + } + ) return tests @@ -487,7 +491,7 @@ def process_pdf(pdf_info, args, client): # Run the async function in the synchronous context render_success = asyncio.run(render_pdf_with_playwright(html_content, playwright_pdf_path, png_width, png_height)) - + if render_success: print(f"Successfully rendered with Playwright: {playwright_pdf_path}") else: @@ -501,7 +505,7 @@ def process_pdf(pdf_info, args, client): print(f"Failed to render with Playwright: {e}") playwright_pdf_path = None render_success = False - + # If playwright rendering failed and was required, return None to skip this test if not args.skip_playwright and not render_success: return None diff --git a/scripts/scan_dolmadocs.py b/scripts/scan_dolmadocs.py index 0bc9e62..af00a42 100644 --- a/scripts/scan_dolmadocs.py +++ b/scripts/scan_dolmadocs.py @@ -79,8 +79,8 @@ def list_result_files(s3_client, workspace_path): if "Contents" in page: all_files.extend([f"s3://{bucket}/{obj['Key']}" for obj in page["Contents"] if obj["Key"].endswith(".jsonl") or obj["Key"].endswith(".json")]) - if len(all_files) > 1000: - break + # if len(all_files) > 1000: + # break return all_files @@ -243,14 +243,13 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path, }} .info-item h3 {{ - font-size: 0.875rem; + font-size: 0.6rem; color: var(--text-light); margin-bottom: 0.25rem; }} .info-item p {{ - font-size: 1rem; - font-weight: 500; + font-size: 0.6rem; }} .page-grid {{ @@ -317,16 +316,46 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path, text-decoration: underline; }} + /* New button group styling for connected Yes/No buttons */ + .btn-group {{ + display: inline-flex; + margin-bottom: 0.5rem; + }} + + .btn-group .toggle-button {{ + padding: 0.5rem 1rem; + border: 1px solid var(--border-color); + background-color: #f8fafc; + cursor: pointer; + margin: 0; + /* Remove individual border radius so we can set unified ones */ + border-radius: 0; + }} + + .btn-group .toggle-button:first-child {{ + border-right: none; + border-top-left-radius: 0.25rem; + border-bottom-left-radius: 0.25rem; + }} + + .btn-group .toggle-button:last-child {{ + border-top-right-radius: 0.25rem; + border-bottom-right-radius: 0.25rem; + }} + .feedback {{ margin-top: 0.5rem; padding: 0.5rem; border-top: 1px solid var(--border-color); }} - .feedback label {{ - margin-right: 1rem; - font-size: 0.875rem; - color: var(--text-light); + .feedback .toggle-group {{ + margin-bottom: 0.5rem; + }} + + .toggle-button.active {{ + background-color: var(--primary-color); + color: white; }} .feedback textarea {{ @@ -370,7 +399,43 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
A visual survey of randomly selected pages from processed documents
+Information that identifies a data subject without further context
+Information that can be used to identify a data subject in context or in combination with other information
+