Even more test cleanup

This commit is contained in:
Jake Poznanski 2025-08-22 18:56:56 +00:00
parent 9831e65161
commit 0df56e958e
2 changed files with 20 additions and 4 deletions

View File

@ -56,7 +56,7 @@ def download_and_extract_source(paper_id, data_dir):
def download_pdf(paper_id, data_dir):
pdf_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
pdf_url = f"https://export.arxiv.org/pdf/{paper_id}.pdf"
print(f"Downloading PDF for {paper_id} from {pdf_url}...")
response = requests.get(pdf_url)
if response.status_code != 200:
@ -105,7 +105,7 @@ def main():
if os.path.exists(tex_path):
os.remove(tex_path)
print(f"Removed tex file for {paper_id} because PDF download failed.")
time.sleep(1)
time.sleep(3)
if __name__ == "__main__":

View File

@ -800,12 +800,22 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb
if sentence_str:
# Skip HTML table content that might still be in markdown
if not sentence_str.startswith('<') and not sentence_str.endswith('>'):
# Skip image placeholders - match any markdown image syntax ![...](...)
if re.search(r'!\[.*?\]\(.*?\)', sentence_str):
continue
# Remove leading # marks (markdown headers)
while sentence_str.startswith('#'):
sentence_str = sentence_str[1:]
sentence_str = sentence_str.strip()
if sentence_str: # Only add if there's still content after removing #
# Remove leading "- " for unordered lists
if sentence_str.startswith('- '):
sentence_str = sentence_str[2:]
sentence_str = sentence_str.strip()
if sentence_str: # Only add if there's still content after cleaning
sentences.append(sentence_str)
# Add a few random ordering tests
@ -1108,7 +1118,13 @@ def process_pdf(pdf_info, args, client, pdf_filter=None):
png_width, png_height = get_png_dimensions_from_base64(image_base64)
# Run the async function in the synchronous context
render_success = asyncio.run(render_pdf_with_playwright(html_content, playwright_pdf_path, png_width, png_height))
# Create a new event loop to avoid conflicts
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
render_success = loop.run_until_complete(render_pdf_with_playwright(html_content, playwright_pdf_path, png_width, png_height))
finally:
loop.close()
if render_success:
print(f"Successfully rendered with Playwright: {playwright_pdf_path}")