mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-13 08:59:11 +00:00
Even more test cleanup
This commit is contained in:
parent
9831e65161
commit
0df56e958e
@ -56,7 +56,7 @@ def download_and_extract_source(paper_id, data_dir):
|
|||||||
|
|
||||||
|
|
||||||
def download_pdf(paper_id, data_dir):
|
def download_pdf(paper_id, data_dir):
|
||||||
pdf_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
|
pdf_url = f"https://export.arxiv.org/pdf/{paper_id}.pdf"
|
||||||
print(f"Downloading PDF for {paper_id} from {pdf_url}...")
|
print(f"Downloading PDF for {paper_id} from {pdf_url}...")
|
||||||
response = requests.get(pdf_url)
|
response = requests.get(pdf_url)
|
||||||
if response.status_code != 200:
|
if response.status_code != 200:
|
||||||
@ -105,7 +105,7 @@ def main():
|
|||||||
if os.path.exists(tex_path):
|
if os.path.exists(tex_path):
|
||||||
os.remove(tex_path)
|
os.remove(tex_path)
|
||||||
print(f"Removed tex file for {paper_id} because PDF download failed.")
|
print(f"Removed tex file for {paper_id} because PDF download failed.")
|
||||||
time.sleep(1)
|
time.sleep(3)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@ -800,12 +800,22 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb
|
|||||||
if sentence_str:
|
if sentence_str:
|
||||||
# Skip HTML table content that might still be in markdown
|
# Skip HTML table content that might still be in markdown
|
||||||
if not sentence_str.startswith('<') and not sentence_str.endswith('>'):
|
if not sentence_str.startswith('<') and not sentence_str.endswith('>'):
|
||||||
|
# Skip image placeholders - match any markdown image syntax 
|
||||||
|
if re.search(r'!\[.*?\]\(.*?\)', sentence_str):
|
||||||
|
continue
|
||||||
|
|
||||||
# Remove leading # marks (markdown headers)
|
# Remove leading # marks (markdown headers)
|
||||||
while sentence_str.startswith('#'):
|
while sentence_str.startswith('#'):
|
||||||
sentence_str = sentence_str[1:]
|
sentence_str = sentence_str[1:]
|
||||||
sentence_str = sentence_str.strip()
|
sentence_str = sentence_str.strip()
|
||||||
|
|
||||||
if sentence_str: # Only add if there's still content after removing #
|
# Remove leading "- " for unordered lists
|
||||||
|
if sentence_str.startswith('- '):
|
||||||
|
sentence_str = sentence_str[2:]
|
||||||
|
|
||||||
|
sentence_str = sentence_str.strip()
|
||||||
|
|
||||||
|
if sentence_str: # Only add if there's still content after cleaning
|
||||||
sentences.append(sentence_str)
|
sentences.append(sentence_str)
|
||||||
|
|
||||||
# Add a few random ordering tests
|
# Add a few random ordering tests
|
||||||
@ -1108,7 +1118,13 @@ def process_pdf(pdf_info, args, client, pdf_filter=None):
|
|||||||
png_width, png_height = get_png_dimensions_from_base64(image_base64)
|
png_width, png_height = get_png_dimensions_from_base64(image_base64)
|
||||||
|
|
||||||
# Run the async function in the synchronous context
|
# Run the async function in the synchronous context
|
||||||
render_success = asyncio.run(render_pdf_with_playwright(html_content, playwright_pdf_path, png_width, png_height))
|
# Create a new event loop to avoid conflicts
|
||||||
|
loop = asyncio.new_event_loop()
|
||||||
|
asyncio.set_event_loop(loop)
|
||||||
|
try:
|
||||||
|
render_success = loop.run_until_complete(render_pdf_with_playwright(html_content, playwright_pdf_path, png_width, png_height))
|
||||||
|
finally:
|
||||||
|
loop.close()
|
||||||
|
|
||||||
if render_success:
|
if render_success:
|
||||||
print(f"Successfully rendered with Playwright: {playwright_pdf_path}")
|
print(f"Successfully rendered with Playwright: {playwright_pdf_path}")
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user