| 
									
										
										
										
											2024-09-18 22:52:42 +00:00
										 |  |  | import html | 
					
						
							|  |  |  | import multiprocessing | 
					
						
							| 
									
										
										
										
											2024-09-17 16:26:55 +00:00
										 |  |  | import os | 
					
						
							| 
									
										
										
										
											2024-09-17 19:59:02 +00:00
										 |  |  | import time | 
					
						
							| 
									
										
										
										
											2024-09-17 16:26:55 +00:00
										 |  |  | import unittest | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-10 16:41:19 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-18 22:52:42 +00:00
										 |  |  | from pdelfin.filter.coherency import get_document_coherency | 
					
						
							| 
									
										
										
										
											2024-09-17 16:26:55 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-01 20:19:03 +00:00
										 |  |  | from pdelfin.prompts.anchor import get_anchor_text | 
					
						
							| 
									
										
										
										
											2024-09-17 16:26:55 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | class TestCoherencyScores(unittest.TestCase): | 
					
						
							|  |  |  |     def testBadOcr1(self): | 
					
						
							| 
									
										
										
										
											2024-10-10 16:41:19 +00:00
										 |  |  |         good_text = get_anchor_text( | 
					
						
							|  |  |  |             os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "instructions_and_schematics.pdf"), 1, pdf_engine="pdftotext" | 
					
						
							| 
									
										
										
										
											2024-09-18 22:52:42 +00:00
										 |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-10-10 16:41:19 +00:00
										 |  |  |         ocr1_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "handwriting_bad_ocr.pdf"), 1, pdf_engine="pdftotext") | 
					
						
							|  |  |  |         ocr2_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf"), 1, pdf_engine="pdftotext") | 
					
						
							| 
									
										
										
										
											2024-09-17 16:26:55 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         print("Good", get_document_coherency(good_text)) | 
					
						
							| 
									
										
										
										
											2024-09-17 16:58:45 +00:00
										 |  |  |         print("Bad1", get_document_coherency(ocr1_text)) | 
					
						
							|  |  |  |         print("Bad2", get_document_coherency(ocr2_text)) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-17 19:59:02 +00:00
										 |  |  |     def testHugeBookCoherencySpeed(self): | 
					
						
							| 
									
										
										
										
											2024-10-10 16:41:19 +00:00
										 |  |  |         base_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "ti89_guidebook.pdf"), 1, pdf_engine="pdftotext") | 
					
						
							| 
									
										
										
										
											2024-09-17 19:59:02 +00:00
										 |  |  |         print(f"ti89 book length: {len(base_text):,}") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         warmup = get_document_coherency(base_text[:1000]) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         base_text = base_text[:40000] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         start = time.perf_counter() | 
					
						
							|  |  |  |         score = get_document_coherency(base_text) | 
					
						
							|  |  |  |         end = time.perf_counter() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         char_per_sec = len(base_text) / (end - start) | 
					
						
							|  |  |  |         char_per_sec = char_per_sec / multiprocessing.cpu_count() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         print(f"ti89 book score {score:.2f}") | 
					
						
							|  |  |  |         print(f"{char_per_sec:.2f} chars per second per core") | 
					
						
							| 
									
										
										
										
											2024-09-18 22:52:42 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-17 16:58:45 +00:00
										 |  |  |     def testTwoColumnMisparse(self): | 
					
						
							| 
									
										
										
										
											2024-10-10 16:41:19 +00:00
										 |  |  |         pdftotext_text = get_anchor_text( | 
					
						
							| 
									
										
										
										
											2024-09-18 22:52:42 +00:00
										 |  |  |             os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), | 
					
						
							| 
									
										
										
										
											2024-10-10 16:41:19 +00:00
										 |  |  |             page=2, | 
					
						
							| 
									
										
										
										
											2024-09-18 22:52:42 +00:00
										 |  |  |             pdf_engine="pdftotext", | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-10-10 16:41:19 +00:00
										 |  |  |         pymupdf_text = get_anchor_text( | 
					
						
							| 
									
										
										
										
											2024-09-18 22:52:42 +00:00
										 |  |  |             os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), | 
					
						
							| 
									
										
										
										
											2024-10-10 16:41:19 +00:00
										 |  |  |             page=2, | 
					
						
							| 
									
										
										
										
											2024-09-18 22:52:42 +00:00
										 |  |  |             pdf_engine="pymupdf", | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-10-10 16:41:19 +00:00
										 |  |  |         pdfium_text = get_anchor_text( | 
					
						
							| 
									
										
										
										
											2024-09-18 22:52:42 +00:00
										 |  |  |             os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), | 
					
						
							| 
									
										
										
										
											2024-10-10 16:41:19 +00:00
										 |  |  |             page=2, | 
					
						
							| 
									
										
										
										
											2024-09-18 22:52:42 +00:00
										 |  |  |             pdf_engine="pdfium", | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-17 18:47:27 +00:00
										 |  |  |         # pdftotext_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), pdf_engine="pdftotext") | 
					
						
							|  |  |  |         # pymupdf_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), pdf_engine="pymupdf") | 
					
						
							| 
									
										
										
										
											2024-09-17 16:58:45 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-17 19:59:02 +00:00
										 |  |  |         print("pdftotext_text", pdftotext_score := get_document_coherency(pdftotext_text)) | 
					
						
							|  |  |  |         print("pymupdf_text", pymupdf_score := get_document_coherency(pymupdf_text)) | 
					
						
							|  |  |  |         print("pdfium_text", pdfium_score := get_document_coherency(pdfium_text)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         self.assertLess(pdftotext_score, pymupdf_score) | 
					
						
							| 
									
										
										
										
											2024-10-01 20:19:03 +00:00
										 |  |  |         self.assertLess(pdfium_score, pymupdf_score) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         anchor_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), 2, pdf_engine="topcoherency") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         self.assertEqual(anchor_text, pymupdf_text) |