| 
									
										
										
										
											2024-09-17 15:16:58 +00:00
										 |  |  | import os | 
					
						
							| 
									
										
										
										
											2024-09-18 22:52:42 +00:00
										 |  |  | import unittest | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | from pypdf import PdfReader | 
					
						
							| 
									
										
										
										
											2024-09-17 15:16:58 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-27 18:30:41 +00:00
										 |  |  | from olmocr.filter import PdfFilter | 
					
						
							| 
									
										
										
										
											2024-09-17 16:26:55 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-17 15:16:58 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | class PdfFilterTest(unittest.TestCase): | 
					
						
							|  |  |  |     def testFormLaterPages(self): | 
					
						
							| 
									
										
										
										
											2024-10-17 22:36:38 +00:00
										 |  |  |         self.filter = PdfFilter(apply_form_check=True) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |         self.assertTrue(self.filter.filter_out_pdf(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "form_on_later_pages.pdf"))) | 
					
						
							| 
									
										
										
										
											2024-10-17 22:36:38 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         self.filter = PdfFilter(apply_form_check=False) | 
					
						
							| 
									
										
										
										
											2024-09-18 22:52:42 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |         self.assertFalse(self.filter.filter_out_pdf(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "form_on_later_pages.pdf"))) |