Remove PyMuPDF dependency (#148)

This commit is contained in:
Tanay Soni 2020-06-12 11:53:18 +02:00 committed by GitHub
parent 997aafcf62
commit 42f56672b4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 24 additions and 26 deletions

View File

@@ -6,7 +6,6 @@ from itertools import chain
from pathlib import Path from pathlib import Path
from typing import List, Optional, Tuple, Generator, Set from typing import List, Optional, Tuple, Generator, Set
import fitz
import langdetect import langdetect
from haystack.indexing.file_converters.base import BaseConverter from haystack.indexing.file_converters.base import BaseConverter
@@ -67,10 +66,10 @@ class PDFToTextConverter(BaseConverter):
def extract_pages(self, file_path: Path) -> List[str]: def extract_pages(self, file_path: Path) -> List[str]:
page_count = fitz.open(file_path).pageCount pages = self._read_pdf(file_path, layout=False)
pages = [] cleaned_pages = []
for page_number in range(1, page_count + 1): for page in pages:
# pdftotext tool provides an option to retain the original physical layout of a PDF page. This behaviour # pdftotext tool provides an option to retain the original physical layout of a PDF page. This behaviour
# can be toggled by using the layout param. # can be toggled by using the layout param.
# layout=True # layout=True
@@ -81,7 +80,6 @@ class PDFToTextConverter(BaseConverter):
# - cells of tables get split across lines # - cells of tables get split across lines
# #
# Here, as a "safe" default, layout is turned off. # Here, as a "safe" default, layout is turned off.
page = self._extract_page(file_path, page_number, layout=False)
lines = page.splitlines() lines = page.splitlines()
cleaned_lines = [] cleaned_lines = []
for line in lines: for line in lines:
@@ -104,40 +102,41 @@ class PDFToTextConverter(BaseConverter):
if self.remove_empty_lines: if self.remove_empty_lines:
page = re.sub(r"\n\n+", "\n\n", page) page = re.sub(r"\n\n+", "\n\n", page)
pages.append(page) cleaned_pages.append(page)
page_number += 1
document_text = "".join(pages) if self.valid_languages:
if not self._validate_language(document_text): document_text = "".join(cleaned_pages)
logger.warning( if not self._validate_language(document_text):
f"The language for {file_path} is not one of {self.valid_languages}. The file may not have " logger.warning(
f"been decoded in the correct text format." f"The language for {file_path} is not one of {self.valid_languages}. The file may not have "
) f"been decoded in the correct text format."
)
if self.remove_header_footer: if self.remove_header_footer:
pages, header, footer = self.find_and_remove_header_footer( cleaned_pages, header, footer = self.find_and_remove_header_footer(
pages, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1 cleaned_pages, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
) )
logger.info(f"Removed header '{header}' and footer {footer} in {file_path}") logger.info(f"Removed header '{header}' and footer {footer} in {file_path}")
return pages return cleaned_pages
def _extract_page(self, file_path: Path, page_number: int, layout: bool) -> str: def _read_pdf(self, file_path: Path, layout: bool) -> List[str]:
""" """
Extract a page from the pdf file at file_path. Extract pages from the pdf file at file_path.
:param file_path: path of the pdf file :param file_path: path of the pdf file
:param page_number: page number to extract (starting from 1)
:param layout: whether to retain the original physical layout for a page. If disabled, PDF pages are read in :param layout: whether to retain the original physical layout for a page. If disabled, PDF pages are read in
the content stream order. the content stream order.
""" """
if layout: if layout:
command = ["pdftotext", "-layout", "-f", str(page_number), "-l", str(page_number), str(file_path), "-"] command = ["pdftotext", "-layout", str(file_path), "-"]
else: else:
command = ["pdftotext", "-f", str(page_number), "-l", str(page_number), str(file_path), "-"] command = ["pdftotext", str(file_path), "-"]
output_page = subprocess.run(command, capture_output=True, shell=False) output = subprocess.run(command, capture_output=True, shell=False)
page = output_page.stdout.decode(errors="ignore") document = output.stdout.decode(errors="ignore")
return page pages = document.split("\f")
pages = pages[:-1] # the last page in the split is always empty.
return pages
def _validate_language(self, text: str) -> bool: def _validate_language(self, text: str) -> bool:
""" """

View File

@@ -10,5 +10,4 @@ elastic-apm
tox tox
coverage coverage
langdetect # for PDF conversions langdetect # for PDF conversions
PyMuPDF # for PDF conversions
# optional: sentence-transformers # optional: sentence-transformers