From 42f56672b4828d0edd772b573cd4ce86a2bad0d8 Mon Sep 17 00:00:00 2001 From: Tanay Soni Date: Fri, 12 Jun 2020 11:53:18 +0200 Subject: [PATCH] Remove PyMuPDF dependency (#148) --- .../indexing/file_converters/pdftotext.py | 47 +++++++++---------- requirements.txt | 3 +- 2 files changed, 24 insertions(+), 26 deletions(-) diff --git a/haystack/indexing/file_converters/pdftotext.py b/haystack/indexing/file_converters/pdftotext.py index 5413bbd7a..82cd55a0e 100644 --- a/haystack/indexing/file_converters/pdftotext.py +++ b/haystack/indexing/file_converters/pdftotext.py @@ -6,7 +6,6 @@ from itertools import chain from pathlib import Path from typing import List, Optional, Tuple, Generator, Set -import fitz import langdetect from haystack.indexing.file_converters.base import BaseConverter @@ -67,10 +66,10 @@ class PDFToTextConverter(BaseConverter): def extract_pages(self, file_path: Path) -> List[str]: - page_count = fitz.open(file_path).pageCount + pages = self._read_pdf(file_path, layout=False) - pages = [] - for page_number in range(1, page_count + 1): + cleaned_pages = [] + for page in pages: # pdftotext tool provides an option to retain the original physical layout of a PDF page. This behaviour # can be toggled by using the layout param. # layout=True @@ -81,7 +80,6 @@ class PDFToTextConverter(BaseConverter): # - cells of tables gets split across line # # Here, as a "safe" default, layout is turned off. - page = self._extract_page(file_path, page_number, layout=False) lines = page.splitlines() cleaned_lines = [] for line in lines: @@ -104,40 +102,41 @@ class PDFToTextConverter(BaseConverter): if self.remove_empty_lines: page = re.sub(r"\n\n+", "\n\n", page) - pages.append(page) - page_number += 1 + cleaned_pages.append(page) - document_text = "".join(pages) - if not self._validate_language(document_text): - logger.warning( - f"The language for {file_path} is not one of {self.valid_languages}. The file may not have " - f"been decoded in the correct text format." - ) + if self.valid_languages: + document_text = "".join(cleaned_pages) + if not self._validate_language(document_text): + logger.warning( + f"The language for {file_path} is not one of {self.valid_languages}. The file may not have " + f"been decoded in the correct text format." + ) if self.remove_header_footer: - pages, header, footer = self.find_and_remove_header_footer( - pages, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1 + cleaned_pages, header, footer = self.find_and_remove_header_footer( + cleaned_pages, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1 ) logger.info(f"Removed header '{header}' and footer {footer} in {file_path}") - return pages + return cleaned_pages - def _extract_page(self, file_path: Path, page_number: int, layout: bool) -> str: + def _read_pdf(self, file_path: Path, layout: bool) -> List[str]: """ - Extract a page from the pdf file at file_path. + Extract pages from the pdf file at file_path. :param file_path: path of the pdf file - :param page_number: page number to extract(starting from 1) :param layout: whether to retain the original physical layout for a page. If disabled, PDF pages are read in the content stream order. """ if layout: - command = ["pdftotext", "-layout", "-f", str(page_number), "-l", str(page_number), str(file_path), "-"] + command = ["pdftotext", "-layout", str(file_path), "-"] else: - command = ["pdftotext", "-f", str(page_number), "-l", str(page_number), str(file_path), "-"] - output_page = subprocess.run(command, capture_output=True, shell=False) - page = output_page.stdout.decode(errors="ignore") - return page + command = ["pdftotext", str(file_path), "-"] + output = subprocess.run(command, capture_output=True, shell=False) + document = output.stdout.decode(errors="ignore") + pages = document.split("\f") + pages = pages[:-1] # the last page in the split is always empty. + return pages def _validate_language(self, text: str) -> bool: """ diff --git a/requirements.txt b/requirements.txt index aee6ff538..ac5b14444 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,5 +10,4 @@ elastic-apm tox coverage langdetect # for PDF conversions -PyMuPDF # for PDF conversions -# optional: sentence-transformers +# optional: sentence-transformers \ No newline at end of file