mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-10-30 17:29:29 +00:00
Remove PyMuPDF dependency (#148)
This commit is contained in:
parent
997aafcf62
commit
42f56672b4
@ -6,7 +6,6 @@ from itertools import chain
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Optional, Tuple, Generator, Set
|
from typing import List, Optional, Tuple, Generator, Set
|
||||||
|
|
||||||
import fitz
|
|
||||||
import langdetect
|
import langdetect
|
||||||
|
|
||||||
from haystack.indexing.file_converters.base import BaseConverter
|
from haystack.indexing.file_converters.base import BaseConverter
|
||||||
@ -67,10 +66,10 @@ class PDFToTextConverter(BaseConverter):
|
|||||||
|
|
||||||
def extract_pages(self, file_path: Path) -> List[str]:
|
def extract_pages(self, file_path: Path) -> List[str]:
|
||||||
|
|
||||||
page_count = fitz.open(file_path).pageCount
|
pages = self._read_pdf(file_path, layout=False)
|
||||||
|
|
||||||
pages = []
|
cleaned_pages = []
|
||||||
for page_number in range(1, page_count + 1):
|
for page in pages:
|
||||||
# pdftotext tool provides an option to retain the original physical layout of a PDF page. This behaviour
|
# pdftotext tool provides an option to retain the original physical layout of a PDF page. This behaviour
|
||||||
# can be toggled by using the layout param.
|
# can be toggled by using the layout param.
|
||||||
# layout=True
|
# layout=True
|
||||||
@ -81,7 +80,6 @@ class PDFToTextConverter(BaseConverter):
|
|||||||
# - cells of tables gets split across line
|
# - cells of tables gets split across line
|
||||||
#
|
#
|
||||||
# Here, as a "safe" default, layout is turned off.
|
# Here, as a "safe" default, layout is turned off.
|
||||||
page = self._extract_page(file_path, page_number, layout=False)
|
|
||||||
lines = page.splitlines()
|
lines = page.splitlines()
|
||||||
cleaned_lines = []
|
cleaned_lines = []
|
||||||
for line in lines:
|
for line in lines:
|
||||||
@ -104,40 +102,41 @@ class PDFToTextConverter(BaseConverter):
|
|||||||
if self.remove_empty_lines:
|
if self.remove_empty_lines:
|
||||||
page = re.sub(r"\n\n+", "\n\n", page)
|
page = re.sub(r"\n\n+", "\n\n", page)
|
||||||
|
|
||||||
pages.append(page)
|
cleaned_pages.append(page)
|
||||||
page_number += 1
|
|
||||||
|
|
||||||
document_text = "".join(pages)
|
if self.valid_languages:
|
||||||
if not self._validate_language(document_text):
|
document_text = "".join(cleaned_pages)
|
||||||
logger.warning(
|
if not self._validate_language(document_text):
|
||||||
f"The language for {file_path} is not one of {self.valid_languages}. The file may not have "
|
logger.warning(
|
||||||
f"been decoded in the correct text format."
|
f"The language for {file_path} is not one of {self.valid_languages}. The file may not have "
|
||||||
)
|
f"been decoded in the correct text format."
|
||||||
|
)
|
||||||
|
|
||||||
if self.remove_header_footer:
|
if self.remove_header_footer:
|
||||||
pages, header, footer = self.find_and_remove_header_footer(
|
cleaned_pages, header, footer = self.find_and_remove_header_footer(
|
||||||
pages, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
|
cleaned_pages, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
|
||||||
)
|
)
|
||||||
logger.info(f"Removed header '{header}' and footer {footer} in {file_path}")
|
logger.info(f"Removed header '{header}' and footer {footer} in {file_path}")
|
||||||
|
|
||||||
return pages
|
return cleaned_pages
|
||||||
|
|
||||||
def _extract_page(self, file_path: Path, page_number: int, layout: bool) -> str:
|
def _read_pdf(self, file_path: Path, layout: bool) -> List[str]:
|
||||||
"""
|
"""
|
||||||
Extract a page from the pdf file at file_path.
|
Extract pages from the pdf file at file_path.
|
||||||
|
|
||||||
:param file_path: path of the pdf file
|
:param file_path: path of the pdf file
|
||||||
:param page_number: page number to extract(starting from 1)
|
|
||||||
:param layout: whether to retain the original physical layout for a page. If disabled, PDF pages are read in
|
:param layout: whether to retain the original physical layout for a page. If disabled, PDF pages are read in
|
||||||
the content stream order.
|
the content stream order.
|
||||||
"""
|
"""
|
||||||
if layout:
|
if layout:
|
||||||
command = ["pdftotext", "-layout", "-f", str(page_number), "-l", str(page_number), str(file_path), "-"]
|
command = ["pdftotext", "-layout", str(file_path), "-"]
|
||||||
else:
|
else:
|
||||||
command = ["pdftotext", "-f", str(page_number), "-l", str(page_number), str(file_path), "-"]
|
command = ["pdftotext", str(file_path), "-"]
|
||||||
output_page = subprocess.run(command, capture_output=True, shell=False)
|
output = subprocess.run(command, capture_output=True, shell=False)
|
||||||
page = output_page.stdout.decode(errors="ignore")
|
document = output.stdout.decode(errors="ignore")
|
||||||
return page
|
pages = document.split("\f")
|
||||||
|
pages = pages[:-1] # the last page in the split is always empty.
|
||||||
|
return pages
|
||||||
|
|
||||||
def _validate_language(self, text: str) -> bool:
|
def _validate_language(self, text: str) -> bool:
|
||||||
"""
|
"""
|
||||||
|
|||||||
@ -10,5 +10,4 @@ elastic-apm
|
|||||||
tox
|
tox
|
||||||
coverage
|
coverage
|
||||||
langdetect # for PDF conversions
|
langdetect # for PDF conversions
|
||||||
PyMuPDF # for PDF conversions
|
|
||||||
# optional: sentence-transformers
|
# optional: sentence-transformers
|
||||||
Loading…
x
Reference in New Issue
Block a user