diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index ae0866fbb..d87fb7a22 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -240,7 +240,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}
 
       - name: Install Haystack
-        run: pip install .[dev,docstores]
+        run: pip install .[dev,preprocessing]
 
       - name: Run tests
         run: |
@@ -318,7 +318,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}
 
       - name: Install Haystack
-        run: pip install .[dev,sql]
+        run: pip install .[dev,sql,preprocessing]
 
       - name: Run tests
         run: |
@@ -404,7 +404,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}
 
       - name: Install Haystack
-        run: pip install .[dev,docstores]
+        run: pip install .[dev,opensearch,preprocessing]
 
       - name: Run tests
         run: |
@@ -482,7 +482,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}
 
       - name: Install Haystack
-        run: pip install .[dev]
+        run: pip install .[dev,preprocessing]
 
       - name: Run tests
         run: |
@@ -560,7 +560,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}
 
       - name: Install Haystack
-        run: pip install .[faiss,dev]
+        run: pip install .[faiss,dev,preprocessing]
 
       - name: Run tests
         run: |
@@ -648,7 +648,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}
 
       - name: Install Haystack
-        run: pip install .[dev,docstores]
+        run: pip install .[dev,weaviate,preprocessing]
 
       - name: Run tests
         run: |
@@ -726,7 +726,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}
 
       - name: Install Haystack
-        run: pip install .[dev,pinecone]
+        run: pip install .[dev,pinecone,preprocessing]
 
       - name: Run tests
         env:
@@ -805,7 +805,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}
 
       - name: Install Haystack
-        run: pip install .[dev]
+        run: pip install .[dev,preprocessing]
 
       - name: Run tests
         run: |
@@ -865,7 +865,6 @@ jobs:
         }
 
-
   integration-tests-promptnode:
     name: Integration / PromptNode / ${{ matrix.os }}
     needs:
@@ -883,7 +882,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}
 
       - name: Install Haystack
-        run: pip install .[dev]
+        run: pip install .[dev,preprocessing]
 
       - name: Run tests
         run: |
@@ -961,7 +960,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}
 
       - name: Install Haystack
-        run: pip install .[dev]
+        run: pip install .[dev,preprocessing]
 
       - name: Run tests
         run: |
diff --git a/docker/docker-bake.hcl b/docker/docker-bake.hcl
index 005939610..719cd5ab1 100644
--- a/docker/docker-bake.hcl
+++ b/docker/docker-bake.hcl
@@ -45,7 +45,7 @@ target "base-cpu" {
     build_image = "python:3.10-slim"
     base_image = "python:3.10-slim"
     haystack_version = "${HAYSTACK_VERSION}"
-    haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores,crawler,preprocessing,ocr,onnx,metrics,beir]"
+    haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores,crawler,preprocessing,file-conversion,ocr,onnx,metrics,beir]"
   }
   platforms = ["linux/amd64", "linux/arm64"]
 }
@@ -59,7 +59,7 @@ target "base-gpu" {
     build_image = "pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime"
     base_image = "pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime"
     haystack_version = "${HAYSTACK_VERSION}"
-    haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores-gpu,crawler,preprocessing,ocr,onnx-gpu,metrics]"
+    haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores-gpu,crawler,preprocessing,file-conversion,ocr,onnx-gpu,metrics]"
   }
   platforms = ["linux/amd64", "linux/arm64"]
 }
diff --git a/haystack/nodes/doc_language_classifier/langdetect.py b/haystack/nodes/doc_language_classifier/langdetect.py
index 625f2e58f..2711c7ca0 100644
--- a/haystack/nodes/doc_language_classifier/langdetect.py
+++ b/haystack/nodes/doc_language_classifier/langdetect.py
@@ -1,7 +1,6 @@
 import logging
 from typing import List, Optional
 
-from langdetect import LangDetectException, detect
 
 from haystack.nodes.base import Document
 from haystack.nodes.doc_language_classifier.base import BaseDocumentLanguageClassifier
@@ -9,6 +8,16 @@ from haystack.nodes.doc_language_classifier.base import BaseDocumentLanguageClas
 
 logger = logging.getLogger(__name__)
 
+try:
+    import langdetect
+except ImportError as exc:
+    logger.debug(
+        "langdetect could not be imported. "
+        "Run 'pip install farm-haystack[preprocessing]' or 'pip install langdetect' to fix this issue."
+    )
+    langdetect = None
+
+
 class LangdetectDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
     """
     A node based on the lightweight and fast [langdetect library](https://github.com/Mimino666/langdetect) for classifying the language of documents.
@@ -47,8 +56,14 @@ class LangdetectDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
     def __init__(self, route_by_language: bool = True, languages_to_route: Optional[List[str]] = None):
         """
         :param route_by_language: Sends Documents to a different output edge depending on their language.
-        :param languages_to_route: A list of languages in ISO code, each corresponding to a different output edge (see [langdetect` documentation](https://github.com/Mimino666/langdetect#languages)).
+        :param languages_to_route: A list of languages in ISO code, each corresponding to a different output edge (see
+            [langdetect documentation](https://github.com/Mimino666/langdetect#languages)).
         """
+        if not langdetect:
+            raise ImportError(
+                "langdetect could not be imported. "
+                "Run 'pip install farm-haystack[preprocessing]' or 'pip install langdetect' to fix this issue."
+            )
         super().__init__(route_by_language=route_by_language, languages_to_route=languages_to_route)
 
     def predict(self, documents: List[Document], batch_size: Optional[int] = None) -> List[Document]:
@@ -69,8 +84,8 @@ class LangdetectDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
         documents_with_language = []
         for document in documents:
             try:
-                language = detect(document.content)
-            except LangDetectException:
+                language = langdetect.detect(document.content)
+            except langdetect.LangDetectException:
                 logger.warning("Langdetect cannot detect the language of document: %s", document)
                 language = None
             document.meta["language"] = language
diff --git a/haystack/nodes/file_converter/docx.py b/haystack/nodes/file_converter/docx.py
index e0bdc5b22..f4d9d80e1 100644
--- a/haystack/nodes/file_converter/docx.py
+++ b/haystack/nodes/file_converter/docx.py
@@ -2,7 +2,6 @@ from typing import Dict, Optional, List
 import logging
 from pathlib import Path
 
-import docx
 
 from haystack.nodes.file_converter.base import BaseConverter
 from haystack.schema import Document
@@ -11,7 +10,36 @@ from haystack.schema import Document
 
 logger = logging.getLogger(__name__)
 
+try:
+    import docx
+except ImportError as exc:
+    logger.debug(
+        "docx could not be imported. "
+        "Run 'pip install farm-haystack[file-conversion]' or 'pip install python-docx' to fix this issue."
+    )
+    docx = None
+
+
 class DocxToTextConverter(BaseConverter):
+    def __init__(
+        self,
+        remove_numeric_tables: bool = False,
+        valid_languages: Optional[List[str]] = None,
+        id_hash_keys: Optional[List[str]] = None,
+        progress_bar: bool = True,
+    ):
+        if not docx:
+            raise ImportError(
+                "docx could not be imported. "
+                "Run 'pip install farm-haystack[file-conversion]' or 'pip install python-docx' to fix this issue."
+            )
+        super().__init__(
+            remove_numeric_tables=remove_numeric_tables,
+            valid_languages=valid_languages,
+            id_hash_keys=id_hash_keys,
+            progress_bar=progress_bar,
+        )
+
     def convert(
         self,
         file_path: Path,
diff --git a/haystack/nodes/file_converter/tika.py b/haystack/nodes/file_converter/tika.py
index bef057df9..eda41bd73 100644
--- a/haystack/nodes/file_converter/tika.py
+++ b/haystack/nodes/file_converter/tika.py
@@ -7,13 +7,24 @@ import subprocess
 from html.parser import HTMLParser
 
 import requests
-from tika import parser as tikaparser
 
 from haystack.nodes.file_converter.base import BaseConverter
 from haystack.schema import Document
 
 logger = logging.getLogger(__name__)
 
+
+try:
+    from tika import parser as tika_parser
+except ImportError as exc:
+    logger.debug(
+        "tika could not be imported. "
+        "Run 'pip install farm-haystack[file-conversion]' or 'pip install tika' to fix this issue."
+    )
+    tika_parser = None
+
+
 TIKA_CONTAINER_NAME = "tika"
@@ -96,6 +107,11 @@ class TikaConverter(BaseConverter):
             as a float, or a :ref:`(connect timeout, read timeout) <timeouts>` tuple. Defaults to 10 seconds.
         """
+        if not tika_parser:
+            raise ImportError(
+                "tika could not be imported. "
+                "Run 'pip install farm-haystack[file-conversion]' or 'pip install tika' to fix this issue."
+            )
         super().__init__(
             remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys
         )
@@ -146,7 +162,7 @@ class TikaConverter(BaseConverter):
         if id_hash_keys is None:
             id_hash_keys = self.id_hash_keys
 
-        parsed = tikaparser.from_file(file_path.as_posix(), self.tika_url, xmlContent=True)
+        parsed = tika_parser.from_file(file_path.as_posix(), self.tika_url, xmlContent=True)
         parser = TikaXHTMLParser()
         parser.feed(parsed["content"])
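With these guards, a missing converter dependency surfaces as an ImportError with an install hint when the node is created, rather than as a crash when Haystack itself is imported. A hedged usage sketch (assuming python-docx is not installed in the environment):

from haystack.nodes import DocxToTextConverter

try:
    converter = DocxToTextConverter()
except ImportError as error:
    # The message suggests 'pip install farm-haystack[file-conversion]'
    # or 'pip install python-docx'.
    print(error)
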
diff --git a/haystack/nodes/preprocessor/preprocessor.py b/haystack/nodes/preprocessor/preprocessor.py
index 935f60372..11f64e3ae 100644
--- a/haystack/nodes/preprocessor/preprocessor.py
+++ b/haystack/nodes/preprocessor/preprocessor.py
@@ -1,21 +1,16 @@
+from typing import List, Optional, Generator, Set, Union, Tuple, Dict, Literal
+
 import logging
 import re
 from copy import deepcopy
 from functools import partial, reduce
 from itertools import chain
-from typing import List, Optional, Generator, Set, Union, Tuple, Dict
-
-try:
-    from typing import Literal
-except ImportError:
-    from typing_extensions import Literal  # type: ignore
 import warnings
 from pathlib import Path
 from pickle import UnpicklingError
 
-import nltk
-from more_itertools import windowed
 from tqdm.auto import tqdm
+from more_itertools import windowed
 
 from haystack.nodes.preprocessor.base import BasePreProcessor
 from haystack.errors import HaystackError
@@ -25,6 +20,16 @@ from haystack.schema import Document
 
 logger = logging.getLogger(__name__)
 
+try:
+    import nltk
+except ImportError as exc:
+    logger.debug(
+        "nltk could not be imported. "
+        "Run 'pip install farm-haystack[preprocessing]' or 'pip install nltk' to fix this issue."
+    )
+    nltk = None
+
+
 iso639_to_nltk = {
     "ru": "russian",
     "sl": "slovene",
@@ -105,10 +110,12 @@ class PreProcessor(BasePreProcessor):
         super().__init__()
 
         try:
-            nltk.data.find("tokenizers/punkt")
+            if nltk:
+                nltk.data.find("tokenizers/punkt")
         except LookupError:
             try:
-                nltk.download("punkt")
+                if nltk:
+                    nltk.download("punkt")
             except FileExistsError as error:
                 logger.debug("NLTK punkt tokenizer seems to be already downloaded. Error message: %s", error)
                 pass
@@ -805,7 +812,12 @@ class PreProcessor(BasePreProcessor):
         sentences = sentence_tokenizer.tokenize(text)
         return sentences
 
-    def _load_sentence_tokenizer(self, language_name: Optional[str]) -> nltk.tokenize.punkt.PunktSentenceTokenizer:
+    def _load_sentence_tokenizer(self, language_name: Optional[str]) -> "nltk.tokenize.punkt.PunktSentenceTokenizer":
+        if not nltk:
+            raise ImportError(
+                "nltk could not be imported. "
+                "Run 'pip install farm-haystack[preprocessing]' or 'pip install nltk' to fix this issue."
+            )
         # Try to load a custom model from 'tokenizer_model_path'
         if self.tokenizer_model_folder is not None:
             tokenizer_model_path = Path(self.tokenizer_model_folder).absolute() / f"{self.language}.pickle"
diff --git a/pyproject.toml b/pyproject.toml
index d757ee6f6..c38a17471 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,7 +50,6 @@ dependencies = [
     "pydantic",
     "transformers[torch]==4.25.1",
     "protobuf<=3.20.2",  # same version they use in transformers[sentencepiece]
-    "nltk",
     "pandas",
     "rank_bm25",
     "scikit-learn>=1.0.0",  # TF-IDF, SklearnQueryClassifier and metrics
@@ -67,16 +66,11 @@ dependencies = [
     "huggingface-hub>=0.5.0",
     "tenacity",  # retry decorator
     "sseclient-py",  # server side events for OpenAI streaming
+    "more_itertools",  # utilities
 
     # Web Retriever
     "boilerpy3",
 
-    # Preprocessing
-    "more_itertools",  # for windowing
-    "python-docx",
-    "langdetect",  # for PDF conversions
-    "tika",  # Apache Tika (text & metadata extractor)
-
     # See haystack/nodes/retriever/_embedding_encoder.py, _SentenceTransformersEmbeddingEncoder
     "sentence-transformers>=2.2.0",
@@ -157,6 +151,12 @@ crawler = [
     "webdriver-manager",
 ]
 preprocessing = [
+    "nltk",
+    "langdetect",  # for language classification
+]
+file-conversion = [
+    "python-docx",
+    "tika",  # Apache Tika (text & metadata extractor)
     "beautifulsoup4",
     "markdown",
     "python-frontmatter",
@@ -224,11 +224,11 @@ formatting = [
 ]
 
 all = [
-    "farm-haystack[docstores,audio,crawler,preprocessing,pdf,ocr,ray,dev,onnx,beir,metrics]",
+    "farm-haystack[docstores,audio,crawler,preprocessing,file-conversion,pdf,ocr,ray,dev,onnx,beir,metrics]",
 ]
 all-gpu = [
     # beir is incompatible with faiss-gpu: https://github.com/beir-cellar/beir/issues/71
-    "farm-haystack[docstores-gpu,audio,crawler,preprocessing,pdf,ocr,ray,dev,onnx-gpu,metrics]",
+    "farm-haystack[docstores-gpu,audio,crawler,preprocessing,file-conversion,pdf,ocr,ray,dev,onnx-gpu,metrics]",
 ]
 
 [project.scripts]
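After this reorganization, nltk and langdetect are pulled in by farm-haystack[preprocessing], python-docx and tika move to the new farm-haystack[file-conversion] extra (next to beautifulsoup4, markdown, and python-frontmatter), and more_itertools becomes a core dependency. A small illustrative helper, not part of the patch, for checking which optional modules are importable locally:

import importlib.util

# Importable module names per the lazy imports above; the grouping follows
# the pyproject layout introduced in this diff.
extras = {
    "preprocessing": ["nltk", "langdetect"],
    "file-conversion": ["docx", "tika"],
}
for extra, modules in extras.items():
    missing = [name for name in modules if importlib.util.find_spec(name) is None]
    if missing:
        print(f"pip install 'farm-haystack[{extra}]'  # would provide: {', '.join(missing)}")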