feat: introduce generalimport (#4662)

* introduce generalimport

* pylint

* fix optional deps typing for schema

* leftover

* typo

* typing with faiss

* make Base generation optional too

* handle sqlalchemy

* (almost) all imports are optional

* TO REMOVE hijacking CI for tests

* some deps are actually needed

* get feature branch in CI

* get feature branch in CI

* fix array_equal

* pylint

* pandas also required

* improve imports.yml

* fix SquadData

* fix SquadData again

* generalimport imports list

* Update haystack/utils/openai_utils.py

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>

* Update haystack/utils/openai_utils.py

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>

* review feedback

* remove todos

* reference main release

* pylint

* circular import

* review feedback

* move is_imported in init

* pylint

---------

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
ZanSara 2023-05-08 15:20:10 +02:00 committed by GitHub
parent 5b2ef2afd6
commit 28260c5c3f
14 changed files with 252 additions and 135 deletions

View File

@@ -37,5 +37,5 @@ jobs:
- name: Install Haystack with no extras
run: pip install .
- - name: Try to import
+ - name: Import Haystack
run: python -c 'import haystack'
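Note: the step above can be reproduced outside CI. A minimal local equivalent (a sketch, not part of the commit; run from the repository root):

import subprocess
import sys

# Install Haystack with no extras, then verify that the bare import works,
# i.e. that no optional dependency is needed just to import the package.
subprocess.run([sys.executable, "-m", "pip", "install", "."], check=True)
subprocess.run([sys.executable, "-c", "import haystack"], check=True)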

View File

@@ -1,21 +1,94 @@
- # pylint: disable=wrong-import-position,wrong-import-order
+ # pylint: disable=wrong-import-position
# Logging is not configured here on purpose, see https://github.com/deepset-ai/haystack/issues/2485
from typing import Union
from types import ModuleType
try:
import sys
from importlib import metadata
except (ModuleNotFoundError, ImportError):
# Python <= 3.7
import importlib_metadata as metadata # type: ignore
__version__: str = str(metadata.version("farm-haystack"))
from generalimport import generalimport, MissingOptionalDependency, FakeModule
# Logging is not configured here on purpose, see https://github.com/deepset-ai/haystack/issues/2485
import logging
generalimport(
# "pydantic", # Required for all dataclasses
# "tenacity", # Probably needed because it's a decorator, to be evaluated
# "pandas",
"aiorwlock",
"azure",
"beautifulsoup4",
"beir",
"boilerpy3",
"canals",
"dill",
"docx",
"elasticsearch",
"events",
"faiss",
"fitz",
"frontmatter",
"huggingface_hub",
"jsonschema",
"langdetect",
"magic",
"markdown",
"mlflow",
"mmh3",
"more_itertools",
"networkx",
"nltk",
"numpy",
"onnxruntime",
"onnxruntime_tools",
"opensearchpy",
"pdf2image",
"PIL",
"pinecone",
"posthog",
"protobuf",
"psycopg2",
"pymilvus",
"pytesseract",
"quantulum3",
"rank_bm25",
"rapidfuzz",
"ray",
"rdflib",
"requests",
"scipy",
"selenium",
"sentence_transformers",
"seqeval",
"sklearn",
"SPARQLWrapper",
"sqlalchemy",
"sseclient",
"tenacity",
"tika",
"tiktoken",
"tokenizers",
"torch",
"tqdm",
"transformers",
"weaviate",
"webdriver_manager",
"whisper",
"yaml",
)
# TODO: remove this function once this PR is merged and released by generalimport:
# https://github.com/ManderaGeneral/generalimport/pull/25
def is_imported(module_name: str) -> bool:
"""
Returns True if the module was actually imported, and False if generalimport mocked it.
"""
module = sys.modules.get(module_name)
try:
return bool(module) and not isinstance(module, FakeModule)
except MissingOptionalDependency:
# isinstance() raises MissingOptionalDependency: fake module
pass
return False
import pandas as pd
from haystack.schema import Document, Answer, Label, MultiLabel, Span, EvaluationResult, TableCell
from haystack.nodes.base import BaseComponent
@@ -23,5 +96,6 @@ from haystack.pipelines.base import Pipeline
from haystack.environment import set_pytorch_secure_model_loading
pd.options.display.max_colwidth = 80
# Enables torch's secure model loading through setting an env var.
# Does not use torch.
set_pytorch_secure_model_loading()
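Note: generalimport works by installing an import hook: any listed module that is not installed is replaced with a FakeModule at import time, and MissingOptionalDependency is raised only when the fake is first used. A minimal sketch of that behavior (the module name is hypothetical, not a real Haystack dependency):

from generalimport import generalimport, MissingOptionalDependency

generalimport("some_missing_extra")  # hypothetical optional dependency

import some_missing_extra  # no error: a FakeModule stands in for the package

try:
    some_missing_extra.do_something()  # the first real use raises
except MissingOptionalDependency as exc:
    print(f"optional dependency required only at this point: {exc}")

This is also why the is_imported() helper above wraps its isinstance(module, FakeModule) check in a try block: even that check can raise for a faked module.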

View File

@@ -46,7 +46,7 @@ class FAISSDocumentStore(SQLDocumentStore):
vector_dim: Optional[int] = None,
embedding_dim: int = 768,
faiss_index_factory_str: str = "Flat",
- faiss_index: Optional[faiss.swigfaiss.Index] = None,
+ faiss_index: Optional["faiss.swigfaiss.Index"] = None,
return_embedding: bool = False,
index: str = "document",
similarity: str = "dot_product",
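Note: quoting the annotation turns it into a string that is never evaluated at runtime, so defining this signature no longer requires faiss to be importable; type checkers still resolve it. The same effect, written with an explicit TYPE_CHECKING guard (a sketch; the function name is hypothetical):

from typing import TYPE_CHECKING, Optional

if TYPE_CHECKING:
    # Evaluated only by static type checkers, never at runtime.
    import faiss

def build_store(faiss_index: Optional["faiss.swigfaiss.Index"] = None) -> None:
    # The string annotation is not resolved when this function is defined,
    # so a missing (or faked) faiss cannot break module import.
    ...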

View File

@@ -58,7 +58,7 @@ class PineconeDocumentStore(BaseDocumentStore):
self,
api_key: str,
environment: str = "us-west1-gcp",
- pinecone_index: Optional[pinecone.Index] = None,
+ pinecone_index: Optional["pinecone.Index"] = None,
embedding_dim: int = 768,
return_embedding: bool = False,
index: str = "document",

View File

@@ -27,19 +27,30 @@ try:
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship, sessionmaker, aliased
from sqlalchemy.sql import case, null
- except (ImportError, ModuleNotFoundError) as ie:
-     from haystack.utils.import_utils import _optional_component_not_installed
-     _optional_component_not_installed(__name__, "sql", ie)
+ from haystack import is_imported
from haystack.schema import Document, Label, Answer
from haystack.document_stores.base import BaseDocumentStore, FilterType
from haystack.document_stores.filter_utils import LogicalFilterClause
- logger = logging.getLogger(__name__)
- Base = declarative_base()  # type: Any
+ if not is_imported("sqlalchemy"):
+     Base = object
+     ArrayType = object
+     ORMBase = object
+     DocumentORM = object
+     MetaDocumentORM = object
+     LabelORM = object
+     MetaLabelORM = object
+ else:
+     Base = declarative_base()  # type: Any
class ArrayType(TypeDecorator):
impl = String
@@ -53,7 +64,6 @@ class ArrayType(TypeDecorator):
return json.loads(value)
return value
class ORMBase(Base):
__abstract__ = True
@@ -61,7 +71,6 @@ class ORMBase(Base):
created_at = Column(DateTime, server_default=func.now())
updated_at = Column(DateTime, server_default=func.now(), server_onupdate=func.now())
class DocumentORM(ORMBase):
__tablename__ = "document"
@@ -75,7 +84,6 @@ class DocumentORM(ORMBase):
__table_args__ = (UniqueConstraint("index", "vector_id", name="index_vector_id_uc"),)
class MetaDocumentORM(ORMBase):
__tablename__ = "meta_document"
@@ -87,12 +95,14 @@ class MetaDocumentORM(ORMBase):
document_index = Column(String(100), nullable=False, index=True)
__table_args__ = ( # type: ignore
ForeignKeyConstraint(
- [document_id, document_index], [DocumentORM.id, DocumentORM.index], ondelete="CASCADE", onupdate="CASCADE"
+ [document_id, document_index],
+ [DocumentORM.id, DocumentORM.index],
+ ondelete="CASCADE",
+ onupdate="CASCADE",
),
{},
)
class LabelORM(ORMBase):
__tablename__ = "label"
@@ -108,7 +118,6 @@ class LabelORM(ORMBase):
meta = relationship("MetaLabelORM", back_populates="labels", lazy="joined")
class MetaLabelORM(ORMBase):
__tablename__ = "meta_label"
@@ -126,6 +135,9 @@ class MetaLabelORM(ORMBase):
)
+ logger = logging.getLogger(__name__)
class SQLDocumentStore(BaseDocumentStore):
def __init__(
self,
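Note: the stubs exist because the ORM classes subclass Base when the module is imported; with sqlalchemy faked, the class statements themselves would raise. Binding the names to plain object keeps the module importable and moves the failure to the first real use. The pattern in isolation (a sketch, using the is_imported helper defined earlier in this commit):

from haystack import is_imported

if not is_imported("sqlalchemy"):
    # A class statement needs a real base class, so bind harmless stubs;
    # the faked sqlalchemy raises at first actual use instead.
    Base = object
    DocumentORM = object
else:
    from sqlalchemy import Column, String
    from sqlalchemy.ext.declarative import declarative_base

    Base = declarative_base()

    class DocumentORM(Base):  # defined only when sqlalchemy is really installed
        __tablename__ = "document"
        id = Column(String(100), primary_key=True)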

View File

@@ -41,7 +41,7 @@ class BiAdaptiveModel(nn.Module):
language_model2: LanguageModel,
prediction_heads: List[PredictionHead],
embeds_dropout_prob: float = 0.1,
- device: torch.device = torch.device("cuda"),
+ device: Optional[torch.device] = None,
lm1_output_types: Optional[Union[str, List[str]]] = None,
lm2_output_types: Optional[Union[str, List[str]]] = None,
loss_aggregation_fn: Optional[Callable] = None,
@@ -74,6 +74,9 @@
Note: The loss at this stage is per sample, i.e one tensor of
shape (batchsize) per prediction head.
"""
if not device:
device = torch.device("cuda")
if lm1_output_types is None:
lm1_output_types = ["per_sequence"]
if lm2_output_types is None:
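Note: default argument values are evaluated once, when the def statement runs, so device: torch.device = torch.device("cuda") touched torch as soon as the class body executed. Defaulting to None and resolving inside the method defers that to call time. The pattern, distilled (a sketch; assumes torch is installed when actually run):

from typing import Optional

import torch

def resolve_device(device: Optional[torch.device] = None) -> torch.device:
    # torch.device("cuda") now runs only when this function is called,
    # not when the function (or its module) is defined.
    if not device:
        device = torch.device("cuda")
    return device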

View File

@@ -29,6 +29,7 @@ from transformers import PreTrainedTokenizer, RobertaTokenizer, AutoConfig, Auto
from transformers.models.auto.feature_extraction_auto import FEATURE_EXTRACTOR_MAPPING_NAMES
from transformers.models.auto.tokenization_auto import TOKENIZER_MAPPING_NAMES
from haystack import is_imported
from haystack.errors import ModelingError
from haystack.modeling.data_handler.samples import SampleBasket
@@ -40,12 +41,16 @@ logger = logging.getLogger(__name__)
SPECIAL_TOKENIZER_CHARS = r"^(##|Ġ|▁)"
if not is_imported("transformers"):
TOKENIZER_MAPPING_NAMES = {}
FEATURE_EXTRACTOR_MAPPING_NAMES = {}
FEATURE_EXTRACTORS = {
**{key: AutoTokenizer for key in TOKENIZER_MAPPING_NAMES.keys()},
**{key: AutoFeatureExtractor for key in FEATURE_EXTRACTOR_MAPPING_NAMES.keys()},
}
DEFAULT_EXTRACTION_PARAMS = {
AutoTokenizer: {
"max_length": 256,

View File

@@ -43,7 +43,7 @@ class TriAdaptiveModel(nn.Module):
language_model3: LanguageModel,
prediction_heads: List[PredictionHead],
embeds_dropout_prob: float = 0.1,
- device: torch.device = torch.device("cuda"),
+ device: Optional[torch.device] = None,
lm1_output_types: Optional[Union[str, List[str]]] = None,
lm2_output_types: Optional[Union[str, List[str]]] = None,
lm3_output_types: Optional[Union[str, List[str]]] = None,
@@ -83,6 +83,9 @@
Note: The loss at this stage is per sample, i.e one tensor of
shape (batchsize) per prediction head.
"""
if not device:
device = torch.device("cuda")
if lm1_output_types is None:
lm1_output_types = ["per_sequence"]
if lm2_output_types is None:

View File

@@ -55,6 +55,10 @@ def field_singleton_schema(
known_models: TypeModelSet,
) -> Tuple[Dict[str, Any], Dict[str, Any], Set[str]]:
try:
# Typing with optional dependencies is really tricky. Let's just use Any for now. To be fixed.
if isinstance(field.type_, ForwardRef):
logger.debug(field.type_)
field.type_ = Any
return _field_singleton_schema(
field,
by_alias=by_alias,
@@ -211,7 +215,10 @@ def create_schema_for_node_class(node_class: Type[BaseComponent]) -> Tuple[Dict[
# Create the model with Pydantic and extract the schema
model = create_model(f"{node_name}ComponentParams", __config__=Config, **param_fields_kwargs)
try:
model.update_forward_refs(**model.__dict__)
except NameError as exc:
logger.debug("%s", str(exc))
params_schema = model.schema()
# Pydantic v1 patch to generate JSON schemas including Optional fields
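Note: both changes deal with types that cannot be resolved when their dependency is missing: unresolved ForwardRefs are downgraded to Any before Pydantic processes them, and the NameError raised by update_forward_refs() for a missing name is logged instead of being fatal. A minimal Pydantic v1 sketch of the second case (model and type names are hypothetical):

import logging

from pydantic import BaseModel

logger = logging.getLogger(__name__)

class NodeParams(BaseModel):  # hypothetical stand-in for a generated params model
    # Forward reference to a type that lives in an optional dependency.
    store: "OptionalStoreType" = None  # type: ignore[name-defined]

try:
    # Raises NameError when OptionalStoreType cannot be resolved; log it and
    # keep the unresolved reference instead of failing schema generation.
    NodeParams.update_forward_refs()
except NameError as exc:
    logger.debug("%s", str(exc))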

View File

@@ -20,7 +20,9 @@ from dataclasses import asdict
import mmh3
import numpy as np
from numpy import ndarray
import pandas as pd
from pandas import DataFrame
from pydantic import BaseConfig, Field
from pydantic.json import pydantic_encoder
@@ -29,12 +31,19 @@ from pydantic.json import pydantic_encoder
# See #1598 for the reasons behind this choice & performance considerations
from pydantic.dataclasses import dataclass
from haystack import is_imported
logger = logging.getLogger(__name__)
if not is_imported("pandas"):
DataFrame = object
BaseConfig.arbitrary_types_allowed = True
#: Types of content_types supported
ContentTypes = Literal["text", "table", "image", "audio"]
FilterType = Dict[str, Union[Dict[str, Any], List[Any], str, int, float, bool]]
@@ -43,12 +52,12 @@ FilterType = Dict[str, Union[Dict[str, Any], List[Any], str, int, float, bool]]
@dataclass
class Document:
id: str
- content: Union[str, pd.DataFrame]
+ content: Union[str, DataFrame]
content_type: ContentTypes = Field(default="text")
meta: Dict[str, Any] = Field(default={})
id_hash_keys: List[str] = Field(default=["content"])
score: Optional[float] = None
- embedding: Optional[np.ndarray] = None
+ embedding: Optional[ndarray] = None
# We use a custom init here as we want some custom logic. The annotations above are however still needed in order
# to use some dataclass magic like "asdict()". See https://www.python.org/dev/peps/pep-0557/#custom-init-method
@@ -56,12 +65,12 @@ class Document:
# don't need to passed by the user in init and are rather initialized automatically in the init
def __init__(
self,
- content: Union[str, pd.DataFrame],
+ content: Union[str, DataFrame],
content_type: ContentTypes = "text",
id: Optional[str] = None,
score: Optional[float] = None,
meta: Optional[Dict[str, Any]] = None,
- embedding: Optional[np.ndarray] = None,
+ embedding: Optional[ndarray] = None,
id_hash_keys: Optional[List[str]] = None,
):
"""
@@ -184,7 +193,7 @@ class Document:
continue
if k == "content":
# Convert pd.DataFrame to list of rows for serialization
- if self.content_type == "table" and isinstance(self.content, pd.DataFrame):
+ if self.content_type == "table" and isinstance(self.content, DataFrame):
v = dataframe_to_list(self.content)
k = k if k not in inv_field_map else inv_field_map[k]
_doc[k] = v
@@ -230,7 +239,7 @@ class Document:
k = field_map[k]
_new_doc[k] = v
- # Convert list of rows to pd.DataFrame
+ # Convert list of rows to DataFrame
if _new_doc.get("content_type", None) == "table" and isinstance(_new_doc["content"], list):
_new_doc["content"] = dataframe_from_list(_new_doc["content"])
@@ -358,7 +367,7 @@ class Answer:
answer: str
type: Literal["generative", "extractive", "other"] = "extractive"
score: Optional[float] = None
- context: Optional[Union[str, pd.DataFrame]] = None
+ context: Optional[Union[str, DataFrame]] = None
offsets_in_document: Optional[Union[List[Span], List[TableCell]]] = None
offsets_in_context: Optional[Union[List[Span], List[TableCell]]] = None
document_ids: Optional[List[str]] = None
@@ -832,7 +841,7 @@ def dataframe_from_list(list_df: List[List]) -> pd.DataFrame:
class EvaluationResult:
- def __init__(self, node_results: Optional[Dict[str, pd.DataFrame]] = None) -> None:
+ def __init__(self, node_results: Optional[Dict[str, DataFrame]] = None) -> None:
"""
A convenience class to store, pass, and interact with results of a pipeline evaluation run (for example `pipeline.eval()`).
Detailed results are stored as one dataframe per node. This class makes them more accessible and provides
@@ -902,7 +911,7 @@ class EvaluationResult:
:param node_results: The evaluation Dataframes per pipeline node.
"""
- self.node_results: Dict[str, pd.DataFrame] = {} if node_results is None else node_results
+ self.node_results: Dict[str, DataFrame] = {} if node_results is None else node_results
def __getitem__(self, key: str):
return self.node_results.__getitem__(key)
@@ -910,7 +919,7 @@ class EvaluationResult:
def __delitem__(self, key: str):
self.node_results.__delitem__(key)
- def __setitem__(self, key: str, value: pd.DataFrame):
+ def __setitem__(self, key: str, value: DataFrame):
self.node_results.__setitem__(key, value)
def __contains__(self, key: str):
@@ -919,7 +928,7 @@ class EvaluationResult:
def __len__(self):
return self.node_results.__len__()
- def append(self, key: str, value: pd.DataFrame):
+ def append(self, key: str, value: DataFrame):
if value is not None and len(value) > 0:
if key in self.node_results:
self.node_results[key] = pd.concat([self.node_results[key], value])
@@ -1210,7 +1219,7 @@ class EvaluationResult:
def _calculate_node_metrics(
self,
- df: pd.DataFrame,
+ df: DataFrame,
simulated_top_k_reader: int = -1,
simulated_top_k_retriever: int = -1,
document_scope: Literal[
@@ -1244,7 +1253,7 @@ class EvaluationResult:
return {**answer_metrics, **document_metrics}
- def _filter_eval_mode(self, df: pd.DataFrame, eval_mode: str) -> pd.DataFrame:
+ def _filter_eval_mode(self, df: DataFrame, eval_mode: str) -> DataFrame:
if "eval_mode" in df.columns:
df = df[df["eval_mode"] == eval_mode]
else:
@@ -1253,7 +1262,7 @@ class EvaluationResult:
def _calculate_answer_metrics(
self,
- df: pd.DataFrame,
+ df: DataFrame,
simulated_top_k_reader: int = -1,
simulated_top_k_retriever: int = -1,
answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
@@ -1275,11 +1284,11 @@ class EvaluationResult:
def _build_answer_metrics_df(
self,
- answers: pd.DataFrame,
+ answers: DataFrame,
simulated_top_k_reader: int = -1,
simulated_top_k_retriever: int = -1,
answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
- ) -> pd.DataFrame:
+ ) -> DataFrame:
"""
Builds a dataframe containing answer metrics (columns) per multilabel (index).
Answer metrics are:
@@ -1335,7 +1344,7 @@ class EvaluationResult:
}
df_records.append(query_metrics)
- metrics_df = pd.DataFrame.from_records(df_records, index=multilabel_ids)
+ metrics_df = DataFrame.from_records(df_records, index=multilabel_ids)
return metrics_df
def _get_documents_df(self):
@@ -1350,7 +1359,7 @@ class EvaluationResult:
def _calculate_document_metrics(
self,
- df: pd.DataFrame,
+ df: DataFrame,
simulated_top_k_retriever: int = -1,
document_relevance_criterion: Literal[
"document_id",
@@ -1378,7 +1387,7 @@ class EvaluationResult:
def _build_document_metrics_df(
self,
- documents: pd.DataFrame,
+ documents: DataFrame,
simulated_top_k_retriever: int = -1,
document_relevance_criterion: Literal[
"document_id",
@@ -1391,7 +1400,7 @@ class EvaluationResult:
"document_id_and_context_and_answer",
"document_id_or_answer",
] = "document_id_or_answer",
- ) -> pd.DataFrame:
+ ) -> DataFrame:
"""
Builds a dataframe containing document metrics (columns) per pair of query and gold document ids (index).
Document metrics are:
@@ -1539,7 +1548,7 @@ class EvaluationResult:
}
)
- metrics_df = pd.DataFrame.from_records(metrics, index=documents["multilabel_id"].unique())
+ metrics_df = DataFrame.from_records(metrics, index=documents["multilabel_id"].unique())
return metrics_df
def save(self, out_dir: Union[str, Path], **to_csv_kwargs):
@@ -1548,8 +1557,8 @@ class EvaluationResult:
The result of each node is saved in a separate csv with file name {node_name}.csv to the out_dir folder.
:param out_dir: Path to the target folder the csvs will be saved.
- :param to_csv_kwargs: kwargs to be passed to pd.DataFrame.to_csv(). See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html.
- This method uses different default values than pd.DataFrame.to_csv() for the following parameters:
+ :param to_csv_kwargs: kwargs to be passed to DataFrame.to_csv(). See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html.
+ This method uses different default values than DataFrame.to_csv() for the following parameters:
index=False, quoting=csv.QUOTE_NONNUMERIC (to avoid problems with \r chars)
"""
out_dir = out_dir if isinstance(out_dir, Path) else Path(out_dir)
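Note: importing DataFrame and ndarray by name, rather than using the pd./np. prefixes, is what makes the fallback possible: one rebinding to object keeps every annotation in this module valid when pandas is absent. In isolation (a sketch; the helper function is hypothetical):

from typing import Union

from haystack import is_imported
from pandas import DataFrame  # a fake attribute when pandas was faked by generalimport

if not is_imported("pandas"):
    # Rebind to a harmless stub so annotations such as Union[str, DataFrame]
    # still evaluate at class-creation time. isinstance(x, object) is then
    # trivially True, which is why schema.py gates such checks on
    # content_type == "table".
    DataFrame = object  # type: ignore[assignment, misc]

def content_kind(content: Union[str, DataFrame]) -> str:
    return "text" if isinstance(content, str) else "table"

The same rename is why pd.DataFrame.from_records became DataFrame.from_records throughout the evaluation code above.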

View File

@@ -9,6 +9,7 @@ from pathlib import Path
from typing import Optional, Dict, Union, Tuple, List
import requests
from haystack.errors import DatasetsError
from haystack.schema import Document

View File

@@ -6,7 +6,7 @@ import sys
import json
from typing import Dict, Union, Tuple, Optional, List
import requests
- from tenacity import retry, retry_if_exception_type, wait_exponential, stop_after_attempt
+ import tenacity
from transformers import GPT2TokenizerFast
from haystack.errors import OpenAIError, OpenAIRateLimitError, OpenAIUnauthorizedError
@@ -127,10 +127,10 @@ def _openai_text_completion_tokenization_details(model_name: str):
return tokenizer_name, max_tokens_limit
- @retry(
-     retry=retry_if_exception_type(OpenAIRateLimitError),
-     wait=wait_exponential(multiplier=OPENAI_BACKOFF),
-     stop=stop_after_attempt(OPENAI_MAX_RETRIES),
+ @tenacity.retry(
+     retry=tenacity.retry_if_exception_type(OpenAIRateLimitError),
+     wait=tenacity.wait_exponential(multiplier=OPENAI_BACKOFF),
+     stop=tenacity.stop_after_attempt(OPENAI_MAX_RETRIES),
)
def openai_request(
url: str,
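Note: the decorator now goes through the module object (tenacity.retry) instead of four individually imported names, as suggested in the review edits listed above; the behavior is unchanged. The retry policy itself, as a runnable sketch with hypothetical stand-in values:

import tenacity

class RateLimitError(Exception):  # stand-in for haystack.errors.OpenAIRateLimitError
    pass

calls = {"n": 0}

@tenacity.retry(
    retry=tenacity.retry_if_exception_type(RateLimitError),
    wait=tenacity.wait_exponential(multiplier=0.01),  # stand-in for OPENAI_BACKOFF
    stop=tenacity.stop_after_attempt(5),              # stand-in for OPENAI_MAX_RETRIES
)
def flaky_request() -> str:
    calls["n"] += 1
    if calls["n"] < 3:
        raise RateLimitError("rate limited, try again")
    return "ok"

print(flaky_request(), "after", calls["n"], "calls")  # -> ok after 3 calls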

View File

@@ -7,6 +7,7 @@ import pandas as pd
from tqdm.auto import tqdm
import mmh3
from haystack import is_imported
from haystack.schema import Document, Label, Answer
from haystack.modeling.data_handler.processor import _read_squad_file
@@ -14,6 +15,7 @@ from haystack.modeling.data_handler.processor import _read_squad_file
logger = logging.getLogger(__name__)
if is_imported("pandas") and is_imported("tqdm"):
tqdm.pandas()
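Note: tqdm.pandas() monkey-patches pandas at import time (it registers progress_apply on Series and DataFrame), which is why it is now gated on both libraries being genuinely imported. What it enables, as a tiny sketch:

import pandas as pd
from tqdm.auto import tqdm

tqdm.pandas()  # registers DataFrame.progress_apply / Series.progress_apply

df = pd.DataFrame({"x": range(5)})
squares = df["x"].progress_apply(lambda v: v * v)  # .apply plus a progress bar
print(squares.tolist())  # [0, 1, 4, 9, 16]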

View File

@@ -53,6 +53,7 @@ dependencies = [
"pandas",
"rank_bm25",
"scikit-learn>=1.0.0", # TF-IDF, SklearnQueryClassifier and metrics
"generalimport", # Optional imports
# Utils
"dill", # pickle extension for (de-)serialization