diff --git a/e2e/pipelines/test_evaluation_pipeline.py b/e2e/pipelines/test_evaluation_pipeline.py index 4500f3ee1..97c5a981d 100644 --- a/e2e/pipelines/test_evaluation_pipeline.py +++ b/e2e/pipelines/test_evaluation_pipeline.py @@ -36,14 +36,14 @@ def indexing_pipeline(documents: List[Document]): doc_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP) doc_embedder = SentenceTransformersDocumentEmbedder(model=EMBEDDINGS_MODEL, progress_bar=False) ingestion_pipe = Pipeline() - ingestion_pipe.add_component(instance=doc_embedder, name="doc_embedder") # type: ignore - ingestion_pipe.add_component(instance=doc_writer, name="doc_writer") # type: ignore + ingestion_pipe.add_component(instance=doc_embedder, name="doc_embedder") + ingestion_pipe.add_component(instance=doc_writer, name="doc_writer") ingestion_pipe.connect("doc_embedder.documents", "doc_writer.documents") ingestion_pipe.run({"doc_embedder": {"documents": documents}}) return document_store -def rag_pipeline(document_store: InMemoryDocumentStore, top_k: int): # type: ignore +def rag_pipeline(document_store: InMemoryDocumentStore, top_k: int): """RAG pipeline""" template = [ ChatMessage.from_system( @@ -59,11 +59,11 @@ def rag_pipeline(document_store: InMemoryDocumentStore, top_k: int): # type: ig ), ] rag = Pipeline() - rag.add_component("embedder", SentenceTransformersTextEmbedder(model=EMBEDDINGS_MODEL, progress_bar=False)) # type: ignore - rag.add_component("retriever", InMemoryEmbeddingRetriever(document_store, top_k=top_k)) # type: ignore - rag.add_component("prompt_builder", ChatPromptBuilder(template=template)) # type: ignore - rag.add_component("generator", OpenAIChatGenerator(model="gpt-4o-mini")) # type: ignore - rag.add_component("answer_builder", AnswerBuilder()) # type: ignore + rag.add_component("embedder", SentenceTransformersTextEmbedder(model=EMBEDDINGS_MODEL, progress_bar=False)) + rag.add_component("retriever", InMemoryEmbeddingRetriever(document_store, 
top_k=top_k)) + rag.add_component("prompt_builder", ChatPromptBuilder(template=template)) + rag.add_component("generator", OpenAIChatGenerator(model="gpt-4o-mini")) + rag.add_component("answer_builder", AnswerBuilder()) rag.connect("embedder", "retriever.query_embedding") rag.connect("retriever", "prompt_builder.documents") rag.connect("prompt_builder", "generator") diff --git a/haystack/components/converters/multi_file_converter.py b/haystack/components/converters/multi_file_converter.py index 9a13fad8c..2fd57373e 100644 --- a/haystack/components/converters/multi_file_converter.py +++ b/haystack/components/converters/multi_file_converter.py @@ -80,24 +80,22 @@ class MultiFileConverter: # Create pipeline and add components pp = Pipeline() - # We use type ignore here to avoid type checking errors - # This is due to how the run method within the Component protocol is defined - pp.add_component("router", router) # type: ignore[arg-type] - pp.add_component("docx", DOCXToDocument(link_format="markdown")) # type: ignore[arg-type] + pp.add_component("router", router) + pp.add_component("docx", DOCXToDocument(link_format="markdown")) pp.add_component( "html", - HTMLToDocument( # type: ignore[arg-type] + HTMLToDocument( extraction_kwargs={"output_format": "markdown", "include_tables": True, "include_links": True} ), ) - pp.add_component("json", JSONConverter(content_key=self.json_content_key)) # type: ignore[arg-type] - pp.add_component("md", TextFileToDocument(encoding=self.encoding)) # type: ignore[arg-type] - pp.add_component("text", TextFileToDocument(encoding=self.encoding)) # type: ignore[arg-type] - pp.add_component("pdf", PyPDFToDocument()) # type: ignore[arg-type] - pp.add_component("pptx", PPTXToDocument()) # type: ignore[arg-type] - pp.add_component("xlsx", XLSXToDocument()) # type: ignore[arg-type] - pp.add_component("joiner", DocumentJoiner()) # type: ignore[arg-type] - pp.add_component("csv", CSVToDocument(encoding=self.encoding)) # type: ignore[arg-type] + 
pp.add_component("json", JSONConverter(content_key=self.json_content_key)) + pp.add_component("md", TextFileToDocument(encoding=self.encoding)) + pp.add_component("text", TextFileToDocument(encoding=self.encoding)) + pp.add_component("pdf", PyPDFToDocument()) + pp.add_component("pptx", PPTXToDocument()) + pp.add_component("xlsx", XLSXToDocument()) + pp.add_component("joiner", DocumentJoiner()) + pp.add_component("csv", CSVToDocument(encoding=self.encoding)) for mime_type in ConverterMimeType: pp.connect(f"router.{mime_type.value}", str(mime_type).lower().rsplit(".", maxsplit=1)[-1]) diff --git a/haystack/components/preprocessors/document_preprocessor.py b/haystack/components/preprocessors/document_preprocessor.py index 5b6fbf243..e30d8db11 100644 --- a/haystack/components/preprocessors/document_preprocessor.py +++ b/haystack/components/preprocessors/document_preprocessor.py @@ -127,10 +127,8 @@ class DocumentPreprocessor: # Build the Pipeline pp = Pipeline() - # We use type ignore here to avoid type checking errors - # This is due to how the run method within the Component protocol is defined - pp.add_component("splitter", splitter) # type: ignore[arg-type] - pp.add_component("cleaner", cleaner) # type: ignore[arg-type] + pp.add_component("splitter", splitter) + pp.add_component("cleaner", cleaner) # Connect the splitter output to cleaner pp.connect("splitter.documents", "cleaner.documents") diff --git a/haystack/core/component/component.py b/haystack/core/component/component.py index a2c31a072..707258978 100644 --- a/haystack/core/component/component.py +++ b/haystack/core/component/component.py @@ -160,12 +160,29 @@ class Component(Protocol): isinstance(MyComponent, Component) """ - # This is the most reliable way to define the protocol for the `run` method. - # Defining a method doesn't work as different Components will have different - # arguments. Even defining here a method with `**kwargs` doesn't work as the - # expected signature must be identical. 
- # This makes most Language Servers and type checkers happy and shows less errors. - run: Callable[..., Dict[str, Any]] + # The following method stub declares a `run` method compatible with any input signature. + # Its type is equivalent to Callable[..., Dict[str, Any]]. + # See https://typing.python.org/en/latest/spec/callables.html#meaning-of-in-callable. + # + # Using `run: Callable[..., Dict[str, Any]]` directly leads to type errors: the protocol would expect a settable + # attribute `run`, while the actual implementation is a read-only method. + # For example: + # from haystack import Pipeline, component + # @component + # class MyComponent: + # @component.output_types(out=str) + # def run(self): + # return {"out": "Hello, world!"} + # pipeline = Pipeline() + # pipeline.add_component("my_component", MyComponent()) + # + # mypy raises: + # error: Argument 2 to "add_component" of "PipelineBase" has incompatible type "MyComponent"; expected "Component" + # [arg-type] + # note: Protocol member Component.run expected settable variable, got read-only attribute + + def run(self, *args: Any, **kwargs: Any) -> Dict[str, Any]: # pylint: disable=missing-function-docstring # noqa: D102 + ... class ComponentMeta(type): diff --git a/haystack/py.typed b/haystack/py.typed new file mode 100644 index 000000000..e69de29bb diff --git a/releasenotes/notes/py-typed-724eea7222640e6d.yaml b/releasenotes/notes/py-typed-724eea7222640e6d.yaml new file mode 100644 index 000000000..0af15c5a2 --- /dev/null +++ b/releasenotes/notes/py-typed-724eea7222640e6d.yaml @@ -0,0 +1,9 @@ +--- +upgrade: + - | + We've added a `py.typed` file to Haystack to enable type information to be used by downstream projects, in line + with PEP 561. This means Haystack's type hints will now be visible to type checkers in projects that depend on it. + Haystack is primarily type checked using mypy (not pyright) and, despite our efforts, some type information can + be incomplete or unreliable. 
+ If you use static type checking in your own project, you may notice some changes: previously, Haystack's types were + effectively treated as `Any`, but now actual type information is available and will be checked by your type checker.