feat: add py.typed; adjust Component protocol (#9329)

* experimenting with py.typed

* try changing run method in protocol

* Trigger Build

* better docstring + release note

* remove type:ignore where possible

* Removed a few more type: ignores

---------

Co-authored-by: Sebastian Husch Lee <sjrl423@gmail.com>
This commit is contained in:
Stefano Fiorucci 2025-05-07 09:34:31 +02:00 committed by GitHub
parent 4ce6934dd9
commit de5c7ea3d2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 53 additions and 31 deletions

View File

@ -36,14 +36,14 @@ def indexing_pipeline(documents: List[Document]):
doc_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP)
doc_embedder = SentenceTransformersDocumentEmbedder(model=EMBEDDINGS_MODEL, progress_bar=False)
ingestion_pipe = Pipeline()
ingestion_pipe.add_component(instance=doc_embedder, name="doc_embedder") # type: ignore
ingestion_pipe.add_component(instance=doc_writer, name="doc_writer") # type: ignore
ingestion_pipe.add_component(instance=doc_embedder, name="doc_embedder")
ingestion_pipe.add_component(instance=doc_writer, name="doc_writer")
ingestion_pipe.connect("doc_embedder.documents", "doc_writer.documents")
ingestion_pipe.run({"doc_embedder": {"documents": documents}})
return document_store
def rag_pipeline(document_store: InMemoryDocumentStore, top_k: int): # type: ignore
def rag_pipeline(document_store: InMemoryDocumentStore, top_k: int):
"""RAG pipeline"""
template = [
ChatMessage.from_system(
@ -59,11 +59,11 @@ def rag_pipeline(document_store: InMemoryDocumentStore, top_k: int): # type: ig
),
]
rag = Pipeline()
rag.add_component("embedder", SentenceTransformersTextEmbedder(model=EMBEDDINGS_MODEL, progress_bar=False)) # type: ignore
rag.add_component("retriever", InMemoryEmbeddingRetriever(document_store, top_k=top_k)) # type: ignore
rag.add_component("prompt_builder", ChatPromptBuilder(template=template)) # type: ignore
rag.add_component("generator", OpenAIChatGenerator(model="gpt-4o-mini")) # type: ignore
rag.add_component("answer_builder", AnswerBuilder()) # type: ignore
rag.add_component("embedder", SentenceTransformersTextEmbedder(model=EMBEDDINGS_MODEL, progress_bar=False))
rag.add_component("retriever", InMemoryEmbeddingRetriever(document_store, top_k=top_k))
rag.add_component("prompt_builder", ChatPromptBuilder(template=template))
rag.add_component("generator", OpenAIChatGenerator(model="gpt-4o-mini"))
rag.add_component("answer_builder", AnswerBuilder())
rag.connect("embedder", "retriever.query_embedding")
rag.connect("retriever", "prompt_builder.documents")
rag.connect("prompt_builder", "generator")

View File

@ -80,24 +80,22 @@ class MultiFileConverter:
# Create pipeline and add components
pp = Pipeline()
# We use type ignore here to avoid type checking errors
# This is due to how the run method within the Component protocol is defined
pp.add_component("router", router) # type: ignore[arg-type]
pp.add_component("docx", DOCXToDocument(link_format="markdown")) # type: ignore[arg-type]
pp.add_component("router", router)
pp.add_component("docx", DOCXToDocument(link_format="markdown"))
pp.add_component(
"html",
HTMLToDocument( # type: ignore[arg-type]
HTMLToDocument(
extraction_kwargs={"output_format": "markdown", "include_tables": True, "include_links": True}
),
)
pp.add_component("json", JSONConverter(content_key=self.json_content_key)) # type: ignore[arg-type]
pp.add_component("md", TextFileToDocument(encoding=self.encoding)) # type: ignore[arg-type]
pp.add_component("text", TextFileToDocument(encoding=self.encoding)) # type: ignore[arg-type]
pp.add_component("pdf", PyPDFToDocument()) # type: ignore[arg-type]
pp.add_component("pptx", PPTXToDocument()) # type: ignore[arg-type]
pp.add_component("xlsx", XLSXToDocument()) # type: ignore[arg-type]
pp.add_component("joiner", DocumentJoiner()) # type: ignore[arg-type]
pp.add_component("csv", CSVToDocument(encoding=self.encoding)) # type: ignore[arg-type]
pp.add_component("json", JSONConverter(content_key=self.json_content_key))
pp.add_component("md", TextFileToDocument(encoding=self.encoding))
pp.add_component("text", TextFileToDocument(encoding=self.encoding))
pp.add_component("pdf", PyPDFToDocument())
pp.add_component("pptx", PPTXToDocument())
pp.add_component("xlsx", XLSXToDocument())
pp.add_component("joiner", DocumentJoiner())
pp.add_component("csv", CSVToDocument(encoding=self.encoding))
for mime_type in ConverterMimeType:
pp.connect(f"router.{mime_type.value}", str(mime_type).lower().rsplit(".", maxsplit=1)[-1])

View File

@ -127,10 +127,8 @@ class DocumentPreprocessor:
# Build the Pipeline
pp = Pipeline()
# We use type ignore here to avoid type checking errors
# This is due to how the run method within the Component protocol is defined
pp.add_component("splitter", splitter) # type: ignore[arg-type]
pp.add_component("cleaner", cleaner) # type: ignore[arg-type]
pp.add_component("splitter", splitter)
pp.add_component("cleaner", cleaner)
# Connect the splitter output to cleaner
pp.connect("splitter.documents", "cleaner.documents")

View File

@ -160,12 +160,29 @@ class Component(Protocol):
isinstance(MyComponent, Component)
"""
# This is the most reliable way to define the protocol for the `run` method.
# Defining a method doesn't work as different Components will have different
# arguments. Even defining here a method with `**kwargs` doesn't work as the
# expected signature must be identical.
# This keeps most language servers and type checkers happy and produces fewer errors.
run: Callable[..., Dict[str, Any]]
# The following method definition declares a `run` method that is compatible with any input signature.
# Its type is equivalent to `Callable[..., Dict[str, Any]]`.
# See https://typing.python.org/en/latest/spec/callables.html#meaning-of-in-callable.
#
# Using `run: Callable[..., Dict[str, Any]]` directly leads to type errors: the protocol would expect a settable
# attribute `run`, while the actual implementation is a read-only method.
# For example:
# from haystack import Pipeline, component
# @component
# class MyComponent:
# @component.output_types(out=str)
# def run(self):
# return {"out": "Hello, world!"}
# pipeline = Pipeline()
# pipeline.add_component("my_component", MyComponent())
#
# mypy raises:
# error: Argument 2 to "add_component" of "PipelineBase" has incompatible type "MyComponent"; expected "Component"
# [arg-type]
# note: Protocol member Component.run expected settable variable, got read-only attribute
# Protocol stub: the `...` body is a placeholder; the catch-all *args/**kwargs signature lets any concrete `run` signature match.
def run(self, *args: Any, **kwargs: Any) -> Dict[str, Any]: # pylint: disable=missing-function-docstring # noqa: D102
...
class ComponentMeta(type):

0
haystack/py.typed Normal file
View File

View File

@ -0,0 +1,9 @@
---
upgrade:
- |
We've added a `py.typed` file to Haystack to enable type information to be used by downstream projects, in line
with PEP 561. This means Haystack's type hints will now be visible to type checkers in projects that depend on it.
Haystack is primarily type-checked with mypy (not pyright) and, despite our efforts, some type information may
be incomplete or unreliable.
If you use static type checking in your own project, you may notice some changes: previously, Haystack's types were
effectively treated as `Any`, but now actual type information will be available and enforced.