mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-07 12:37:27 +00:00
feat: add py.typed; adjust Component protocol (#9329)
* experimenting with py.typed
* try changing run method in protocol
* Trigger Build
* better docstring + release note
* remove type:ignore where possible
* Removed a few more type: ignores

---------

Co-authored-by: Sebastian Husch Lee <sjrl423@gmail.com>
This commit is contained in:
parent
4ce6934dd9
commit
de5c7ea3d2
@ -36,14 +36,14 @@ def indexing_pipeline(documents: List[Document]):
|
||||
doc_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP)
|
||||
doc_embedder = SentenceTransformersDocumentEmbedder(model=EMBEDDINGS_MODEL, progress_bar=False)
|
||||
ingestion_pipe = Pipeline()
|
||||
ingestion_pipe.add_component(instance=doc_embedder, name="doc_embedder") # type: ignore
|
||||
ingestion_pipe.add_component(instance=doc_writer, name="doc_writer") # type: ignore
|
||||
ingestion_pipe.add_component(instance=doc_embedder, name="doc_embedder")
|
||||
ingestion_pipe.add_component(instance=doc_writer, name="doc_writer")
|
||||
ingestion_pipe.connect("doc_embedder.documents", "doc_writer.documents")
|
||||
ingestion_pipe.run({"doc_embedder": {"documents": documents}})
|
||||
return document_store
|
||||
|
||||
|
||||
def rag_pipeline(document_store: InMemoryDocumentStore, top_k: int): # type: ignore
|
||||
def rag_pipeline(document_store: InMemoryDocumentStore, top_k: int):
|
||||
"""RAG pipeline"""
|
||||
template = [
|
||||
ChatMessage.from_system(
|
||||
@ -59,11 +59,11 @@ def rag_pipeline(document_store: InMemoryDocumentStore, top_k: int): # type: ig
|
||||
),
|
||||
]
|
||||
rag = Pipeline()
|
||||
rag.add_component("embedder", SentenceTransformersTextEmbedder(model=EMBEDDINGS_MODEL, progress_bar=False)) # type: ignore
|
||||
rag.add_component("retriever", InMemoryEmbeddingRetriever(document_store, top_k=top_k)) # type: ignore
|
||||
rag.add_component("prompt_builder", ChatPromptBuilder(template=template)) # type: ignore
|
||||
rag.add_component("generator", OpenAIChatGenerator(model="gpt-4o-mini")) # type: ignore
|
||||
rag.add_component("answer_builder", AnswerBuilder()) # type: ignore
|
||||
rag.add_component("embedder", SentenceTransformersTextEmbedder(model=EMBEDDINGS_MODEL, progress_bar=False))
|
||||
rag.add_component("retriever", InMemoryEmbeddingRetriever(document_store, top_k=top_k))
|
||||
rag.add_component("prompt_builder", ChatPromptBuilder(template=template))
|
||||
rag.add_component("generator", OpenAIChatGenerator(model="gpt-4o-mini"))
|
||||
rag.add_component("answer_builder", AnswerBuilder())
|
||||
rag.connect("embedder", "retriever.query_embedding")
|
||||
rag.connect("retriever", "prompt_builder.documents")
|
||||
rag.connect("prompt_builder", "generator")
|
||||
|
||||
@ -80,24 +80,22 @@ class MultiFileConverter:
|
||||
# Create pipeline and add components
|
||||
pp = Pipeline()
|
||||
|
||||
# We use type ignore here to avoid type checking errors
|
||||
# This is due to how the run method within the Component protocol is defined
|
||||
pp.add_component("router", router) # type: ignore[arg-type]
|
||||
pp.add_component("docx", DOCXToDocument(link_format="markdown")) # type: ignore[arg-type]
|
||||
pp.add_component("router", router)
|
||||
pp.add_component("docx", DOCXToDocument(link_format="markdown"))
|
||||
pp.add_component(
|
||||
"html",
|
||||
HTMLToDocument( # type: ignore[arg-type]
|
||||
HTMLToDocument(
|
||||
extraction_kwargs={"output_format": "markdown", "include_tables": True, "include_links": True}
|
||||
),
|
||||
)
|
||||
pp.add_component("json", JSONConverter(content_key=self.json_content_key)) # type: ignore[arg-type]
|
||||
pp.add_component("md", TextFileToDocument(encoding=self.encoding)) # type: ignore[arg-type]
|
||||
pp.add_component("text", TextFileToDocument(encoding=self.encoding)) # type: ignore[arg-type]
|
||||
pp.add_component("pdf", PyPDFToDocument()) # type: ignore[arg-type]
|
||||
pp.add_component("pptx", PPTXToDocument()) # type: ignore[arg-type]
|
||||
pp.add_component("xlsx", XLSXToDocument()) # type: ignore[arg-type]
|
||||
pp.add_component("joiner", DocumentJoiner()) # type: ignore[arg-type]
|
||||
pp.add_component("csv", CSVToDocument(encoding=self.encoding)) # type: ignore[arg-type]
|
||||
pp.add_component("json", JSONConverter(content_key=self.json_content_key))
|
||||
pp.add_component("md", TextFileToDocument(encoding=self.encoding))
|
||||
pp.add_component("text", TextFileToDocument(encoding=self.encoding))
|
||||
pp.add_component("pdf", PyPDFToDocument())
|
||||
pp.add_component("pptx", PPTXToDocument())
|
||||
pp.add_component("xlsx", XLSXToDocument())
|
||||
pp.add_component("joiner", DocumentJoiner())
|
||||
pp.add_component("csv", CSVToDocument(encoding=self.encoding))
|
||||
|
||||
for mime_type in ConverterMimeType:
|
||||
pp.connect(f"router.{mime_type.value}", str(mime_type).lower().rsplit(".", maxsplit=1)[-1])
|
||||
|
||||
@ -127,10 +127,8 @@ class DocumentPreprocessor:
|
||||
# Build the Pipeline
|
||||
pp = Pipeline()
|
||||
|
||||
# We use type ignore here to avoid type checking errors
|
||||
# This is due to how the run method within the Component protocol is defined
|
||||
pp.add_component("splitter", splitter) # type: ignore[arg-type]
|
||||
pp.add_component("cleaner", cleaner) # type: ignore[arg-type]
|
||||
pp.add_component("splitter", splitter)
|
||||
pp.add_component("cleaner", cleaner)
|
||||
|
||||
# Connect the splitter output to cleaner
|
||||
pp.connect("splitter.documents", "cleaner.documents")
|
||||
|
||||
@ -160,12 +160,29 @@ class Component(Protocol):
|
||||
isinstance(MyComponent, Component)
|
||||
"""
|
||||
|
||||
# This is the most reliable way to define the protocol for the `run` method.
|
||||
# Defining a method doesn't work as different Components will have different
|
||||
# arguments. Even defining here a method with `**kwargs` doesn't work as the
|
||||
# expected signature must be identical.
|
||||
# This makes most Language Servers and type checkers happy and shows less errors.
|
||||
run: Callable[..., Dict[str, Any]]
|
||||
# The following method definition declares `run` in a way that is compatible with any input signature.
|
||||
# Its type is equivalent to Callable[..., Dict[str, Any]].
|
||||
# See https://typing.python.org/en/latest/spec/callables.html#meaning-of-in-callable.
|
||||
#
|
||||
# Using `run: Callable[..., Dict[str, Any]]` directly leads to type errors: the protocol would expect a settable
|
||||
# attribute `run`, while the actual implementation is a read-only method.
|
||||
# For example:
|
||||
# from haystack import Pipeline, component
|
||||
# @component
|
||||
# class MyComponent:
|
||||
# @component.output_types(out=str)
|
||||
# def run(self):
|
||||
# return {"out": "Hello, world!"}
|
||||
# pipeline = Pipeline()
|
||||
# pipeline.add_component("my_component", MyComponent())
|
||||
#
|
||||
# mypy raises:
|
||||
# error: Argument 2 to "add_component" of "PipelineBase" has incompatible type "MyComponent"; expected "Component"
|
||||
# [arg-type]
|
||||
# note: Protocol member Component.run expected settable variable, got read-only attribute
|
||||
|
||||
def run(self, *args: Any, **kwargs: Any) -> Dict[str, Any]: # pylint: disable=missing-function-docstring # noqa: D102
|
||||
...
|
||||
|
||||
|
||||
class ComponentMeta(type):
|
||||
|
||||
0
haystack/py.typed
Normal file
0
haystack/py.typed
Normal file
9
releasenotes/notes/py-typed-724eea7222640e6d.yaml
Normal file
9
releasenotes/notes/py-typed-724eea7222640e6d.yaml
Normal file
@ -0,0 +1,9 @@
|
||||
---
|
||||
upgrade:
|
||||
- |
|
||||
We've added a `py.typed` file to Haystack to enable type information to be used by downstream projects, in line
|
||||
with PEP 561. This means Haystack's type hints will now be visible to type checkers in projects that depend on it.
|
||||
Haystack is primarily type checked using mypy (not pyright) and, despite our efforts, some type information can
|
||||
be incomplete or unreliable.
|
||||
If you use static type checking in your own project, you may notice some changes: previously, Haystack's types were
|
||||
effectively treated as `Any`, but now actual type information will be available and enforced.
|
||||
Loading…
x
Reference in New Issue
Block a user