From 93b2aaee09e976cd556747a6da05d8c4a5364643 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Mon, 8 Jan 2024 22:06:27 +0100 Subject: [PATCH] chore: move `DocumentJoiner` to new `joiners` package (#6692) * move DocumentJoiner to new joiners package * relnote * leftovers * fix docstrings generation * fix unrelated pydoc misconfiguration * more unrelated work, yay! * fix assertions --- docs/pydoc/config/caching.yml | 2 +- docs/pydoc/config/joiner.yml | 26 +++++++++++++++++++ docs/pydoc/config/router.yml | 2 +- e2e/pipelines/test_dense_doc_search.py | 3 ++- e2e/pipelines/test_eval_dense_doc_search.py | 3 ++- .../test_eval_hybrid_doc_search_pipeline.py | 2 +- .../test_hybrid_doc_search_pipeline.py | 2 +- examples/pipelines/indexing_pipeline.py | 3 ++- haystack/components/joiners/__init__.py | 3 +++ .../{routers => joiners}/document_joiner.py | 0 haystack/components/routers/__init__.py | 3 +-- haystack/pipeline_utils/indexing.py | 3 ++- ...entjoiner-to-joiners-7fe188d18d65ffcd.yaml | 10 +++++++ .../test_openai_document_embedder.py | 6 ++--- .../embedders/test_openai_text_embedder.py | 9 ++++--- test/components/joiners/__init__.py | 0 .../test_document_joiner.py | 2 +- 17 files changed, 62 insertions(+), 17 deletions(-) create mode 100644 docs/pydoc/config/joiner.yml create mode 100644 haystack/components/joiners/__init__.py rename haystack/components/{routers => joiners}/document_joiner.py (100%) create mode 100644 releasenotes/notes/move-documentjoiner-to-joiners-7fe188d18d65ffcd.yaml create mode 100644 test/components/joiners/__init__.py rename test/components/{routers => joiners}/test_document_joiner.py (98%) diff --git a/docs/pydoc/config/caching.yml b/docs/pydoc/config/caching.yml index 88c7d6286..92898fb2c 100644 --- a/docs/pydoc/config/caching.yml +++ b/docs/pydoc/config/caching.yml @@ -1,7 +1,7 @@ loaders: - type: loaders.CustomPythonLoader search_path: [../../../haystack/components/caching] - modules: ["url_cache_checker"] + modules: ["cache_checker"] 
ignore_when_discovered: ["__init__"] processors: - type: filter diff --git a/docs/pydoc/config/joiner.yml b/docs/pydoc/config/joiner.yml new file mode 100644 index 000000000..34a9141a6 --- /dev/null +++ b/docs/pydoc/config/joiner.yml @@ -0,0 +1,26 @@ +loaders: + - type: loaders.CustomPythonLoader + search_path: [../../../haystack/components/joiners] + modules: ["document_joiner"] + ignore_when_discovered: ["__init__"] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: renderers.ReadmePreviewRenderer + excerpt: Components that join lists of documents into a single list. + category_slug: haystack-classes + title: Joiner API + slug: joiner-api + order: 140 + markdown: + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: true + add_member_class_prefix: false + filename: joiner_api.md diff --git a/docs/pydoc/config/router.yml b/docs/pydoc/config/router.yml index e1cc0ab18..bac0d1cae 100644 --- a/docs/pydoc/config/router.yml +++ b/docs/pydoc/config/router.yml @@ -1,7 +1,7 @@ loaders: - type: loaders.CustomPythonLoader search_path: [../../../haystack/components/routers] - modules: ["document_joiner", "conditional_router", "file_type_router", "metadata_router", "text_language_router"] + modules: ["conditional_router", "file_type_router", "metadata_router", "text_language_router"] ignore_when_discovered: ["__init__"] processors: - type: filter diff --git a/e2e/pipelines/test_dense_doc_search.py b/e2e/pipelines/test_dense_doc_search.py index 02f77cd04..80aec5d27 100644 --- a/e2e/pipelines/test_dense_doc_search.py +++ b/e2e/pipelines/test_dense_doc_search.py @@ -4,7 +4,8 @@ from haystack import Pipeline from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder from haystack.components.converters import PyPDFToDocument, TextFileToDocument from 
haystack.components.preprocessors import DocumentCleaner, DocumentSplitter -from haystack.components.routers import FileTypeRouter, DocumentJoiner +from haystack.components.routers import FileTypeRouter +from haystack.components.joiners import DocumentJoiner from haystack.components.writers import DocumentWriter from haystack.document_stores import InMemoryDocumentStore from haystack.components.retrievers import InMemoryEmbeddingRetriever diff --git a/e2e/pipelines/test_eval_dense_doc_search.py b/e2e/pipelines/test_eval_dense_doc_search.py index b17f052af..c4320c103 100644 --- a/e2e/pipelines/test_eval_dense_doc_search.py +++ b/e2e/pipelines/test_eval_dense_doc_search.py @@ -3,7 +3,8 @@ from haystack.components.converters import PyPDFToDocument, TextFileToDocument from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter from haystack.components.retrievers import InMemoryEmbeddingRetriever -from haystack.components.routers import DocumentJoiner, FileTypeRouter +from haystack.components.routers import FileTypeRouter +from haystack.components.joiners import DocumentJoiner from haystack.components.writers import DocumentWriter from haystack.dataclasses import Document from haystack.document_stores import InMemoryDocumentStore diff --git a/e2e/pipelines/test_eval_hybrid_doc_search_pipeline.py b/e2e/pipelines/test_eval_hybrid_doc_search_pipeline.py index f7b4455e6..ed1f0b1e9 100644 --- a/e2e/pipelines/test_eval_hybrid_doc_search_pipeline.py +++ b/e2e/pipelines/test_eval_hybrid_doc_search_pipeline.py @@ -2,7 +2,7 @@ from haystack import Document, Pipeline from haystack.components.embedders import SentenceTransformersTextEmbedder from haystack.components.rankers import TransformersSimilarityRanker from haystack.components.retrievers import InMemoryBM25Retriever, InMemoryEmbeddingRetriever -from haystack.components.routers.document_joiner 
import DocumentJoiner +from haystack.components.joiners.document_joiner import DocumentJoiner from haystack.document_stores import InMemoryDocumentStore from haystack.evaluation.eval import eval diff --git a/e2e/pipelines/test_hybrid_doc_search_pipeline.py b/e2e/pipelines/test_hybrid_doc_search_pipeline.py index a9ead31b1..fc6f6070e 100644 --- a/e2e/pipelines/test_hybrid_doc_search_pipeline.py +++ b/e2e/pipelines/test_hybrid_doc_search_pipeline.py @@ -3,7 +3,7 @@ import json from haystack import Pipeline, Document from haystack.components.embedders import SentenceTransformersTextEmbedder from haystack.components.rankers import TransformersSimilarityRanker -from haystack.components.routers.document_joiner import DocumentJoiner +from haystack.components.joiners.document_joiner import DocumentJoiner from haystack.document_stores import InMemoryDocumentStore from haystack.components.retrievers import InMemoryBM25Retriever, InMemoryEmbeddingRetriever diff --git a/examples/pipelines/indexing_pipeline.py b/examples/pipelines/indexing_pipeline.py index 8c9217eb7..ba61d0270 100644 --- a/examples/pipelines/indexing_pipeline.py +++ b/examples/pipelines/indexing_pipeline.py @@ -4,7 +4,8 @@ from haystack import Pipeline from haystack.components.embedders import SentenceTransformersDocumentEmbedder from haystack.components.converters import PyPDFToDocument, TextFileToDocument from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter -from haystack.components.routers import FileTypeRouter, DocumentJoiner +from haystack.components.routers import FileTypeRouter +from haystack.components.joiners import DocumentJoiner from haystack.components.writers import DocumentWriter from haystack.document_stores import InMemoryDocumentStore diff --git a/haystack/components/joiners/__init__.py b/haystack/components/joiners/__init__.py new file mode 100644 index 000000000..2ce551936 --- /dev/null +++ b/haystack/components/joiners/__init__.py @@ -0,0 +1,3 @@ +from 
haystack.components.joiners.document_joiner import DocumentJoiner + +__all__ = ["DocumentJoiner"] diff --git a/haystack/components/routers/document_joiner.py b/haystack/components/joiners/document_joiner.py similarity index 100% rename from haystack/components/routers/document_joiner.py rename to haystack/components/joiners/document_joiner.py diff --git a/haystack/components/routers/__init__.py b/haystack/components/routers/__init__.py index 65f8b9cb8..3eaeff616 100644 --- a/haystack/components/routers/__init__.py +++ b/haystack/components/routers/__init__.py @@ -1,7 +1,6 @@ -from haystack.components.routers.document_joiner import DocumentJoiner from haystack.components.routers.file_type_router import FileTypeRouter from haystack.components.routers.metadata_router import MetadataRouter from haystack.components.routers.conditional_router import ConditionalRouter from haystack.components.routers.text_language_router import TextLanguageRouter -__all__ = ["DocumentJoiner", "FileTypeRouter", "MetadataRouter", "TextLanguageRouter", "ConditionalRouter"] +__all__ = ["FileTypeRouter", "MetadataRouter", "TextLanguageRouter", "ConditionalRouter"] diff --git a/haystack/pipeline_utils/indexing.py b/haystack/pipeline_utils/indexing.py index 0dd188ebc..f42210502 100644 --- a/haystack/pipeline_utils/indexing.py +++ b/haystack/pipeline_utils/indexing.py @@ -11,7 +11,8 @@ from haystack.components.converters import TextFileToDocument from haystack.components.embedders import SentenceTransformersDocumentEmbedder, OpenAIDocumentEmbedder from haystack.components.fetchers import LinkContentFetcher from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter -from haystack.components.routers import FileTypeRouter, DocumentJoiner +from haystack.components.routers import FileTypeRouter +from haystack.components.joiners import DocumentJoiner from haystack.components.writers import DocumentWriter from haystack.document_stores.protocol import DocumentStore diff --git 
a/releasenotes/notes/move-documentjoiner-to-joiners-7fe188d18d65ffcd.yaml b/releasenotes/notes/move-documentjoiner-to-joiners-7fe188d18d65ffcd.yaml new file mode 100644 index 000000000..59948c915 --- /dev/null +++ b/releasenotes/notes/move-documentjoiner-to-joiners-7fe188d18d65ffcd.yaml @@ -0,0 +1,10 @@ +--- +upgrade: + - | + Change any occurrence of: + from haystack.components.routers.document_joiner import DocumentJoiner + to: + from haystack.components.joiners.document_joiner import DocumentJoiner +enhancements: + - | + Create a new package called `joiners` and move `DocumentJoiner` there for clarity. diff --git a/test/components/embedders/test_openai_document_embedder.py b/test/components/embedders/test_openai_document_embedder.py index 4cf7b6104..390c559d3 100644 --- a/test/components/embedders/test_openai_document_embedder.py +++ b/test/components/embedders/test_openai_document_embedder.py @@ -172,7 +172,7 @@ class TestOpenAIDocumentEmbedder: Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}), ] - model = "text-similarity-ada-001" + model = "text-embedding-ada-002" embedder = OpenAIDocumentEmbedder(model_name=model, meta_fields_to_embed=["topic"], embedding_separator=" | ") @@ -185,6 +185,6 @@ class TestOpenAIDocumentEmbedder: for doc in documents_with_embeddings: assert isinstance(doc, Document) assert isinstance(doc.embedding, list) - assert len(doc.embedding) == 1024 + assert len(doc.embedding) == 1536 assert all(isinstance(x, float) for x in doc.embedding) - assert metadata == {"model": "text-similarity-ada:001", "usage": {"prompt_tokens": 15, "total_tokens": 15}} + assert metadata == {"model": "text-embedding-ada-002-v2", "usage": {"prompt_tokens": 15, "total_tokens": 15}} diff --git a/test/components/embedders/test_openai_text_embedder.py b/test/components/embedders/test_openai_text_embedder.py index 8b16c798f..c9757d88a 100644 --- a/test/components/embedders/test_openai_text_embedder.py +++ 
b/test/components/embedders/test_openai_text_embedder.py @@ -79,11 +79,14 @@ class TestOpenAITextEmbedder: @pytest.mark.skipif(os.environ.get("OPENAI_API_KEY", "") == "", reason="OPENAI_API_KEY is not set") @pytest.mark.integration def test_run(self): - model = "text-similarity-ada-001" + model = "text-embedding-ada-002" embedder = OpenAITextEmbedder(model_name=model, prefix="prefix ", suffix=" suffix") result = embedder.run(text="The food was delicious") - assert len(result["embedding"]) == 1024 + assert len(result["embedding"]) == 1536 assert all(isinstance(x, float) for x in result["embedding"]) - assert result["meta"] == {"model": "text-similarity-ada:001", "usage": {"prompt_tokens": 6, "total_tokens": 6}} + assert result["meta"] == { + "model": "text-embedding-ada-002-v2", + "usage": {"prompt_tokens": 6, "total_tokens": 6}, + } diff --git a/test/components/joiners/__init__.py b/test/components/joiners/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/components/routers/test_document_joiner.py b/test/components/joiners/test_document_joiner.py similarity index 98% rename from test/components/routers/test_document_joiner.py rename to test/components/joiners/test_document_joiner.py index 3362c1c15..af80ccc2f 100644 --- a/test/components/routers/test_document_joiner.py +++ b/test/components/joiners/test_document_joiner.py @@ -3,7 +3,7 @@ import logging import pytest from haystack import Document -from haystack.components.routers.document_joiner import DocumentJoiner +from haystack.components.joiners.document_joiner import DocumentJoiner class TestDocumentJoiner: