diff --git a/.github/workflows/deploy_website.yml b/.github/workflows/deploy_website.yml
index 1a40435c6..b7593a3ba 100644
--- a/.github/workflows/deploy_website.yml
+++ b/.github/workflows/deploy_website.yml
@@ -13,28 +13,7 @@ jobs:
     runs-on: ubuntu-latest

     steps:
-      - uses: actions/checkout@v2
-
-      - name: Set up Python 3.7
-        uses: actions/setup-python@v2
-        with:
-          python-version: 3.7
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install 'pydoc-markdown>=3.0.0,<4.0.0'
-          pip install mkdocs
-          pip install jupytercontrib
-
-      # Generates the docstrings and tutorials so that we have the latest for the deployment
-      - name: Generate Docstrings and Tutorials
-        run: |
-          cd docs/_src/api/api/
-          ./generate_docstrings.sh
-          cd ../../tutorials/tutorials/
-          python3 convert_ipynb.py
-
+      # Creates a dispatch event for the haystack-website repo
       - name: Repository Dispatch
         uses: peter-evans/repository-dispatch@v1
diff --git a/.github/workflows/deploy_website_staging.yml b/.github/workflows/deploy_website_staging.yml
new file mode 100644
index 000000000..8b939669b
--- /dev/null
+++ b/.github/workflows/deploy_website_staging.yml
@@ -0,0 +1,26 @@
+name: Deploy website
+
+# Controls when the action will run. Triggers the workflow on push
+# events for any branch except master and benchmarks
+on:
+  push:
+    branches-ignore:
+      - master
+      - benchmarks
+
+jobs:
+  # This workflow contains a single job called "build"
+  build:
+    # The type of runner that the job will run on
+    runs-on: ubuntu-latest
+
+    steps:
+
+      # Creates a dispatch event for the haystack-website repo
+      - name: Repository Dispatch
+        uses: peter-evans/repository-dispatch@v1
+        with:
+          token: ${{ secrets.PUBLIC_REPO_ACCESS_TOKEN }}
+          repository: deepset-ai/haystack-website
+          event-type: deploy-website-staging
+          client-payload: '{"ref": "${{ github.ref }}"}'
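The dispatch step above boils down to a single GitHub REST API call (`POST /repos/{owner}/{repo}/dispatches`). A minimal Python sketch of what the action sends, assuming a `requests` dependency and a personal access token in the environment; this illustrates the underlying API, it is not the action's actual implementation:

```python
import os

import requests

# Fire the same repository_dispatch event that peter-evans/repository-dispatch@v1
# sends, so haystack-website can react to it. Token and ref come from the
# environment here; repo and event type mirror the workflow above.
resp = requests.post(
    "https://api.github.com/repos/deepset-ai/haystack-website/dispatches",
    headers={
        "Accept": "application/vnd.github.v3+json",
        "Authorization": f"token {os.environ['PUBLIC_REPO_ACCESS_TOKEN']}",
    },
    json={
        "event_type": "deploy-website-staging",
        "client_payload": {"ref": os.environ.get("GITHUB_REF", "refs/heads/master")},
    },
)
resp.raise_for_status()  # GitHub answers 204 No Content on success
```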
diff --git a/.github/workflows/update_docs.yml b/.github/workflows/update_docs.yml
new file mode 100644
index 000000000..2fc455efe
--- /dev/null
+++ b/.github/workflows/update_docs.yml
@@ -0,0 +1,55 @@
+name: Update Docstrings and Tutorials
+
+# Controls when the action will run. Triggers the workflow on push
+# events for any branch except master and benchmarks
+on:
+  push:
+    branches-ignore:
+      - master
+      - benchmarks
+
+jobs:
+  # This workflow contains a single job called "build"
+  build:
+    # The type of runner that the job will run on
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          persist-credentials: false  # otherwise, the token used is the GITHUB_TOKEN, instead of your personal token
+          fetch-depth: 0  # otherwise, you will fail to push refs to the dest repo
+
+      - name: Set up Python 3.7
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.7
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install 'pydoc-markdown>=3.0.0,<4.0.0'
+          pip install mkdocs
+          pip install jupytercontrib
+
+      # Generates the docstrings and tutorials so that we have the latest for the deployment
+      - name: Generate Docstrings and Tutorials
+        run: |
+          cd docs/_src/api/api/
+          ./generate_docstrings.sh
+          cd ../../tutorials/tutorials/
+          python3 convert_ipynb.py
+          cd ../../../../
+          git status
+
+      - name: Commit files
+        run: |
+          git config --local user.email "41898282+github-actions[bot]@users.noreply.github.com"
+          git config --local user.name "github-actions[bot]"
+          git add .
+          git commit -m "Add latest docstring and tutorial changes" -a || echo "No changes to commit"
+
+      - name: Push changes
+        uses: ad-m/github-push-action@master
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          branch: ${{ github.ref }}
diff --git a/docs/_src/api/api/file_converter.md b/docs/_src/api/api/file_converter.md
index b8387da33..43c9c7e79 100644
--- a/docs/_src/api/api/file_converter.md
+++ b/docs/_src/api/api/file_converter.md
@@ -1,3 +1,62 @@
+
+# Module base
+
+
+## BaseConverter Objects
+
+```python
+class BaseConverter()
+```
+
+Base class for implementing file converters to transform input documents to text format for ingestion in DocumentStore.
+
+
+#### \_\_init\_\_
+
+```python
+ | __init__(remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None)
+```
+
+**Arguments**:
+
+- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables.
+The tabular structures in documents might be noise for the reader model if it
+does not have table parsing capability for finding answers. However, tables
+may also have long strings that could be possible candidates for answers.
+Rows containing strings are therefore retained when this option is enabled.
+- `valid_languages`: validate languages from a list of languages specified in ISO 639-1
+(https://en.wikipedia.org/wiki/ISO_639-1) format.
+This option can be used to add a test for encoding errors. If the extracted text is
+not in one of the valid languages, it is likely that an encoding error has resulted
+in garbled text.
+
+
+#### convert
+
+```python
+ | @abstractmethod
+ | convert(file_path: Path, meta: Optional[Dict[str, str]]) -> Dict[str, Any]
+```
+
+Convert a file to a dictionary containing the text and any associated metadata.
+
+File converters may extract file metadata such as name or size. In addition,
+user-supplied metadata such as author, URL, or external IDs can be passed in as a dictionary.
+
+**Arguments**:
+
+- `file_path`: path of the file to convert
+- `meta`: dictionary of metadata key-value pairs to attach to the returned document.
+
+
+#### validate\_language
+
+```python
+ | validate_language(text: str) -> bool
+```
+
+Validate whether the language of the text is one of the valid languages.
+
 
 # Module txt
 
@@ -118,65 +177,6 @@ in garbled text.
 a list of pages and the extracted meta data of the file.
 
-
-# Module base
-
-
-## BaseConverter Objects
-
-```python
-class BaseConverter()
-```
-
-Base class for implementing file converts to transform input documents to text format for ingestion in DocumentStore.
-
-
-#### \_\_init\_\_
-
-```python
- | __init__(remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None)
-```
-
-**Arguments**:
-
-- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables.
-The tabular structures in documents might be noise for the reader model if it
-does not have table parsing capability for finding answers. However, tables
-may also have long strings that could possible candidate for searching answers.
-The rows containing strings are thus retained in this option.
-- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1
-(https://en.wikipedia.org/wiki/ISO_639-1) format.
-This option can be used to add test for encoding errors. If the extracted text is
-not one of the valid languages, then it might likely be encoding error resulting
-in garbled text.
-
-
-#### convert
-
-```python
- | @abstractmethod
- | convert(file_path: Path, meta: Optional[Dict[str, str]]) -> Dict[str, Any]
-```
-
-Convert a file to a dictionary containing the text and any associated meta data.
-
-File converters may extract file meta like name or size. In addition to it, user
-supplied meta data like author, url, external IDs can be supplied as a dictionary.
-
-**Arguments**:
-
-- `file_path`: path of the file to convert
-- `meta`: dictionary of meta data key-value pairs to append in the returned document.
-
-
-#### validate\_language
-
-```python
- | validate_language(text: str) -> bool
-```
-
-Validate if the language of the text is one of valid languages.
-
 
 # Module pdf
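Since `BaseConverter` only mandates `convert()`, a custom converter stays small. A hypothetical sketch of the contract documented above; the class name and the exact shape of the returned dict are assumptions for illustration, not part of the API:

```python
from pathlib import Path
from typing import Any, Dict, List, Optional


class PlainTextConverter:
    """Hypothetical converter following the BaseConverter contract above."""

    def __init__(self, remove_numeric_tables: Optional[bool] = None,
                 valid_languages: Optional[List[str]] = None):
        self.remove_numeric_tables = remove_numeric_tables
        self.valid_languages = valid_languages

    def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
        # Read the raw text, extract file metadata (here just the name),
        # and merge in any user-supplied metadata.
        text = Path(file_path).read_text(encoding="utf-8")
        file_meta = {"name": Path(file_path).name}
        return {"text": text, "meta": {**file_meta, **(meta or {})}}
```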
diff --git a/docs/_src/api/api/generator.md b/docs/_src/api/api/generator.md
index 25baee4ad..0004b6eba 100644
--- a/docs/_src/api/api/generator.md
+++ b/docs/_src/api/api/generator.md
@@ -1,3 +1,35 @@
+
+# Module base
+
+
+## BaseGenerator Objects
+
+```python
+class BaseGenerator(ABC)
+```
+
+Abstract class for Generators
+
+
+#### predict
+
+```python
+ | @abstractmethod
+ | predict(query: str, documents: List[Document], top_k: Optional[int]) -> Dict
+```
+
+Abstract method to generate answers.
+
+**Arguments**:
+
+- `query`: Query
+- `documents`: Related documents (e.g. coming from a retriever) that the answer shall be conditioned on.
+- `top_k`: Number of returned answers
+
+**Returns**:
+
+Generated answers plus additional info in a dict
+
 
 # Module transformers
 
@@ -106,35 +138,3 @@ Generated answers plus additional infos in a dict like this:
 
 |     }}]}
 ```
-
-
-# Module base
-
-
-## BaseGenerator Objects
-
-```python
-class BaseGenerator(ABC)
-```
-
-Abstract class for Generators
-
-
-#### predict
-
-```python
- | @abstractmethod
- | predict(query: str, documents: List[Document], top_k: Optional[int]) -> Dict
-```
-
-Abstract method to generate answers.
-
-**Arguments**:
-
-- `query`: Query
-- `documents`: Related documents (e.g. coming from a retriever) that the answer shall be conditioned on.
-- `top_k`: Number of returned answers
-
-**Returns**:
-
-Generated answers plus additional infos in a dict
-
diff --git a/docs/_src/api/api/pipelines.md b/docs/_src/api/api/pipelines.md
index f2e834a4e..dbd32e25c 100644
--- a/docs/_src/api/api/pipelines.md
+++ b/docs/_src/api/api/pipelines.md
@@ -207,6 +207,44 @@ Initialize a Pipeline for Generative Question Answering.
 - `generator`: Generator instance
 - `retriever`: Retriever instance
 
+
+## SearchSummarizationPipeline Objects
+
+```python
+class SearchSummarizationPipeline(BaseStandardPipeline)
+```
+
+
+#### \_\_init\_\_
+
+```python
+ | __init__(summarizer: BaseSummarizer, retriever: BaseRetriever)
+```
+
+Initialize a Pipeline that retrieves documents for a query and then summarizes those documents.
+
+**Arguments**:
+
+- `summarizer`: Summarizer instance
+- `retriever`: Retriever instance
+
+
+#### run
+
+```python
+ | run(query: str, filters: Optional[Dict] = None, top_k_retriever: int = 10, generate_single_summary: bool = False, return_in_answer_format=False)
+```
+
+**Arguments**:
+
+- `query`: Your search query
+- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
+- `top_k_retriever`: Number of top docs the retriever should pass to the summarizer.
+The higher this value, the slower your pipeline.
+- `generate_single_summary`: Whether to generate a single summary from all retrieved docs (True) or one per doc (False).
+- `return_in_answer_format`: Whether the results should be returned as documents (False) or in the answer format used in other QA pipelines (True).
+With the latter, you can use this pipeline as a "drop-in replacement" for other QA pipelines.
+
 
 ## FAQPipeline Objects
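A hypothetical usage sketch of the pipeline documented above. Only the constructor and `run()` signature come from the docs; the import path and the retriever/summarizer instances are assumptions for illustration:

```python
from haystack.pipeline import SearchSummarizationPipeline  # import path assumed


def summarize_search(retriever, summarizer, query: str) -> dict:
    """Retrieve documents for `query`, then summarize them (sketch)."""
    pipeline = SearchSummarizationPipeline(summarizer=summarizer, retriever=retriever)
    return pipeline.run(
        query=query,
        top_k_retriever=5,             # fewer retrieved docs -> faster pipeline
        generate_single_summary=True,  # one summary across all retrieved docs
        return_in_answer_format=True,  # mimic the answer format of other QA pipelines
    )
```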
diff --git a/docs/_src/api/api/reader.md b/docs/_src/api/api/reader.md
index afa6fc428..162cf4cae 100644
--- a/docs/_src/api/api/reader.md
+++ b/docs/_src/api/api/reader.md
@@ -1,3 +1,6 @@
+
+# Module base
+
 
 # Module farm
 
@@ -378,6 +381,3 @@ Example:
 
 Dict containing query and answers
-
-
-# Module base
-
diff --git a/docs/_src/api/api/retriever.md b/docs/_src/api/api/retriever.md
index 7930b5fac..3b7708d6b 100644
--- a/docs/_src/api/api/retriever.md
+++ b/docs/_src/api/api/retriever.md
@@ -1,3 +1,74 @@
+
+# Module base
+
+
+## BaseRetriever Objects
+
+```python
+class BaseRetriever(ABC)
+```
+
+
+#### retrieve
+
+```python
+ | @abstractmethod
+ | retrieve(query: str, filters: dict = None, top_k: int = 10, index: str = None) -> List[Document]
+```
+
+Scan through documents in the DocumentStore and return a small number of documents
+that are most relevant to the query.
+
+**Arguments**:
+
+- `query`: The query
+- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
+- `top_k`: How many documents to return per query.
+- `index`: The name of the index in the DocumentStore from which to retrieve documents
+
+
+#### timing
+
+```python
+ | timing(fn)
+```
+
+Wrapper method used to time functions.
+
+
+#### eval
+
+```python
+ | eval(label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold_label", top_k: int = 10, open_domain: bool = False, return_preds: bool = False) -> dict
+```
+
+Performs evaluation on the Retriever.
+The Retriever is evaluated based on whether it finds the correct document given the query string and at which
+position in the ranking of documents the correct document is.
+
+| Returns a dict containing the following metrics:
+
+- "recall": Proportion of questions for which the correct document is among the retrieved documents
+- "mrr": Mean of reciprocal rank. Rewards retrievers that give relevant documents a higher rank.
+Only considers the highest-ranked relevant document.
+- "map": Mean of average precision for each question. Rewards retrievers that give relevant
+documents a higher rank. Considers all retrieved relevant documents. If ``open_domain=True``,
+average precision is normalized by the number of retrieved relevant documents per query.
+If ``open_domain=False``, average precision is normalized by the number of all relevant documents
+per query.
+
+**Arguments**:
+
+- `label_index`: Index/Table in the DocumentStore where labeled questions are stored
+- `doc_index`: Index/Table in the DocumentStore where documents that are used for evaluation are stored
+- `top_k`: How many documents to return per query
+- `open_domain`: If ``True``, retrieval will be evaluated by checking if the answer string to a question is
+contained in the retrieved docs (common approach in open-domain QA).
+If ``False``, retrieval uses a stricter evaluation that checks if the retrieved document ids
+are within ids explicitly stated in the labels.
+- `return_preds`: Whether to add predictions to the returned dictionary. If ``True``, the returned dictionary
+contains the keys "predictions" and "metrics".
+
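The reciprocal-rank metric described above is compact enough to state in code. A self-contained sketch under the stated definition (toy data; this is not the library's implementation):

```python
from typing import List


def mean_reciprocal_rank(ranked_relevance: List[List[bool]]) -> float:
    """MRR as described above: per query, take 1/rank of the highest-ranked
    relevant document (0 if none was retrieved), then average over queries."""
    total = 0.0
    for flags in ranked_relevance:
        reciprocal_rank = 0.0
        for rank, is_relevant in enumerate(flags, start=1):
            if is_relevant:
                reciprocal_rank = 1.0 / rank
                break
        total += reciprocal_rank
    return total / len(ranked_relevance)


# Toy example: query 1 hits at rank 2, query 2 at rank 1 -> (0.5 + 1.0) / 2
print(mean_reciprocal_rank([[False, True, False], [True, False]]))  # 0.75
```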
 # Module sparse
 
@@ -408,74 +479,3 @@ Create embeddings for a list of passages.
 For this Retriever type: The same as calling .embed()
 
 Embeddings, one per input passage
 
-
-# Module base
-
-
-## BaseRetriever Objects
-
-```python
-class BaseRetriever(ABC)
-```
-
-
-#### retrieve
-
-```python
- | @abstractmethod
- | retrieve(query: str, filters: dict = None, top_k: int = 10, index: str = None) -> List[Document]
-```
-
-Scan through documents in DocumentStore and return a small number documents
-that are most relevant to the query.
-
-**Arguments**:
-
-- `query`: The query
-- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
-- `top_k`: How many documents to return per query.
-- `index`: The name of the index in the DocumentStore from which to retrieve documents
-
-
-#### timing
-
-```python
- | timing(fn)
-```
-
-Wrapper method used to time functions.
-
-
-#### eval
-
-```python
- | eval(label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold_label", top_k: int = 10, open_domain: bool = False, return_preds: bool = False) -> dict
-```
-
-Performs evaluation on the Retriever.
-Retriever is evaluated based on whether it finds the correct document given the query string and at which
-position in the ranking of documents the correct document is.
-
-| Returns a dict containing the following metrics:
-
-- "recall": Proportion of questions for which correct document is among retrieved documents
-- "mrr": Mean of reciprocal rank. Rewards retrievers that give relevant documents a higher rank.
-Only considers the highest ranked relevant document.
-- "map": Mean of average precision for each question. Rewards retrievers that give relevant
-documents a higher rank. Considers all retrieved relevant documents. If ``open_domain=True``,
-average precision is normalized by the number of retrieved relevant documents per query.
-If ``open_domain=False``, average precision is normalized by the number of all relevant documents
-per query.
-
-**Arguments**:
-
-- `label_index`: Index/Table in DocumentStore where labeled questions are stored
-- `doc_index`: Index/Table in DocumentStore where documents that are used for evaluation are stored
-- `top_k`: How many documents to return per query
-- `open_domain`: If ``True``, retrieval will be evaluated by checking if the answer string to a question is
-contained in the retrieved docs (common approach in open-domain QA).
-If ``False``, retrieval uses a stricter evaluation that checks if the retrieved document ids
-are within ids explicitly stated in the labels.
-- `return_preds`: Whether to add predictions in the returned dictionary. If True, the returned dictionary
-contains the keys "predictions" and "metrics".
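To close, a hypothetical call against the `retrieve()` contract documented above; the retriever instance and the metadata field name are stand-ins, only the parameters come from the docs:

```python
from typing import List


def find_physics_docs(retriever) -> List:
    """Sketch of a retrieve() call per the BaseRetriever contract above."""
    return retriever.retrieve(
        query="Who developed the theory of general relativity?",
        filters={"category": ["physics"]},  # metadata field -> list of accepted values
        top_k=5,                            # documents returned for this query
    )
```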