diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 80db5055c..74f56663d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -33,12 +33,12 @@ jobs:
         uses: actions/cache@v2
         with:
           path: ${{ env.pythonLocation }}
-          key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}
+          key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
       - name: Install dependencies
         if: steps.cache-python-env.outputs.cache-hit != 'true'
         run: |
           python -m pip install --upgrade pip
-          pip install pytest
+          pip install --upgrade --upgrade-strategy eager -r requirements-dev.txt -e .
           pip install --upgrade --upgrade-strategy eager -r requirements.txt -e .

   prepare-build:
@@ -69,12 +69,12 @@ jobs:
         uses: actions/cache@v2
         with:
           path: ${{ env.pythonLocation }}
-          key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}
+          key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
       - name: Run Elasticsearch
         run: docker run -d -p 9200:9200 -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms128m -Xmx128m" elasticsearch:7.9.2
       - name: Run Milvus
-        run: docker run -d -p 19530:19530 -p 19121:19121 milvusdb/milvus:0.10.5-cpu-d010621-4eda95
+        run: docker run -d -p 19530:19530 -p 19121:19121 milvusdb/milvus:1.0.0-cpu-d030521-1ea92e
       - name: Run Apache Tika
         run: docker run -d -p 9998:9998 -e "TIKA_CHILD_JAVA_OPTS=-JXms128m" -e "TIKA_CHILD_JAVA_OPTS=-JXmx128m" apache/tika:1.24.1
diff --git a/README.md b/README.md
index 15ead018a..7efeb8c00 100644
--- a/README.md
+++ b/README.md
@@ -104,7 +104,7 @@ haystack-api_1  | [2021-01-01 10:21:58 +0000] [17] [INFO] Application startup c

 You should see the following:

-![image](https://github.com/deepset-ai/haystack/blob/master/docs/_src/img/streamlit_ui_screenshot.png)
+![image](https://raw.githubusercontent.com/deepset-ai/haystack/master/docs/_src/img/streamlit_ui_screenshot.png)

 You can then try different queries against a pre-defined set of indexed articles related to Game of Thrones.
diff --git a/docs/_src/api/api/file_converter.md b/docs/_src/api/api/file_converter.md
index c85b330f8..f6c0fa5b7 100644
--- a/docs/_src/api/api/file_converter.md
+++ b/docs/_src/api/api/file_converter.md
@@ -35,7 +35,7 @@ in garbled text.

 ```python
  | @abstractmethod
- | convert(file_path: Path, meta: Optional[Dict[str, str]], remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None) -> Dict[str, Any]
+ | convert(file_path: Path, meta: Optional[Dict[str, str]], remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8") -> Dict[str, Any]
 ```

 Convert a file to a dictionary containing the text and any associated meta data.
@@ -57,6 +57,7 @@ The rows containing strings are thus retained in this option.
 This option can be used to add test for encoding errors. If the extracted text is
 not one of the valid languages, then it might likely be encoding error resulting
 in garbled text.
+- `encoding`: Select the file encoding (default is `utf-8`)

 #### validate\_language

@@ -101,7 +102,7 @@ in garbled text.
 #### convert

 ```python
- | convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: str = "utf-8") -> Dict[str, Any]
+ | convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8") -> Dict[str, Any]
 ```

 Reads text from a txt file and executes optional preprocessing steps.
@@ -120,6 +121,7 @@ The rows containing strings are thus retained in this option.
 This option can be used to add test for encoding errors. If the extracted text is
 not one of the valid languages, then it might likely be encoding error resulting
 in garbled text.
+- `encoding`: Select the file encoding (default is `utf-8`)

 **Returns**:

@@ -139,7 +141,7 @@ class DocxToTextConverter(BaseConverter)

 #### convert

 ```python
- | convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None) -> Dict[str, Any]
+ | convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None) -> Dict[str, Any]
 ```

 Extract text from a .docx file.
@@ -160,6 +162,7 @@ The rows containing strings are thus retained in this option.
 This option can be used to add test for encoding errors. If the extracted text is
 not one of the valid languages, then it might likely be encoding error resulting
 in garbled text.
+- `encoding`: Not applicable

 # Module tika
@@ -196,7 +199,7 @@ in garbled text.

 #### convert

 ```python
- | convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None) -> Dict[str, Any]
+ | convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None) -> Dict[str, Any]
 ```

 **Arguments**:

@@ -213,6 +216,7 @@ The rows containing strings are thus retained in this option.
 This option can be used to add test for encoding errors. If the extracted text is
 not one of the valid languages, then it might likely be encoding error resulting
 in garbled text.
+- `encoding`: Not applicable

 **Returns**:

@@ -252,7 +256,7 @@ in garbled text.
 #### convert

 ```python
- | convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: str = "Latin1") -> Dict[str, Any]
+ | convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "Latin1") -> Dict[str, Any]
 ```

 Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html)
diff --git a/haystack/file_converter/__init__.py b/haystack/file_converter/__init__.py
index d55c5b39c..2da312836 100644
--- a/haystack/file_converter/__init__.py
+++ b/haystack/file_converter/__init__.py
@@ -2,3 +2,4 @@ from haystack.file_converter.docx import DocxToTextConverter
 from haystack.file_converter.pdf import PDFToTextConverter
 from haystack.file_converter.tika import TikaConverter
 from haystack.file_converter.txt import TextConverter
+from haystack.file_converter.markdown import MarkdownConverter
diff --git a/haystack/file_converter/base.py b/haystack/file_converter/base.py
index d8daa9623..b4fae82e9 100644
--- a/haystack/file_converter/base.py
+++ b/haystack/file_converter/base.py
@@ -37,6 +37,7 @@ class BaseConverter(BaseComponent):
         meta: Optional[Dict[str, str]],
         remove_numeric_tables: Optional[bool] = None,
         valid_languages: Optional[List[str]] = None,
+        encoding: Optional[str] = "utf-8",
     ) -> Dict[str, Any]:
         """
         Convert a file to a dictionary containing the text and any associated meta data.
@@ -56,6 +57,7 @@ class BaseConverter(BaseComponent):
         This option can be used to add test for encoding errors. If the extracted text is
         not one of the valid languages, then it might likely be encoding error resulting
         in garbled text.
+        :param encoding: Select the file encoding (default is `utf-8`)
         """
         pass
diff --git a/haystack/file_converter/docx.py b/haystack/file_converter/docx.py
index bef961968..742e26f84 100644
--- a/haystack/file_converter/docx.py
+++ b/haystack/file_converter/docx.py
@@ -16,6 +16,7 @@ class DocxToTextConverter(BaseConverter):
         meta: Optional[Dict[str, str]] = None,
         remove_numeric_tables: Optional[bool] = None,
         valid_languages: Optional[List[str]] = None,
+        encoding: Optional[str] = None,
     ) -> Dict[str, Any]:
         """
         Extract text from a .docx file.
@@ -34,6 +35,7 @@ class DocxToTextConverter(BaseConverter):
         This option can be used to add test for encoding errors. If the extracted text is
         not one of the valid languages, then it might likely be encoding error resulting
         in garbled text.
+        :param encoding: Not applicable
         """
         if remove_numeric_tables is None:
             remove_numeric_tables = self.remove_numeric_tables
diff --git a/haystack/file_converter/markdown.py b/haystack/file_converter/markdown.py
new file mode 100644
index 000000000..7b53cc9b4
--- /dev/null
+++ b/haystack/file_converter/markdown.py
@@ -0,0 +1,68 @@
+import logging
+import re
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from haystack.file_converter.base import BaseConverter
+
+logger = logging.getLogger(__name__)
+
+
+class MarkdownConverter(BaseConverter):
+    def convert(
+        self,
+        file_path: Path,
+        meta: Optional[Dict[str, str]] = None,
+        remove_numeric_tables: Optional[bool] = None,
+        valid_languages: Optional[List[str]] = None,
+        encoding: Optional[str] = "utf-8",
+    ) -> Dict[str, Any]:
+        """
+        Reads text from a markdown file and executes optional preprocessing steps.
+
+        :param file_path: path of the file to convert
+        :param meta: dictionary of meta data key-value pairs to append in the returned document.
+        :param encoding: Select the file encoding (default is `utf-8`)
+        :param remove_numeric_tables: Not applicable
+        :param valid_languages: Not applicable
+
+        :return: Dict of format {"text": "The text from file", "meta": meta}
+        """
+        with open(file_path, encoding=encoding, errors="ignore") as f:
+            markdown_text = f.read()
+        text = self.markdown_to_text(markdown_text)
+        document = {"text": text, "meta": meta}
+        return document
+
+    # Following code snippet is copied from https://gist.github.com/lorey/eb15a7f3338f959a78cc3661fbc255fe
+    @staticmethod
+    def markdown_to_text(markdown_string: str) -> str:
+        """
+        Converts a markdown string to plaintext
+
+        :param markdown_string: String in markdown format
+        """
+        try:
+            from bs4 import BeautifulSoup
+        except ImportError:
+            raise ImportError("Can't find package `beautifulsoup4` \n"
+                              "You can install it via `pip install beautifulsoup4`")
+
+        try:
+            from markdown import markdown
+        except ImportError:
+            raise ImportError("Can't find package `markdown` \n"
+                              "You can install it via `pip install markdown`")
+
+        # md -> html -> text since BeautifulSoup can extract text cleanly
+        html = markdown(markdown_string)
+
+        # remove code snippets
+        html = re.sub(r'<pre>(.*?)</pre>', ' ', html)
+        html = re.sub(r'<code>(.*?)</code>', ' ', html)
+
+        # extract text
+        soup = BeautifulSoup(html, "html.parser")
+        text = ''.join(soup.findAll(text=True))
+
+        return text
diff --git a/haystack/file_converter/pdf.py b/haystack/file_converter/pdf.py
index 9c5c61f5d..03c9bd3fe 100644
--- a/haystack/file_converter/pdf.py
+++ b/haystack/file_converter/pdf.py
@@ -46,7 +46,7 @@ class PDFToTextConverter(BaseConverter):
         meta: Optional[Dict[str, str]] = None,
         remove_numeric_tables: Optional[bool] = None,
         valid_languages: Optional[List[str]] = None,
-        encoding: str = "Latin1",
+        encoding: Optional[str] = "Latin1",
     ) -> Dict[str, Any]:
         """
         Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html)
@@ -118,7 +118,7 @@ class PDFToTextConverter(BaseConverter):
         document = {"text": text, "meta": meta}
         return document

-    def _read_pdf(self, file_path: Path, layout: bool, encoding: str) -> List[str]:
+    def _read_pdf(self, file_path: Path, layout: bool, encoding: Optional[str] = "Latin1") -> List[str]:
         """
         Extract pages from the pdf file at file_path.

@@ -130,7 +130,7 @@ class PDFToTextConverter(BaseConverter):
             command = ["pdftotext", "-enc", encoding, "-layout", str(file_path), "-"]
         else:
             command = ["pdftotext", "-enc", encoding, str(file_path), "-"]
-        output = subprocess.run(command, stdout=subprocess.PIPE, shell=False)
+        output = subprocess.run(command, stdout=subprocess.PIPE, shell=False)  # type: ignore
         document = output.stdout.decode(errors="ignore")
         pages = document.split("\f")
         pages = pages[:-1]  # the last page in the split is always empty.
diff --git a/haystack/file_converter/tika.py b/haystack/file_converter/tika.py
index bec82965e..8c1f27ae4 100644
--- a/haystack/file_converter/tika.py
+++ b/haystack/file_converter/tika.py
@@ -71,6 +71,7 @@ class TikaConverter(BaseConverter):
         meta: Optional[Dict[str, str]] = None,
         remove_numeric_tables: Optional[bool] = None,
         valid_languages: Optional[List[str]] = None,
+        encoding: Optional[str] = None,
     ) -> Dict[str, Any]:
         """
         :param file_path: path of the file to convert
@@ -80,11 +81,12 @@ class TikaConverter(BaseConverter):
         does not have table parsing capability for finding answers. However, tables
         may also have long strings that could possible candidate for searching answers.
         The rows containing strings are thus retained in this option.
-        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
+        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
         (https://en.wikipedia.org/wiki/ISO_639-1) format.
         This option can be used to add test for encoding errors. If the extracted text is
         not one of the valid languages, then it might likely be encoding error resulting
         in garbled text.
+        :param encoding: Not applicable

         :return: a list of pages and the extracted meta data of the file.
         """
diff --git a/haystack/file_converter/txt.py b/haystack/file_converter/txt.py
index 8cc97050c..fa1160fee 100644
--- a/haystack/file_converter/txt.py
+++ b/haystack/file_converter/txt.py
@@ -30,7 +30,7 @@ class TextConverter(BaseConverter):
         meta: Optional[Dict[str, str]] = None,
         remove_numeric_tables: Optional[bool] = None,
         valid_languages: Optional[List[str]] = None,
-        encoding: str = "utf-8",
+        encoding: Optional[str] = "utf-8",
     ) -> Dict[str, Any]:
         """
         Reads text from a txt file and executes optional preprocessing steps.
@@ -47,6 +47,7 @@ class TextConverter(BaseConverter):
         This option can be used to add test for encoding errors.
         If the extracted text is not one of the valid languages, then it might likely be
         encoding error resulting in garbled text.
+        :param encoding: Select the file encoding (default is `utf-8`)

         :return: Dict of format {"text": "The text from file", "meta": meta}}

@@ -87,7 +88,7 @@ class TextConverter(BaseConverter):
                     f"been decoded in the correct text format."
                 )

-        text = "".join(pages)
+        text = "".join(cleaned_pages)
         document = {"text": text, "meta": meta}
         return document
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 000000000..3ce02c73b
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1,8 @@
+# Add extra dependencies only required for tests and local dev setup
+mypy
+pytest
+sentence-transformers
+selenium
+webdriver-manager
+beautifulsoup4
+markdown
diff --git a/requirements.txt b/requirements.txt
index 9cfd9fb6a..fa84c0e02 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,6 +14,7 @@ langdetect # for PDF conversions
 # optional: sentence-transformers
 python-multipart
 python-docx
+sqlalchemy>=1.4.2
 sqlalchemy_utils
 # for using FAISS with GPUs, install faiss-gpu
 faiss-cpu>=1.6.3
@@ -27,4 +28,4 @@ networkx
 pymilvus
 # Optional: For crawling
 #selenium
-#webdriver-manager
\ No newline at end of file
+#webdriver-manager
diff --git a/test/samples/markdown/sample.md b/test/samples/markdown/sample.md
new file mode 100644
index 000000000..7c443d2d3
--- /dev/null
+++ b/test/samples/markdown/sample.md
@@ -0,0 +1,55 @@
+## What to build with Haystack
+
+- **Ask questions in natural language** and find granular answers in your own documents.
+- Perform **semantic search** and retrieve documents according to meaning, not keywords.
+- Use **off-the-shelf models** or **fine-tune** them to your own domain.
+- Use **user feedback** to evaluate, benchmark and continuously improve your live models.
+- Leverage existing **knowledge bases** and better handle the long tail of queries that **chatbots** receive.
+- **Automate processes** by automatically applying a list of questions to new documents and using the extracted answers.
+
+![Logo](https://raw.githubusercontent.com/deepset-ai/haystack/master/docs/_src/img/logo_white_background.png)
+
+
+## Core Features
+
+- **Latest models**: Utilize all latest transformer-based models (e.g. BERT, RoBERTa, MiniLM) for extractive QA, generative QA and document retrieval.
+- **Modular**: Multiple choices to fit your tech stack and use case. Pick your favorite database, file converter or modeling framework.
+- **Open**: 100% compatible with HuggingFace's model hub. Tight interfaces to other frameworks (e.g. Transformers, FARM, sentence-transformers)
+- **Scalable**: Scale to millions of docs via retrievers, production-ready backends like Elasticsearch / FAISS and a fastAPI REST API
+- **End-to-End**: All tooling in one place: file conversion, cleaning, splitting, training, eval, inference, labeling ...
+- **Developer friendly**: Easy to debug, extend and modify.
+- **Customizable**: Fine-tune models to your own domain or implement your custom DocumentStore.
+- **Continuous Learning**: Collect new training data via user feedback in production & improve your models continuously
+
+|  |  |
+|-|-|
+| :ledger: [Docs](https://haystack.deepset.ai/docs/intromd) | Usage, Guides, API documentation ...|
+| :beginner: [Quick Demo](https://github.com/deepset-ai/haystack/#quick-demo) | Quickly see what Haystack offers |
+| :floppy_disk: [Installation](https://github.com/deepset-ai/haystack/#installation) | How to install Haystack |
+| :art: [Key Components](https://github.com/deepset-ai/haystack/#key-components) | Overview of core concepts |
+| :mortar_board: [Tutorials](https://github.com/deepset-ai/haystack/#tutorials) | Jupyter/Colab Notebooks & Scripts |
+| :eyes: [How to use Haystack](https://github.com/deepset-ai/haystack/#how-to-use-haystack) | Basic explanation of concepts, options and usage |
+| :heart: [Contributing](https://github.com/deepset-ai/haystack/#heart-contributing) | We welcome all contributions! |
+| :bar_chart: [Benchmarks](https://haystack.deepset.ai/bm/benchmarks) | Speed & Accuracy of Retriever, Readers and DocumentStores |
+| :telescope: [Roadmap](https://haystack.deepset.ai/docs/latest/roadmapmd) | Public roadmap of Haystack |
+| :pray: [Slack](https://haystack.deepset.ai/community/join) | Join our community on Slack |
+| :bird: [Twitter](https://twitter.com/deepset_ai) | Follow us on Twitter for news and updates |
+| :newspaper: [Blog](https://medium.com/deepset-ai) | Read our articles on Medium |
+
+
+## Quick Demo
+
+The quickest way to see what Haystack offers is to start a [Docker Compose](https://docs.docker.com/compose/) demo application:
+
+**1. Update/install Docker and Docker Compose, then launch Docker**
+
+```
+    # apt-get update && apt-get install docker && apt-get install docker-compose
+    # service docker start
+```
+
+**2. Clone Haystack repository**
+
+```
+    # git clone https://github.com/deepset-ai/haystack.git
+```
diff --git a/test/test_file_converter.py b/test/test_file_converter.py
index 7870953c6..ab2d54502 100644
--- a/test/test_file_converter.py
+++ b/test/test_file_converter.py
@@ -2,6 +2,7 @@ from pathlib import Path

 import pytest

+from haystack.file_converter import MarkdownConverter
 from haystack.file_converter.docx import DocxToTextConverter
 from haystack.file_converter.pdf import PDFToTextConverter
 from haystack.file_converter.tika import TikaConverter
@@ -50,3 +51,9 @@ def test_docx_converter():
     converter = DocxToTextConverter()
     document = converter.convert(file_path=Path("samples/docx/sample_docx.docx"))
     assert document["text"].startswith("Sample Docx File")
+
+
+def test_markdown_converter():
+    converter = MarkdownConverter()
+    document = converter.convert(file_path=Path("samples/markdown/sample.md"))
+    assert document["text"].startswith("What to build with Haystack")
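
For reviewers who want to try the new converter outside the test suite, here is a minimal usage sketch. It only relies on calls introduced in this diff (`MarkdownConverter.convert` and its return dict); the file path assumes you run from the repository root, and the `meta` dict is an illustrative assumption rather than anything the API requires:

```python
# Minimal sketch: exercising the MarkdownConverter added in this diff.
# Requires `pip install beautifulsoup4 markdown` (see requirements-dev.txt).
from pathlib import Path

from haystack.file_converter import MarkdownConverter

converter = MarkdownConverter()
document = converter.convert(
    file_path=Path("test/samples/markdown/sample.md"),  # fixture added in this diff
    meta={"name": "sample.md"},  # hypothetical metadata; convert() attaches it verbatim
)

# convert() returns {"text": ..., "meta": ...}; markdown syntax plus <pre> and
# <code> snippets are stripped via the md -> html -> text round trip.
assert document["text"].startswith("What to build with Haystack")
print(document["text"][:100])
```

Because `markdown_to_text` imports `bs4` and `markdown` lazily inside the method, both packages stay optional for users who never convert markdown files; they are pulled in via `requirements-dev.txt` only for tests and local development.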