mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-08-31 11:56:35 +00:00
Add Markdown file converter (#875)
This commit is contained in:
parent
47dc069afe
commit
e904deefa7
8
.github/workflows/ci.yml
vendored
8
.github/workflows/ci.yml
vendored
@ -33,12 +33,12 @@ jobs:
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: ${{ env.pythonLocation }}
|
||||
key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}
|
||||
key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
|
||||
- name: Install dependencies
|
||||
if: steps.cache-python-env.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install pytest
|
||||
pip install --upgrade --upgrade-strategy eager -r requirements-dev.txt -e .
|
||||
pip install --upgrade --upgrade-strategy eager -r requirements.txt -e .
|
||||
|
||||
prepare-build:
|
||||
@ -69,12 +69,12 @@ jobs:
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: ${{ env.pythonLocation }}
|
||||
key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}
|
||||
key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
|
||||
- name: Run Elasticsearch
|
||||
run: docker run -d -p 9200:9200 -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms128m -Xmx128m" elasticsearch:7.9.2
|
||||
|
||||
- name: Run Milvus
|
||||
run: docker run -d -p 19530:19530 -p 19121:19121 milvusdb/milvus:0.10.5-cpu-d010621-4eda95
|
||||
run: docker run -d -p 19530:19530 -p 19121:19121 milvusdb/milvus:1.0.0-cpu-d030521-1ea92e
|
||||
|
||||
- name: Run Apache Tika
|
||||
run: docker run -d -p 9998:9998 -e "TIKA_CHILD_JAVA_OPTS=-JXms128m" -e "TIKA_CHILD_JAVA_OPTS=-JXmx128m" apache/tika:1.24.1
|
||||
|
@ -104,7 +104,7 @@ haystack-api_1 | [2021-01-01 10:21:58 +0000] [17] [INFO] Application startup c
|
||||
|
||||
You should see the following:
|
||||
|
||||

|
||||

|
||||
|
||||
You can then try different queries against a pre-defined set of indexed articles related to Game of Thrones.
|
||||
|
||||
|
@ -35,7 +35,7 @@ in garbled text.
|
||||
|
||||
```python
|
||||
| @abstractmethod
|
||||
| convert(file_path: Path, meta: Optional[Dict[str, str]], remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None) -> Dict[str, Any]
|
||||
| convert(file_path: Path, meta: Optional[Dict[str, str]], remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8") -> Dict[str, Any]
|
||||
```
|
||||
|
||||
Convert a file to a dictionary containing the text and any associated meta data.
|
||||
@ -57,6 +57,7 @@ The rows containing strings are thus retained in this option.
|
||||
This option can be used to add test for encoding errors. If the extracted text is
|
||||
not one of the valid languages, then it might likely be encoding error resulting
|
||||
in garbled text.
|
||||
- `encoding`: Select the file encoding (default is `utf-8`)
|
||||
|
||||
<a name="base.BaseConverter.validate_language"></a>
|
||||
#### validate\_language
|
||||
@ -101,7 +102,7 @@ in garbled text.
|
||||
#### convert
|
||||
|
||||
```python
|
||||
| convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: str = "utf-8") -> Dict[str, Any]
|
||||
| convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8") -> Dict[str, Any]
|
||||
```
|
||||
|
||||
Reads text from a txt file and executes optional preprocessing steps.
|
||||
@ -120,6 +121,7 @@ The rows containing strings are thus retained in this option.
|
||||
This option can be used to add test for encoding errors. If the extracted text is
|
||||
not one of the valid languages, then it might likely be encoding error resulting
|
||||
in garbled text.
|
||||
- `encoding`: Select the file encoding (default is `utf-8`)
|
||||
|
||||
**Returns**:
|
||||
|
||||
@ -139,7 +141,7 @@ class DocxToTextConverter(BaseConverter)
|
||||
#### convert
|
||||
|
||||
```python
|
||||
| convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None) -> Dict[str, Any]
|
||||
| convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None) -> Dict[str, Any]
|
||||
```
|
||||
|
||||
Extract text from a .docx file.
|
||||
@ -160,6 +162,7 @@ The rows containing strings are thus retained in this option.
|
||||
This option can be used to add test for encoding errors. If the extracted text is
|
||||
not one of the valid languages, then it might likely be encoding error resulting
|
||||
in garbled text.
|
||||
- `encoding`: Not applicable
|
||||
|
||||
<a name="tika"></a>
|
||||
# Module tika
|
||||
@ -196,7 +199,7 @@ in garbled text.
|
||||
#### convert
|
||||
|
||||
```python
|
||||
| convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None) -> Dict[str, Any]
|
||||
| convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None) -> Dict[str, Any]
|
||||
```
|
||||
|
||||
**Arguments**:
|
||||
@ -213,6 +216,7 @@ The rows containing strings are thus retained in this option.
|
||||
This option can be used to add test for encoding errors. If the extracted text is
|
||||
not one of the valid languages, then it might likely be encoding error resulting
|
||||
in garbled text.
|
||||
- `encoding`: Not applicable
|
||||
|
||||
**Returns**:
|
||||
|
||||
@ -252,7 +256,7 @@ in garbled text.
|
||||
#### convert
|
||||
|
||||
```python
|
||||
| convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: str = "Latin1") -> Dict[str, Any]
|
||||
| convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "Latin1") -> Dict[str, Any]
|
||||
```
|
||||
|
||||
Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html)
|
||||
|
@ -2,3 +2,4 @@ from haystack.file_converter.docx import DocxToTextConverter
|
||||
from haystack.file_converter.pdf import PDFToTextConverter
|
||||
from haystack.file_converter.tika import TikaConverter
|
||||
from haystack.file_converter.txt import TextConverter
|
||||
from haystack.file_converter.markdown import MarkdownConverter
|
||||
|
@ -37,6 +37,7 @@ class BaseConverter(BaseComponent):
|
||||
meta: Optional[Dict[str, str]],
|
||||
remove_numeric_tables: Optional[bool] = None,
|
||||
valid_languages: Optional[List[str]] = None,
|
||||
encoding: Optional[str] = "utf-8",
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Convert a file to a dictionary containing the text and any associated meta data.
|
||||
@ -56,6 +57,7 @@ class BaseConverter(BaseComponent):
|
||||
This option can be used to add test for encoding errors. If the extracted text is
|
||||
not one of the valid languages, then it might likely be encoding error resulting
|
||||
in garbled text.
|
||||
:param encoding: Select the file encoding (default is `utf-8`)
|
||||
"""
|
||||
pass
|
||||
|
||||
|
@ -16,6 +16,7 @@ class DocxToTextConverter(BaseConverter):
|
||||
meta: Optional[Dict[str, str]] = None,
|
||||
remove_numeric_tables: Optional[bool] = None,
|
||||
valid_languages: Optional[List[str]] = None,
|
||||
encoding: Optional[str] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Extract text from a .docx file.
|
||||
@ -34,6 +35,7 @@ class DocxToTextConverter(BaseConverter):
|
||||
This option can be used to add test for encoding errors. If the extracted text is
|
||||
not one of the valid languages, then it might likely be encoding error resulting
|
||||
in garbled text.
|
||||
:param encoding: Not applicable
|
||||
"""
|
||||
if remove_numeric_tables is None:
|
||||
remove_numeric_tables = self.remove_numeric_tables
|
||||
|
68
haystack/file_converter/markdown.py
Normal file
68
haystack/file_converter/markdown.py
Normal file
@ -0,0 +1,68 @@
|
||||
import logging
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from haystack.file_converter.base import BaseConverter
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MarkdownConverter(BaseConverter):
    def convert(
        self,
        file_path: Path,
        meta: Optional[Dict[str, str]] = None,
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = "utf-8",
    ) -> Dict[str, Any]:
        """
        Reads text from a markdown file and executes optional preprocessing steps.

        :param file_path: path of the file to convert
        :param meta: dictionary of meta data key-value pairs to append in the returned document.
        :param remove_numeric_tables: Not applicable
        :param valid_languages: Not applicable
        :param encoding: Select the file encoding (default is `utf-8`)

        :return: Dict of format {"text": "The text from file", "meta": meta}
        """
        with open(file_path, encoding=encoding, errors="ignore") as f:
            markdown_text = f.read()
        text = self.markdown_to_text(markdown_text)
        document = {"text": text, "meta": meta}
        return document

    # Following code snippet is adapted from https://gist.github.com/lorey/eb15a7f3338f959a78cc3661fbc255fe
    @staticmethod
    def markdown_to_text(markdown_string: str) -> str:
        """
        Converts a markdown string to plaintext.

        Code blocks (`<pre>`/`<code>` after rendering) are stripped entirely;
        the remaining markup is rendered to HTML and the visible text extracted.

        :param markdown_string: String in markdown format
        :raises ImportError: if `beautifulsoup4` or `markdown` is not installed
        """
        try:
            from bs4 import BeautifulSoup
        except ImportError:
            raise ImportError("Can't find package `beautifulsoup4` \n"
                              "You can install it via `pip install beautifulsoup4`")

        try:
            from markdown import markdown
        except ImportError:
            raise ImportError("Can't find package `markdown` \n"
                              "You can install it via `pip install markdown`")

        # md -> html -> text since BeautifulSoup can extract text cleanly
        html = markdown(markdown_string)

        # remove code snippets; re.DOTALL so multi-line blocks match too.
        # Note: the original pattern `</code >` contained a stray space and
        # never matched a real closing tag — fixed here.
        html = re.sub(r'<pre>(.*?)</pre>', ' ', html, flags=re.DOTALL)
        html = re.sub(r'<code>(.*?)</code>', ' ', html, flags=re.DOTALL)

        # extract text
        soup = BeautifulSoup(html, "html.parser")
        text = ''.join(soup.findAll(text=True))

        return text
|
@ -46,7 +46,7 @@ class PDFToTextConverter(BaseConverter):
|
||||
meta: Optional[Dict[str, str]] = None,
|
||||
remove_numeric_tables: Optional[bool] = None,
|
||||
valid_languages: Optional[List[str]] = None,
|
||||
encoding: str = "Latin1",
|
||||
encoding: Optional[str] = "Latin1",
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html)
|
||||
@ -118,7 +118,7 @@ class PDFToTextConverter(BaseConverter):
|
||||
document = {"text": text, "meta": meta}
|
||||
return document
|
||||
|
||||
def _read_pdf(self, file_path: Path, layout: bool, encoding: str) -> List[str]:
|
||||
def _read_pdf(self, file_path: Path, layout: bool, encoding: Optional[str] = "Latin1") -> List[str]:
|
||||
"""
|
||||
Extract pages from the pdf file at file_path.
|
||||
|
||||
@ -130,7 +130,7 @@ class PDFToTextConverter(BaseConverter):
|
||||
command = ["pdftotext", "-enc", encoding, "-layout", str(file_path), "-"]
|
||||
else:
|
||||
command = ["pdftotext", "-enc", encoding, str(file_path), "-"]
|
||||
output = subprocess.run(command, stdout=subprocess.PIPE, shell=False)
|
||||
output = subprocess.run(command, stdout=subprocess.PIPE, shell=False) # type: ignore
|
||||
document = output.stdout.decode(errors="ignore")
|
||||
pages = document.split("\f")
|
||||
pages = pages[:-1] # the last page in the split is always empty.
|
||||
|
@ -71,6 +71,7 @@ class TikaConverter(BaseConverter):
|
||||
meta: Optional[Dict[str, str]] = None,
|
||||
remove_numeric_tables: Optional[bool] = None,
|
||||
valid_languages: Optional[List[str]] = None,
|
||||
encoding: Optional[str] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
:param file_path: path of the file to convert
|
||||
@ -80,11 +81,12 @@ class TikaConverter(BaseConverter):
|
||||
does not have table parsing capability for finding answers. However, tables
|
||||
may also have long strings that could possible candidate for searching answers.
|
||||
The rows containing strings are thus retained in this option.
|
||||
:param valid_languages: validate languages from a list of languages specified in the ISO 639-1
|
||||
:param valid_languages: validate languages from a list of languages specified in the ISO 639-1
|
||||
(https://en.wikipedia.org/wiki/ISO_639-1) format.
|
||||
This option can be used to add test for encoding errors. If the extracted text is
|
||||
not one of the valid languages, then it might likely be encoding error resulting
|
||||
in garbled text.
|
||||
:param encoding: Not applicable
|
||||
|
||||
:return: a list of pages and the extracted meta data of the file.
|
||||
"""
|
||||
|
@ -30,7 +30,7 @@ class TextConverter(BaseConverter):
|
||||
meta: Optional[Dict[str, str]] = None,
|
||||
remove_numeric_tables: Optional[bool] = None,
|
||||
valid_languages: Optional[List[str]] = None,
|
||||
encoding: str = "utf-8",
|
||||
encoding: Optional[str] = "utf-8",
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Reads text from a txt file and executes optional preprocessing steps.
|
||||
@ -47,6 +47,7 @@ class TextConverter(BaseConverter):
|
||||
This option can be used to add test for encoding errors. If the extracted text is
|
||||
not one of the valid languages, then it might likely be encoding error resulting
|
||||
in garbled text.
|
||||
:param encoding: Select the file encoding (default is `utf-8`)
|
||||
|
||||
:return: Dict of format {"text": "The text from file", "meta": meta}}
|
||||
|
||||
@ -87,7 +88,7 @@ class TextConverter(BaseConverter):
|
||||
f"been decoded in the correct text format."
|
||||
)
|
||||
|
||||
text = "".join(pages)
|
||||
text = "".join(cleaned_pages)
|
||||
document = {"text": text, "meta": meta}
|
||||
return document
|
||||
|
||||
|
8
requirements-dev.txt
Normal file
8
requirements-dev.txt
Normal file
@ -0,0 +1,8 @@
|
||||
# Add extra dependencies only required for tests and local dev setup
|
||||
mypy
|
||||
pytest
|
||||
sentence-transformers
|
||||
selenium
|
||||
webdriver-manager
|
||||
beautifulsoup4
|
||||
markdown
|
@ -14,6 +14,7 @@ langdetect # for PDF conversions
|
||||
# optional: sentence-transformers
|
||||
python-multipart
|
||||
python-docx
|
||||
sqlalchemy>=1.4.2
|
||||
sqlalchemy_utils
|
||||
# for using FAISS with GPUs, install faiss-gpu
|
||||
faiss-cpu>=1.6.3
|
||||
@ -27,4 +28,4 @@ networkx
|
||||
pymilvus
|
||||
# Optional: For crawling
|
||||
#selenium
|
||||
#webdriver-manager
|
||||
#webdriver-manager
|
||||
|
55
test/samples/markdown/sample.md
Normal file
55
test/samples/markdown/sample.md
Normal file
@ -0,0 +1,55 @@
|
||||
## What to build with Haystack
|
||||
|
||||
- **Ask questions in natural language** and find granular answers in your own documents.
|
||||
- Perform **semantic search** and retrieve documents according to meaning not keywords
|
||||
- Use **off-the-shelf models** or **fine-tune** them to your own domain.
|
||||
- Use **user feedback** to evaluate, benchmark and continuously improve your live models.
|
||||
- Leverage existing **knowledge bases** and better handle the long tail of queries that **chatbots** receive.
|
||||
- **Automate processes** by automatically applying a list of questions to new documents and using the extracted answers.
|
||||
|
||||

|
||||
|
||||
|
||||
## Core Features
|
||||
|
||||
- **Latest models**: Utilize all latest transformer based models (e.g. BERT, RoBERTa, MiniLM) for extractive QA, generative QA and document retrieval.
|
||||
- **Modular**: Multiple choices to fit your tech stack and use case. Pick your favorite database, file converter or modeling framework.
|
||||
- **Open**: 100% compatible with HuggingFace's model hub. Tight interfaces to other frameworks (e.g. Transformers, FARM, sentence-transformers)
|
||||
- **Scalable**: Scale to millions of docs via retrievers, production-ready backends like Elasticsearch / FAISS and a fastAPI REST API
|
||||
- **End-to-End**: All tooling in one place: file conversion, cleaning, splitting, training, eval, inference, labeling ...
|
||||
- **Developer friendly**: Easy to debug, extend and modify.
|
||||
- **Customizable**: Fine-tune models to your own domain or implement your custom DocumentStore.
|
||||
- **Continuous Learning**: Collect new training data via user feedback in production & improve your models continuously
|
||||
|
||||
| | |
|
||||
|-|-|
|
||||
| :ledger: [Docs](https://haystack.deepset.ai/docs/intromd) | Usage, Guides, API documentation ...|
|
||||
| :beginner: [Quick Demo](https://github.com/deepset-ai/haystack/#quick-demo) | Quickly see what Haystack offers |
|
||||
| :floppy_disk: [Installation](https://github.com/deepset-ai/haystack/#installation) | How to install Haystack |
|
||||
| :art: [Key Components](https://github.com/deepset-ai/haystack/#key-components) | Overview of core concepts |
|
||||
| :mortar_board: [Tutorials](https://github.com/deepset-ai/haystack/#tutorials) | Jupyter/Colab Notebooks & Scripts |
|
||||
| :eyes: [How to use Haystack](https://github.com/deepset-ai/haystack/#how-to-use-haystack) | Basic explanation of concepts, options and usage |
|
||||
| :heart: [Contributing](https://github.com/deepset-ai/haystack/#heart-contributing) | We welcome all contributions! |
|
||||
| :bar_chart: [Benchmarks](https://haystack.deepset.ai/bm/benchmarks) | Speed & Accuracy of Retriever, Readers and DocumentStores |
|
||||
| :telescope: [Roadmap](https://haystack.deepset.ai/docs/latest/roadmapmd) | Public roadmap of Haystack |
|
||||
| :pray: [Slack](https://haystack.deepset.ai/community/join) | Join our community on Slack |
|
||||
| :bird: [Twitter](https://twitter.com/deepset_ai) | Follow us on Twitter for news and updates |
|
||||
| :newspaper: [Blog](https://medium.com/deepset-ai) | Read our articles on Medium |
|
||||
|
||||
|
||||
## Quick Demo
|
||||
|
||||
The quickest way to see what Haystack offers is to start a [Docker Compose](https://docs.docker.com/compose/) demo application:
|
||||
|
||||
**1. Update/install Docker and Docker Compose, then launch Docker**
|
||||
|
||||
```
|
||||
# apt-get update && apt-get install docker && apt-get install docker-compose
|
||||
# service docker start
|
||||
```
|
||||
|
||||
**2. Clone Haystack repository**
|
||||
|
||||
```
|
||||
# git clone https://github.com/deepset-ai/haystack.git
|
||||
```
|
@ -2,6 +2,7 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from haystack.file_converter import MarkdownConverter
|
||||
from haystack.file_converter.docx import DocxToTextConverter
|
||||
from haystack.file_converter.pdf import PDFToTextConverter
|
||||
from haystack.file_converter.tika import TikaConverter
|
||||
@ -50,3 +51,9 @@ def test_docx_converter():
|
||||
converter = DocxToTextConverter()
|
||||
document = converter.convert(file_path=Path("samples/docx/sample_docx.docx"))
|
||||
assert document["text"].startswith("Sample Docx File")
|
||||
|
||||
|
||||
def test_markdown_converter():
|
||||
converter = MarkdownConverter()
|
||||
document = converter.convert(file_path=Path("samples/markdown/sample.md"))
|
||||
assert document["text"].startswith("What to build with Haystack")
|
||||
|
Loading…
x
Reference in New Issue
Block a user