mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-08-29 10:56:40 +00:00

* rename database to documentstore * move document, label, multilabel to haystack/schema.py * rename documentstore -> document_store * split indexing modules -> file_converter + preprocessor * fix order of imports * Update tutorial notebooks * fix torch version in tutorial 4
11 lines
474 B
Python
11 lines
474 B
Python
from pathlib import Path
|
|
|
|
from haystack.file_converter.docx import DocxToTextConverter
|
|
|
|
|
|
def test_extract_pages():
    """Check paragraph extraction from the bundled sample DOCX file.

    Calls ``DocxToTextConverter.extract_pages`` on
    ``samples/docx/sample_docx.docx`` (a relative path, so it is resolved
    against the current working directory), keeps the first element of the
    returned pair as the list of paragraphs, and verifies both the number of
    paragraphs and the exact text of the second one.
    """
    sample_file = Path("samples/docx/sample_docx.docx")
    expected_second_paragraph = (
        'The US has "passed the peak" on new coronavirus cases, President Donald Trump said and predicted that some states would reopen this month.'
    )

    # Only the first item of the returned pair is inspected by this test.
    paragraphs, _ = DocxToTextConverter().extract_pages(file_path=sample_file)

    # The sample document has 8 paragraphs.
    assert len(paragraphs) == 8
    assert paragraphs[1] == expected_second_paragraph
|