| 
									
										
										
										
											2020-06-17 16:28:26 +02:00
										 |  |  | import logging | 
					
						
							| 
									
										
										
										
											2020-06-24 15:05:30 +02:00
										 |  |  | import os | 
					
						
							| 
									
										
										
										
											2020-06-17 16:28:26 +02:00
										 |  |  | import shutil | 
					
						
							|  |  |  | import uuid | 
					
						
							|  |  |  | from pathlib import Path | 
					
						
							|  |  |  | from typing import Optional, List | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | from fastapi import APIRouter | 
					
						
							|  |  |  | from fastapi import HTTPException | 
					
						
							|  |  |  | from fastapi import UploadFile, File, Form | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-06-22 12:07:12 +02:00
										 |  |  | from rest_api.config import DB_HOST, DB_PORT, DB_USER, DB_PW, DB_INDEX, ES_CONN_SCHEME, TEXT_FIELD_NAME, \ | 
					
						
							| 
									
										
										
										
											2020-06-17 16:28:26 +02:00
										 |  |  |     SEARCH_FIELD_NAME, FILE_UPLOAD_PATH, EMBEDDING_DIM, EMBEDDING_FIELD_NAME, EXCLUDE_META_DATA_FIELDS, VALID_LANGUAGES, \ | 
					
						
							|  |  |  |     FAQ_QUESTION_FIELD_NAME, REMOVE_NUMERIC_TABLES, REMOVE_WHITESPACE, REMOVE_EMPTY_LINES, REMOVE_HEADER_FOOTER | 
					
						
							|  |  |  | from haystack.database.elasticsearch import ElasticsearchDocumentStore | 
					
						
							|  |  |  | from haystack.indexing.file_converters.pdf import PDFToTextConverter | 
					
						
							|  |  |  | from haystack.indexing.file_converters.txt import TextConverter | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | logger = logging.getLogger(__name__) | 
					
						
							|  |  |  | router = APIRouter() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | document_store = ElasticsearchDocumentStore( | 
					
						
							|  |  |  |     host=DB_HOST, | 
					
						
							|  |  |  |     port=DB_PORT, | 
					
						
							|  |  |  |     username=DB_USER, | 
					
						
							|  |  |  |     password=DB_PW, | 
					
						
							|  |  |  |     index=DB_INDEX, | 
					
						
							|  |  |  |     scheme=ES_CONN_SCHEME, | 
					
						
							|  |  |  |     ca_certs=False, | 
					
						
							|  |  |  |     verify_certs=False, | 
					
						
							|  |  |  |     text_field=TEXT_FIELD_NAME, | 
					
						
							|  |  |  |     search_fields=SEARCH_FIELD_NAME, | 
					
						
							|  |  |  |     embedding_dim=EMBEDDING_DIM, | 
					
						
							|  |  |  |     embedding_field=EMBEDDING_FIELD_NAME, | 
					
						
							|  |  |  |     excluded_meta_data=EXCLUDE_META_DATA_FIELDS,  # type: ignore | 
					
						
							|  |  |  |     faq_question_field=FAQ_QUESTION_FIELD_NAME, | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-06-24 15:05:30 +02:00
										 |  |  | os.makedirs(FILE_UPLOAD_PATH, exist_ok=True)  # create directory for uploading files | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-06-17 16:28:26 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | @router.post("/file-upload") | 
					
						
							|  |  |  | def upload_file_to_document_store( | 
					
						
							|  |  |  |     file: UploadFile = File(...), | 
					
						
							|  |  |  |     remove_numeric_tables: Optional[bool] = Form(REMOVE_NUMERIC_TABLES), | 
					
						
							|  |  |  |     remove_whitespace: Optional[bool] = Form(REMOVE_WHITESPACE), | 
					
						
							|  |  |  |     remove_empty_lines: Optional[bool] = Form(REMOVE_EMPTY_LINES), | 
					
						
							|  |  |  |     remove_header_footer: Optional[bool] = Form(REMOVE_HEADER_FOOTER), | 
					
						
							|  |  |  |     valid_languages: Optional[List[str]] = Form(VALID_LANGUAGES), | 
					
						
							| 
									
										
										
										
											2020-07-06 17:35:47 +02:00
										 |  |  | ): | 
					
						
							| 
									
										
										
										
											2020-06-17 16:28:26 +02:00
										 |  |  |     try: | 
					
						
							|  |  |  |         file_path = Path(FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{file.filename}" | 
					
						
							|  |  |  |         with file_path.open("wb") as buffer: | 
					
						
							|  |  |  |             shutil.copyfileobj(file.file, buffer) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if file.filename.split(".")[-1].lower() == "pdf": | 
					
						
							|  |  |  |             pdf_converter = PDFToTextConverter( | 
					
						
							|  |  |  |                 remove_numeric_tables=remove_numeric_tables, | 
					
						
							|  |  |  |                 remove_whitespace=remove_whitespace, | 
					
						
							|  |  |  |                 remove_empty_lines=remove_empty_lines, | 
					
						
							|  |  |  |                 remove_header_footer=remove_header_footer, | 
					
						
							|  |  |  |                 valid_languages=valid_languages, | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  |             pages = pdf_converter.extract_pages(file_path) | 
					
						
							|  |  |  |         elif file.filename.split(".")[-1].lower() == "txt": | 
					
						
							|  |  |  |             txt_converter = TextConverter( | 
					
						
							|  |  |  |                 remove_numeric_tables=remove_numeric_tables, | 
					
						
							|  |  |  |                 remove_whitespace=remove_whitespace, | 
					
						
							|  |  |  |                 remove_empty_lines=remove_empty_lines, | 
					
						
							|  |  |  |                 remove_header_footer=remove_header_footer, | 
					
						
							|  |  |  |                 valid_languages=valid_languages, | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  |             pages = txt_converter.extract_pages(file_path) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             raise HTTPException(status_code=415, detail=f"Only .pdf and .txt file formats are supported.") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         document = {TEXT_FIELD_NAME: "\n".join(pages), "name": file.filename} | 
					
						
							|  |  |  |         document_store.write_documents([document]) | 
					
						
							| 
									
										
										
										
											2020-07-06 17:35:47 +02:00
										 |  |  |         return "File upload was successful." | 
					
						
							| 
									
										
										
										
											2020-06-17 16:28:26 +02:00
										 |  |  |     finally: | 
					
						
							|  |  |  |         file.file.close() |