added launch_tika method (#2567)

* added launch_tika method

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
Stefano Fiorucci 2022-05-17 17:53:04 +02:00 committed by GitHub
parent 5a1e98e3ff
commit 686a19b35d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -1,7 +1,9 @@
from typing import List, Optional, Dict from typing import List, Optional, Dict
import time
import logging import logging
from pathlib import Path from pathlib import Path
import subprocess
from html.parser import HTMLParser from html.parser import HTMLParser
import requests import requests
@ -12,6 +14,30 @@ from haystack.schema import Document
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Name of the Docker container that launch_tika() removes, restarts or creates.
TIKA_CONTAINER_NAME = "tika"
def launch_tika(sleep=15, delete_existing=False):
    """
    Start an Apache Tika server in a Docker container on a best-effort basis.

    First tries to restart an existing container named ``TIKA_CONTAINER_NAME``. If that
    fails, a new detached ``apache/tika:1.24.1`` container is created under that name
    with port 9998 published. A failure is only logged as a warning, it is never raised.

    :param sleep: Number of seconds to wait after the container was started successfully.
    :param delete_existing: If True, force-remove any container with the same name before starting.
    """
    logger.debug("Starting Tika ...")
    if delete_existing:
        # Docker refuses to create a container whose name is already taken, even by a stopped one.
        # `--force` removes the old container whether it is stopped or still running.
        subprocess.run(f"docker rm --force {TIKA_CONTAINER_NAME}", shell=True, stdout=subprocess.DEVNULL)

    # Reuse an existing container if possible, otherwise create a new one.
    # `-d` detaches the new container so that this call returns right away instead of
    # blocking for as long as the Tika server keeps running.
    # With shell=True the command is passed as a single string, as the subprocess docs recommend.
    start_command = (
        f"docker start {TIKA_CONTAINER_NAME} > /dev/null 2>&1 || "
        f"docker run -d -p 9998:9998 --name {TIKA_CONTAINER_NAME} apache/tika:1.24.1"
    )
    status = subprocess.run(start_command, shell=True)

    if status.returncode:
        logger.warning(
            "Tried to start Tika through Docker but this failed. "
            "It is likely that there is already an existing Tika instance running. "
        )
    else:
        # Only wait when a container was actually started.
        time.sleep(sleep)
class TikaXHTMLParser(HTMLParser): class TikaXHTMLParser(HTMLParser):