# Reconstructed from the patch "added launch_tika method (#2567)" (commit 686a19b)
# against haystack/nodes/file_converter/tika.py. It contains the code the patch
# adds, plus the module-level names (logging, logger) that this code relies on.
import logging
import subprocess
import time

logger = logging.getLogger(__name__)

TIKA_CONTAINER_NAME = "tika"


def launch_tika(sleep: float = 15, delete_existing: bool = False) -> None:
    """
    Start a Tika server via Docker.

    An existing container named ``TIKA_CONTAINER_NAME`` is started if there is
    one; otherwise a new container is created from the ``apache/tika:1.24.1``
    image with port 9998 published on the host. If the Docker command fails, a
    warning is logged and no exception is raised.

    :param sleep: Seconds to wait after the Docker command succeeded, before
        returning.
    :param delete_existing: If True, forcibly remove any container named
        ``TIKA_CONTAINER_NAME`` before starting, so that a fresh one is created.
    """
    logger.debug("Starting Tika ...")

    if delete_existing:
        # A new container cannot be created under a name that is still taken by
        # an existing (even stopped) container. `--force` removes the container
        # whether it is stopped or still running.
        subprocess.run(
            f"docker rm --force {TIKA_CONTAINER_NAME}",
            shell=True,
            stdout=subprocess.DEVNULL,
        )

    # The shell is required for the redirection and the `||` fallback, so the
    # command is passed as a single string (the documented form for shell=True).
    # `-d` runs the new container detached: without it `docker run` stays
    # attached to the container and this call would not return while the Tika
    # server is running.
    status = subprocess.run(
        f"docker start {TIKA_CONTAINER_NAME} > /dev/null 2>&1 || "
        f"docker run -d -p 9998:9998 --name {TIKA_CONTAINER_NAME} apache/tika:1.24.1",
        shell=True,
    )

    if status.returncode:
        logger.warning(
            "Tried to start Tika through Docker but this failed. "
            "It is likely that there is already an existing Tika instance running. "
        )
    else:
        # Give the server inside the container time to come up.
        time.sleep(sleep)