feat: add web URL loader & refine indexing logic (#397)

* feat: add web URL loader & refine indexing logic

* fix: appease mypy
Tuan Anh Nguyen Dang (Tadashi_Cin) 2024-10-15 22:42:24 +07:00 committed by GitHub
parent 41966fcd5b
commit b113efc855
7 changed files with 183 additions and 60 deletions

View File

@@ -21,8 +21,10 @@ from kotaemon.loaders import (
PDFThumbnailReader,
TxtReader,
UnstructuredReader,
+    WebReader,
)
+web_reader = WebReader()
unstructured = UnstructuredReader()
adobe_reader = AdobeReader()
azure_reader = AzureAIDocumentIntelligenceLoader(

View File

@@ -10,6 +10,7 @@ from .ocr_loader import ImageReader, OCRReader
from .pdf_loader import PDFThumbnailReader
from .txt_loader import TxtReader
from .unstructured_loader import UnstructuredReader
+from .web_loader import WebReader
__all__ = [
"AutoReader",
@@ -28,4 +29,5 @@ __all__ = [
"AdobeReader",
"TxtReader",
"PDFThumbnailReader",
"WebReader",
]

View File

@@ -0,0 +1,43 @@
from pathlib import Path
from typing import Optional
import requests
from decouple import config
from kotaemon.base import Document
from .base import BaseReader
JINA_API_KEY = config("JINA_API_KEY", default="")
JINA_URL = config("JINA_URL", default="https://r.jina.ai/")
class WebReader(BaseReader):
    def run(
        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
    ) -> list[Document]:
        return self.load_data(Path(file_path), extra_info=extra_info, **kwargs)

    def fetch_url(self, url: str):
        # build the Jina Reader request: the target URL is appended
        # to the configured reader endpoint
        api_url = f"{JINA_URL}{url}"
        headers = {
            "X-With-Links-Summary": "true",
        }
        if JINA_API_KEY:
            headers["Authorization"] = f"Bearer {JINA_API_KEY}"

        response = requests.get(api_url, headers=headers)
        response.raise_for_status()

        data = response.text
        return data

    def load_data(
        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
    ) -> list[Document]:
        file_path = str(file_path)
        output = self.fetch_url(file_path)
        metadata = extra_info or {}

        return [Document(text=output, metadata=metadata)]
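A quick usage sketch (not part of the commit; the URL is hypothetical). One caveat visible above: run() wraps its argument in Path(), which collapses the "//" in a URL scheme, so URLs should go through load_data() directly, as the indexing pipeline does:

# usage sketch, assuming an optional JINA_API_KEY in the environment
from kotaemon.loaders import WebReader

reader = WebReader()  # fetches pages through the configured Jina Reader endpoint
docs = reader.load_data("https://example.com/post", extra_info={"source": "web"})
print(docs[0].text[:200])  # extracted page content
print(docs[0].metadata)    # {'source': 'web'}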

View File

@@ -57,7 +57,7 @@ def prepare_graph_index_path(graph_id: str):
class GraphRAGIndexingPipeline(IndexDocumentPipeline):
"""GraphRAG specific indexing pipeline"""
-    def route(self, file_path: Path) -> IndexPipeline:
+    def route(self, file_path: str | Path) -> IndexPipeline:
"""Simply disable the splitter (chunking) for this pipeline"""
pipeline = super().route(file_path)
pipeline.splitter = None

View File

@@ -32,7 +32,7 @@ class KnetIndexingPipeline(IndexDocumentPipeline):
},
}
-    def route(self, file_path: Path) -> IndexPipeline:
+    def route(self, file_path: str | Path) -> IndexPipeline:
"""Simply disable the splitter (chunking) for this pipeline"""
pipeline = super().route(file_path)
pipeline.splitter = None

View File

@@ -39,6 +39,7 @@ from kotaemon.indices.ingests.files import (
adobe_reader,
azure_reader,
unstructured,
+    web_reader,
)
from kotaemon.indices.rankings import BaseReranking, LLMReranking, LLMTrulensScoring
from kotaemon.indices.splitters import BaseSplitter, TokenSplitter
@@ -444,7 +445,7 @@ class IndexPipeline(BaseComponent):
session.add_all(nodes)
session.commit()
-    def get_id_if_exists(self, file_path: Path) -> Optional[str]:
+    def get_id_if_exists(self, file_path: str | Path) -> Optional[str]:
"""Check if the file is already indexed
Args:
@@ -453,13 +454,14 @@ class IndexPipeline(BaseComponent):
Returns:
the file id if the file is indexed, otherwise None
"""
+        file_name = file_path.name if isinstance(file_path, Path) else file_path
if self.private:
cond: tuple = (
-                self.Source.name == file_path.name,
+                self.Source.name == file_name,
self.Source.user == self.user_id,
)
else:
-            cond = (self.Source.name == file_path.name,)
+            cond = (self.Source.name == file_name,)
with Session(engine) as session:
stmt = select(self.Source).where(*cond)
@@ -469,6 +471,29 @@ class IndexPipeline(BaseComponent):
return None
+    def store_url(self, url: str) -> str:
+        """Store URL into the database and storage, return the file id
+
+        Args:
+            url: the URL
+
+        Returns:
+            the file id
+        """
+        file_hash = sha256(url.encode()).hexdigest()
+        source = self.Source(
+            name=url,
+            path=file_hash,
+            size=0,
+            user=self.user_id,  # type: ignore
+        )
+        with Session(engine) as session:
+            session.add(source)
+            session.commit()
+            file_id = source.id
+
+        return file_id
def store_file(self, file_path: Path) -> str:
"""Store file into the database and storage, return the file id
@@ -495,7 +520,7 @@ class IndexPipeline(BaseComponent):
return file_id
-    def finish(self, file_id: str, file_path: Path) -> str:
+    def finish(self, file_id: str, file_path: str | Path) -> str:
"""Finish the indexing"""
with Session(engine) as session:
stmt = select(self.Source).where(self.Source.id == file_id)
@@ -561,37 +586,55 @@ class IndexPipeline(BaseComponent):
def stream(
self, file_path: str | Path, reindex: bool, **kwargs
) -> Generator[Document, None, tuple[str, list[Document]]]:
-        # check for duplication
-        file_path = Path(file_path).resolve()
+        # check if the file is already indexed
+        if isinstance(file_path, Path):
+            file_path = file_path.resolve()

        file_id = self.get_id_if_exists(file_path)

-        if file_id is not None:
-            if not reindex:
-                raise ValueError(
-                    f"File {file_path.name} already indexed. Please rerun with "
-                    "reindex=True to force reindexing."
-                )
+        if isinstance(file_path, Path):
+            if file_id is not None:
+                if not reindex:
+                    raise ValueError(
+                        f"File {file_path.name} already indexed. Please rerun with "
+                        "reindex=True to force reindexing."
+                    )
+                else:
+                    # remove the existing records
+                    yield Document(
+                        f" => Removing old {file_path.name}", channel="debug"
+                    )
+                    self.delete_file(file_id)
+                    file_id = self.store_file(file_path)
-            else:
-                # remove the existing records
-                yield Document(f" => Removing old {file_path.name}", channel="debug")
-                self.delete_file(file_id)
-
-        # add record to db
-        file_id = self.store_file(file_path)
+            else:
+                # add record to db
+                file_id = self.store_file(file_path)
+        else:
+            if file_id is not None:
+                raise ValueError(f"URL {file_path} already indexed.")
+            else:
+                # add record to db
+                file_id = self.store_url(file_path)
# extract the file
-        extra_info = default_file_metadata_func(str(file_path))
+        if isinstance(file_path, Path):
+            extra_info = default_file_metadata_func(str(file_path))
+            file_name = file_path.name
+        else:
+            extra_info = {"file_name": file_path}
+            file_name = file_path
extra_info["file_id"] = file_id
extra_info["collection_name"] = self.collection_name
yield Document(f" => Converting {file_path.name} to text", channel="debug")
yield Document(f" => Converting {file_name} to text", channel="debug")
docs = self.loader.load_data(file_path, extra_info=extra_info)
yield Document(f" => Converted {file_path.name} to text", channel="debug")
yield from self.handle_docs(docs, file_id, file_path.name)
yield Document(f" => Converted {file_name} to text", channel="debug")
yield from self.handle_docs(docs, file_id, file_name)
self.finish(file_id, file_path)
yield Document(f" => Finished indexing {file_path.name}", channel="debug")
yield Document(f" => Finished indexing {file_name}", channel="debug")
return file_id, docs
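Condensed, the new duplication handling amounts to the sketch below (a simplified stand-in, not the literal method): local files can be force-reindexed, while an already-indexed URL always raises.

from pathlib import Path

def dispatch_store(pipeline, file_path, file_id, reindex):
    # simplified mirror of the dedup block above
    if isinstance(file_path, Path):
        if file_id is not None:
            if not reindex:
                raise ValueError(f"File {file_path.name} already indexed.")
            pipeline.delete_file(file_id)  # force re-index: drop old records first
        return pipeline.store_file(file_path)
    if file_id is not None:
        raise ValueError(f"URL {file_path} already indexed.")  # no reindex path for URLs
    return pipeline.store_url(file_path)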
@@ -658,20 +701,30 @@ class IndexDocumentPipeline(BaseFileIndexIndexing):
)
return obj
-    def route(self, file_path: Path) -> IndexPipeline:
+    def is_url(self, file_path: str | Path) -> bool:
+        return isinstance(file_path, str) and (
+            file_path.startswith("http://") or file_path.startswith("https://")
+        )
+
+    def route(self, file_path: str | Path) -> IndexPipeline:
"""Decide the pipeline based on the file type
Can subclass this method for a more elaborate pipeline routing strategy.
"""
_, chunk_size, chunk_overlap = dev_settings()
-        ext = file_path.suffix.lower()
-        reader = self.readers.get(ext, unstructured)
-        if reader is None:
-            raise NotImplementedError(
-                f"No supported pipeline to index {file_path.name}. Please specify "
-                "the suitable pipeline for this file type in the settings."
-            )
+        # check if file_path is a URL
+        if self.is_url(file_path):
+            reader = web_reader
+        else:
+            assert isinstance(file_path, Path)
+            ext = file_path.suffix.lower()
+            reader = self.readers.get(ext, unstructured)
+            if reader is None:
+                raise NotImplementedError(
+                    f"No supported pipeline to index {file_path.name}. Please specify "
+                    "the suitable pipeline for this file type in the settings."
+                )
print("Using reader", reader)
pipeline: IndexPipeline = IndexPipeline(
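The routing predicate is deliberately narrow; this standalone mirror (illustrative) shows what goes where:

from pathlib import Path

def is_url(file_path) -> bool:
    # mirror of IndexDocumentPipeline.is_url above
    return isinstance(file_path, str) and (
        file_path.startswith("http://") or file_path.startswith("https://")
    )

assert is_url("https://example.com")    # string URL -> web_reader
assert not is_url(Path("report.pdf"))   # Path -> extension lookup in self.readers
assert not is_url("ftp://host/file")    # non-http(s) strings are not URLs here; route() would hit its Path assert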
@@ -715,9 +768,14 @@ class IndexDocumentPipeline(BaseFileIndexIndexing):
n_files = len(file_paths)
for idx, file_path in enumerate(file_paths):
-            file_path = Path(file_path)
+            if self.is_url(file_path):
+                file_name = file_path
+            else:
+                file_path = Path(file_path)
+                file_name = file_path.name
yield Document(
content=f"Indexing [{idx + 1}/{n_files}]: {file_path.name}",
content=f"Indexing [{idx + 1}/{n_files}]: {file_name}",
channel="debug",
)
@@ -730,7 +788,11 @@ class IndexDocumentPipeline(BaseFileIndexIndexing):
file_ids.append(file_id)
errors.append(None)
yield Document(
content={"file_path": file_path, "status": "success"},
content={
"file_path": file_path,
"file_name": file_name,
"status": "success",
},
channel="index",
)
except Exception as e:
@@ -740,6 +802,7 @@ class IndexDocumentPipeline(BaseFileIndexIndexing):
yield Document(
content={
"file_path": file_path,
"file_name": file_name,
"status": "failed",
"message": str(e),
},
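With this change the "index" channel always carries a printable file_name next to file_path; the two payload shapes look like this (values made up):

# illustrative payloads yielded on the "index" channel
success = {"file_path": "docs/report.pdf", "file_name": "report.pdf", "status": "success"}
failure = {"file_path": "https://example.com/post", "file_name": "https://example.com/post",
           "status": "failed", "message": "HTTPError: 404"}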

View File

@@ -111,18 +111,25 @@ class FileIndexPage(BasePage):
"""Build the UI of the app"""
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("## File Upload")
with gr.Column() as self.upload:
-                    self.files = File(
-                        file_types=self._supported_file_types,
-                        file_count="multiple",
-                        container=True,
-                        show_label=False,
-                    )
+                    with gr.Tab("Upload Files"):
+                        self.files = File(
+                            file_types=self._supported_file_types,
+                            file_count="multiple",
+                            container=True,
+                            show_label=False,
+                        )
-                    msg = self.upload_instruction()
-                    if msg:
-                        gr.Markdown(msg)
+                        msg = self.upload_instruction()
+                        if msg:
+                            gr.Markdown(msg)
+
+                    with gr.Tab("Use Web Links"):
+                        self.urls = gr.Textbox(
+                            label="Input web URLs",
+                            lines=8,
+                        )
+                        gr.Markdown("(separated by new line)")
with gr.Accordion("Advanced indexing options", open=True):
with gr.Row():
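For orientation, a self-contained sketch of the reworked upload area (assumes the gradio API shown above; layout simplified):

import gradio as gr

with gr.Blocks() as demo:
    with gr.Tab("Upload Files"):
        files = gr.File(file_count="multiple", show_label=False)
    with gr.Tab("Use Web Links"):
        urls = gr.Textbox(label="Input web URLs", lines=8)
        gr.Markdown("(separated by new line)")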
@@ -525,6 +532,7 @@ class FileIndexPage(BasePage):
fn=self.index_fn,
inputs=[
self.files,
+                self.urls,
self.reindex,
self._app.settings_state,
self._app.user_id,
@@ -670,28 +678,33 @@ class FileIndexPage(BasePage):
return remaining_files
def index_fn(
-        self, files, reindex: bool, settings, user_id
+        self, files, urls, reindex: bool, settings, user_id
) -> Generator[tuple[str, str], None, None]:
"""Upload and index the files
Args:
files: the list of files to be uploaded
+            urls: list of web URLs to be indexed
reindex: whether to reindex the files
settings: the settings of the app
"""
-        if not files:
-            gr.Info("No uploaded file")
-            yield "", ""
-            return
+        if urls:
+            files = [it.strip() for it in urls.split("\n")]
+            errors = []
+        else:
+            if not files:
+                gr.Info("No uploaded file")
+                yield "", ""
+                return
+
+            files = self._may_extract_zip(files, flowsettings.KH_ZIP_INPUT_DIR)
-        files = self._may_extract_zip(files, flowsettings.KH_ZIP_INPUT_DIR)
-
-        errors = self.validate(files)
-        if errors:
-            gr.Warning(", ".join(errors))
-            yield "", ""
-            return
+            errors = self.validate(files)
+            if errors:
+                gr.Warning(", ".join(errors))
+                yield "", ""
+                return
gr.Info(f"Start indexing {len(files)} files...")
@@ -708,10 +721,10 @@ class FileIndexPage(BasePage):
continue
if response.channel == "index":
if response.content["status"] == "success":
outputs.append(f"\u2705 | {response.content['file_path'].name}")
outputs.append(f"\u2705 | {response.content['file_name']}")
elif response.content["status"] == "failed":
outputs.append(
f"\u274c | {response.content['file_path'].name}: "
f"\u274c | {response.content['file_name']}: "
f"{response.content['message']}"
)
elif response.channel == "debug":
@@ -764,7 +777,7 @@ class FileIndexPage(BasePage):
settings[f"index.options.{self._index.id}.reader_mode"] = "default"
settings[f"index.options.{self._index.id}.quick_index_mode"] = True
if to_process_files:
-            _iter = self.index_fn(to_process_files, reindex, settings, user_id)
+            _iter = self.index_fn(to_process_files, [], reindex, settings, user_id)
try:
while next(_iter):
pass
@@ -844,7 +857,7 @@ class FileIndexPage(BasePage):
for p in exclude_patterns:
files = [f for f in files if not fnmatch.fnmatch(name=f, pat=p)]
-        yield from self.index_fn(files, reindex, settings, user_id)
+        yield from self.index_fn(files, [], reindex, settings, user_id)
def format_size_human_readable(self, num: float | str, suffix="B"):
try: