fix: update setup instructions (#144) #none

* activate directory to gitignore

* add my custom env to gitignore, will have to change that

* add unstructured to kotaemon pyproject.toml

* add .env to gitignore

* remove .env from tracking

* make changes to the run_macos script, update readme with more detailed instructions

* remove my personal changes from gitignore

* remove line from run_macos script

* remove option for not installing miniconda for non technical users, mark docker dependency as optional

* docs: update demo URL

* gitignore changes

* merge .env.example

* revert changes to run_macos.sh

* unstructured to advanced dependencies

* add link to unstructured system dependencies

* remove api key

* fix: skip tests when unstructured pdf not installed

* chore: loosen unstructured package version in pyproject.toml

* chore: correct syntax

---------

Co-authored-by: Tadashi <tadashi@cinnamon.is>
Co-authored-by: cin-albert <albert@cinnamon.is>
This commit is contained in:
Ben Dykstra 2024-09-29 09:26:02 -06:00 committed by GitHub
parent 1522a3ab5a
commit f7b6f313b5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 42 additions and 16 deletions

View File

@ -1,8 +1,10 @@
# this is an example .env file, use it to create your own .env file and place it in the root of the project
# settings for OpenAI # settings for OpenAI
OPENAI_API_BASE=https://api.openai.com/v1 OPENAI_API_BASE=https://api.openai.com/v1
OPENAI_API_KEY=openai_key OPENAI_API_KEY=<YOUR OPEN AI KEY HERE>
OPENAI_CHAT_MODEL=gpt-4o OPENAI_CHAT_MODEL=gpt-3.5-turbo
OPENAI_EMBEDDINGS_MODEL=text-embedding-3-small OPENAI_EMBEDDINGS_MODEL=text-embedding-ada-002
# settings for Azure OpenAI # settings for Azure OpenAI
AZURE_OPENAI_ENDPOINT= AZURE_OPENAI_ENDPOINT=
@ -12,14 +14,14 @@ AZURE_OPENAI_CHAT_DEPLOYMENT=gpt-35-turbo
AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT=text-embedding-ada-002 AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT=text-embedding-ada-002
# settings for Cohere # settings for Cohere
COHERE_API_KEY= COHERE_API_KEY=<COHERE API KEY>
# settings for local models # settings for local models
LOCAL_MODEL=llama3.1:8b LOCAL_MODEL=llama3.1:8b
LOCAL_MODEL_EMBEDDINGS=nomic-embed-text LOCAL_MODEL_EMBEDDINGS=nomic-embed-text
# settings for GraphRAG # settings for GraphRAG
GRAPHRAG_API_KEY=openai_key GRAPHRAG_API_KEY=<YOUR OPEN AI KEY HERE>
GRAPHRAG_LLM_MODEL=gpt-4o-mini GRAPHRAG_LLM_MODEL=gpt-4o-mini
GRAPHRAG_EMBEDDING_MODEL=text-embedding-3-small GRAPHRAG_EMBEDDING_MODEL=text-embedding-3-small

5
.gitignore vendored
View File

@ -1,6 +1,11 @@
# Created by https://www.toptal.com/developers/gitignore/api/python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm # Created by https://www.toptal.com/developers/gitignore/api/python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm
# Edit at https://www.toptal.com/developers/gitignore?templates=python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm # Edit at https://www.toptal.com/developers/gitignore?templates=python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm
activate*
activate/*
kotaemon-env*
.env
### Emacs ### ### Emacs ###
# -*- mode: gitignore; -*- # -*- mode: gitignore; -*-
*~ *~

1
.python-version Normal file
View File

@ -0,0 +1 @@
3.10

View File

@ -85,6 +85,15 @@ Use the most recent release `.zip` to include latest features and bug-fixes.
### For developers ### For developers
#### System requirements
1. Python >=3.10
2. (optional) [Docker](https://www.docker.com/)
#### If you would like to process files other than .pdf, .html, .mhtml, and .xlsx documents:
You will need to install the system dependencies of [unstructured](https://docs.unstructured.io/open-source/installation/full-installation#full-installation). The installations vary by operating system, so please go to the link and follow the instructions there.
#### With Docker (recommended) #### With Docker (recommended)
We support `lite` & `full` version of Docker images. With `full`, the extra packages of `unstructured` will be installed as We support `lite` & `full` version of Docker images. With `full`, the extra packages of `unstructured` will be installed as
@ -141,9 +150,12 @@ cd kotaemon
pip install -e "libs/kotaemon[all]" pip install -e "libs/kotaemon[all]"
pip install -e "libs/ktem" pip install -e "libs/ktem"
``` ```
- View and edit your environment variables (API keys, end-points) in `.env`. - Create a .env file in the root of this project. Use .env.example as a template
The .env file serves use cases where users want to pre-configure the models before starting up the app (e.g. deploying the app on HF hub). The file is only used to populate the db once, on the first run; it is not used in subsequent runs.
- (Optional) To enable in-browser PDF_JS viewer, download [PDF_JS_DIST](https://github.com/mozilla/pdf.js/releases/download/v4.0.379/pdfjs-4.0.379-dist.zip) and extract it to `libs/ktem/ktem/assets/prebuilt` - (Optional) To enable in-browser PDF_JS viewer, download [PDF_JS_DIST](https://github.com/mozilla/pdf.js/releases/download/v4.0.379/pdfjs-4.0.379-dist.zip) and extract it to `libs/ktem/ktem/assets/prebuilt`
@ -161,6 +173,8 @@ Default username / password are: `admin` / `admin`. You can setup additional use
![Chat tab](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/chat-tab.png) ![Chat tab](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/chat-tab.png)
- Check the Resources tab and LLMs and Embeddings and ensure that your `api_key` value is set correctly from your `.env` file. If it is not set, you can set it here.
## Setup local models (for local / private RAG) ## Setup local models (for local / private RAG)
See [Local model setup](docs/local_model.md). See [Local model setup](docs/local_model.md).

View File

@ -52,7 +52,7 @@ dependencies = [
"python-dotenv>=1.0.1,<1.1", "python-dotenv>=1.0.1,<1.1",
"tenacity>=8.2.3,<8.3", "tenacity>=8.2.3,<8.3",
"theflow>=0.8.6,<0.9.0", "theflow>=0.8.6,<0.9.0",
"trogon>=0.5.0,<0.6", "trogon>=0.5.0,<0.6"
] ]
readme = "README.md" readme = "README.md"
authors = [ authors = [
@ -73,11 +73,14 @@ adv = [
"fastembed", "fastembed",
"googlesearch-python>=1.2.4,<1.3", "googlesearch-python>=1.2.4,<1.3",
"llama-cpp-python<0.2.8", "llama-cpp-python<0.2.8",
"sentence-transformers",
"wikipedia>=1.4.0,<1.5",
"llama-index>=0.10.40,<0.11.0", "llama-index>=0.10.40,<0.11.0",
"llama-index-vector-stores-milvus", "llama-index-vector-stores-milvus",
"llama-index-vector-stores-qdrant", "llama-index-vector-stores-qdrant",
"python-docx>=1.1.0,<1.2",
"sentence-transformers",
"tabulate",
"unstructured>=0.15.8,<0.16",
"wikipedia>=1.4.0,<1.5",
] ]
dev = [ dev = [
"black", "black",

View File

@ -42,9 +42,10 @@ def if_sentence_fastembed_not_installed():
return False return False
def if_unstructured_not_installed(): def if_unstructured_pdf_not_installed():
try: try:
import unstructured # noqa: F401 import unstructured # noqa: F401
from unstructured.partition.pdf import partition_pdf # noqa: F401
except ImportError: except ImportError:
return True return True
else: else:
@ -81,8 +82,8 @@ skip_when_fastembed_not_installed = pytest.mark.skipif(
if_sentence_fastembed_not_installed(), reason="fastembed is not installed" if_sentence_fastembed_not_installed(), reason="fastembed is not installed"
) )
skip_when_unstructured_not_installed = pytest.mark.skipif( skip_when_unstructured_pdf_not_installed = pytest.mark.skipif(
if_unstructured_not_installed(), reason="unstructured is not installed" if_unstructured_pdf_not_installed(), reason="unstructured is not installed"
) )
skip_when_cohere_not_installed = pytest.mark.skipif( skip_when_cohere_not_installed = pytest.mark.skipif(

View File

@ -14,7 +14,7 @@ from kotaemon.loaders import (
UnstructuredReader, UnstructuredReader,
) )
from .conftest import skip_when_unstructured_not_installed from .conftest import skip_when_unstructured_pdf_not_installed
def test_docx_reader(): def test_docx_reader():
@ -54,7 +54,7 @@ def test_pdf_reader():
assert len(nodes) > 0 assert len(nodes) > 0
@skip_when_unstructured_not_installed @skip_when_unstructured_pdf_not_installed
def test_unstructured_pdf_reader(): def test_unstructured_pdf_reader():
reader = UnstructuredReader() reader = UnstructuredReader()
dirpath = Path(__file__).parent dirpath = Path(__file__).parent

View File

@ -5,7 +5,7 @@ import pytest
from kotaemon.loaders import MathpixPDFReader, OCRReader, PandasExcelReader from kotaemon.loaders import MathpixPDFReader, OCRReader, PandasExcelReader
from .conftest import skip_when_unstructured_not_installed from .conftest import skip_when_unstructured_pdf_not_installed
input_file = Path(__file__).parent / "resources" / "table.pdf" input_file = Path(__file__).parent / "resources" / "table.pdf"
input_file_excel = Path(__file__).parent / "resources" / "dummy.xlsx" input_file_excel = Path(__file__).parent / "resources" / "dummy.xlsx"
@ -28,7 +28,7 @@ def mathpix_output():
return content return content
@skip_when_unstructured_not_installed @skip_when_unstructured_pdf_not_installed
def test_ocr_reader(fullocr_output): def test_ocr_reader(fullocr_output):
reader = OCRReader() reader = OCRReader()
documents = reader.load_data(input_file, response_content=fullocr_output) documents = reader.load_data(input_file, response_content=fullocr_output)