from ragflow_sdk import RAGFlow
from common import HOST_ADDRESS
from time import sleep
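

# NOTE: The polling loops in the tests below are currently disabled (wrapped
# in triple-quoted strings). Even if re-enabled they would never observe
# completion, because `doc.progress` is a snapshot taken at upload time and is
# not refreshed in place. A minimal working wait might look like this sketch,
# assuming the SDK's DataSet.list_documents() accepts an `id` filter and
# returns Document objects carrying a `progress` attribute:
def wait_for_parsing(ds, doc_id, timeout=100):
    """Poll until the document is fully parsed or `timeout` seconds elapse."""
    for _ in range(timeout):
        # Re-fetch the document so the progress value is current.
        doc = ds.list_documents(id=doc_id)[0]
        if doc.progress == 1:
            return
        sleep(1)
    raise Exception("Run time ERROR: Document parsing did not complete in time.")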


def test_parse_document_with_txt(get_api_key_fixture):
    API_KEY = get_api_key_fixture
    rag = RAGFlow(API_KEY, HOST_ADDRESS)
    ds = rag.create_dataset(name="test_parse_document")
    name = 'ragflow_test.txt'
    with open("test_data/ragflow_test.txt", "rb") as file:
        blob = file.read()
    docs = ds.upload_documents([{"displayed_name": name, "blob": blob}])
    doc = docs[0]
    ds.async_parse_documents(document_ids=[doc.id])
    '''
    for n in range(100):
        if doc.progress == 1:
            break
        sleep(1)
    else:
        raise Exception("Run time ERROR: Document parsing did not complete in time.")
    '''


def test_parse_and_cancel_document(get_api_key_fixture):
    API_KEY = get_api_key_fixture
    rag = RAGFlow(API_KEY, HOST_ADDRESS)
    ds = rag.create_dataset(name="test_parse_and_cancel_document")
    name = 'ragflow_test.txt'
    with open("test_data/ragflow_test.txt", "rb") as file:
        blob = file.read()
    docs = ds.upload_documents([{"displayed_name": name, "blob": blob}])
    doc = docs[0]
    ds.async_parse_documents(document_ids=[doc.id])
    sleep(1)
    # Progress is a snapshot from upload time; re-fetch the document so the
    # check below sees the live value (see the wait_for_parsing note above).
    doc = ds.list_documents(id=doc.id)[0]
    if 0 < doc.progress < 1:
        ds.async_cancel_parse_documents(document_ids=[doc.id])


def test_bulk_parse_documents(get_api_key_fixture):
    API_KEY = get_api_key_fixture
    rag = RAGFlow(API_KEY, HOST_ADDRESS)
    ds = rag.create_dataset(name="test_bulk_parse_and_cancel_documents")
    with open("test_data/ragflow.txt", "rb") as file:
        blob = file.read()
    documents = [
        {'displayed_name': 'test1.txt', 'blob': blob},
        {'displayed_name': 'test2.txt', 'blob': blob},
        {'displayed_name': 'test3.txt', 'blob': blob}
    ]
    docs = ds.upload_documents(documents)
    ids = [doc.id for doc in docs]
    ds.async_parse_documents(ids)
    '''
    for n in range(100):
        all_completed = all(doc.progress == 1 for doc in docs)
        if all_completed:
            break
        sleep(1)
    else:
        raise Exception("Run time ERROR: Bulk document parsing did not complete in time.")
    '''
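    # If re-enabled, this poll (like the single-document ones) should re-fetch
    # progress each iteration -- e.g. by reusing the hedged wait_for_parsing()
    # sketch above, one document id at a time:
    #     for _id in ids:
    #         wait_for_parsing(ds, _id)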
					
						

def test_list_chunks_with_success(get_api_key_fixture):
    API_KEY = get_api_key_fixture
    rag = RAGFlow(API_KEY, HOST_ADDRESS)
    ds = rag.create_dataset(name="test_list_chunks_with_success")
    with open("test_data/ragflow_test.txt", "rb") as file:
        blob = file.read()
    '''
    # chunk_size = 1024 * 1024
    # chunks = [blob[i:i + chunk_size] for i in range(0, len(blob), chunk_size)]
    documents = [
        {'displayed_name': f'chunk_{i}.txt', 'blob': chunk} for i, chunk in enumerate(chunks)
    ]
    '''
    documents = [{"displayed_name": "test_list_chunks_with_success.txt", "blob": blob}]
    docs = ds.upload_documents(documents)
    ids = [doc.id for doc in docs]
    ds.async_parse_documents(ids)
    '''
    for n in range(100):
        all_completed = all(doc.progress == 1 for doc in docs)
        if all_completed:
            break
        sleep(1)
    else:
        raise Exception("Run time ERROR: Chunk document parsing did not complete in time.")
    '''
    doc = docs[0]
    doc.list_chunks()


def test_add_chunk_with_success(get_api_key_fixture):
    API_KEY = get_api_key_fixture
    rag = RAGFlow(API_KEY, HOST_ADDRESS)
    ds = rag.create_dataset(name="test_add_chunk_with_success")
    with open("test_data/ragflow_test.txt", "rb") as file:
        blob = file.read()
    '''
    # chunk_size = 1024 * 1024
    # chunks = [blob[i:i + chunk_size] for i in range(0, len(blob), chunk_size)]
    documents = [
        {'displayed_name': f'chunk_{i}.txt', 'blob': chunk} for i, chunk in enumerate(chunks)
    ]
    '''
    documents = [{"displayed_name": "test_add_chunk_with_success.txt", "blob": blob}]
    docs = ds.upload_documents(documents)
    doc = docs[0]
    doc.add_chunk(content="This is a chunk addition test")


def test_delete_chunk_with_success(get_api_key_fixture):
    API_KEY = get_api_key_fixture
    rag = RAGFlow(API_KEY, HOST_ADDRESS)
    ds = rag.create_dataset(name="test_delete_chunk_with_success")
    with open("test_data/ragflow_test.txt", "rb") as file:
        blob = file.read()
    '''
    # chunk_size = 1024 * 1024
    # chunks = [blob[i:i + chunk_size] for i in range(0, len(blob), chunk_size)]
    documents = [
        {'displayed_name': f'chunk_{i}.txt', 'blob': chunk} for i, chunk in enumerate(chunks)
    ]
    '''
    documents = [{"displayed_name": "test_delete_chunk_with_success.txt", "blob": blob}]
    docs = ds.upload_documents(documents)
    doc = docs[0]
    chunk = doc.add_chunk(content="This is a chunk addition test")
    # Give the engine a moment to index the new chunk before deleting it.
    sleep(5)
    doc.delete_chunks([chunk.id])
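

# The fixed sleeps in these chunk tests paper over the search engine's
# near-real-time indexing: a freshly added chunk only becomes searchable after
# a refresh (the comments below cite ~2 s for Elasticsearch). A hedged
# alternative to sleeping, assuming Document.list_chunks() returns chunk
# objects exposing an `id` attribute:
def wait_for_chunk(doc, chunk_id, timeout=10):
    """Poll until the chunk shows up in the document's chunk list."""
    for _ in range(timeout):
        if any(c.id == chunk_id for c in doc.list_chunks()):
            return
        sleep(1)
    raise Exception("Run time ERROR: Chunk was not indexed in time.")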
					
						
def test_update_chunk_content(get_api_key_fixture):
    API_KEY = get_api_key_fixture
    rag = RAGFlow(API_KEY, HOST_ADDRESS)
    ds = rag.create_dataset(name="test_update_chunk_content_with_success")
    with open("test_data/ragflow_test.txt", "rb") as file:
        blob = file.read()
    '''
    # chunk_size = 1024 * 1024
    # chunks = [blob[i:i + chunk_size] for i in range(0, len(blob), chunk_size)]
    documents = [
        {'displayed_name': f'chunk_{i}.txt', 'blob': chunk} for i, chunk in enumerate(chunks)
    ]
    '''
    documents = [{"displayed_name": "test_update_chunk_content_with_success.txt", "blob": blob}]
    docs = ds.upload_documents(documents)
    doc = docs[0]
    chunk = doc.add_chunk(content="This is a chunk addition test")
    # For Elasticsearch, a new chunk is not searchable for a short time (~2 s).
    sleep(3)
    chunk.update({"content": "This is an updated content"})


def test_update_chunk_available(get_api_key_fixture):
    API_KEY = get_api_key_fixture
    rag = RAGFlow(API_KEY, HOST_ADDRESS)
    ds = rag.create_dataset(name="test_update_chunk_available_with_success")
    with open("test_data/ragflow_test.txt", "rb") as file:
        blob = file.read()
    '''
    # chunk_size = 1024 * 1024
    # chunks = [blob[i:i + chunk_size] for i in range(0, len(blob), chunk_size)]
    documents = [
        {'displayed_name': f'chunk_{i}.txt', 'blob': chunk} for i, chunk in enumerate(chunks)
    ]
    '''
    documents = [{"displayed_name": "test_update_chunk_available_with_success.txt", "blob": blob}]
    docs = ds.upload_documents(documents)
    doc = docs[0]
    chunk = doc.add_chunk(content="This is a chunk addition test")
    # For Elasticsearch, a new chunk is not searchable for a short time (~2 s).
    sleep(3)
    chunk.update({"available": 0})


def test_retrieve_chunks(get_api_key_fixture):
    API_KEY = get_api_key_fixture
    rag = RAGFlow(API_KEY, HOST_ADDRESS)
    ds = rag.create_dataset(name="retrieval")
    with open("test_data/ragflow_test.txt", "rb") as file:
        blob = file.read()
    '''
    # chunk_size = 1024 * 1024
    # chunks = [blob[i:i + chunk_size] for i in range(0, len(blob), chunk_size)]
    documents = [
        {'displayed_name': f'chunk_{i}.txt', 'blob': chunk} for i, chunk in enumerate(chunks)
    ]
    '''
    documents = [{"displayed_name": "test_retrieve_chunks.txt", "blob": blob}]
    docs = ds.upload_documents(documents)
    doc = docs[0]
    doc.add_chunk(content="This is a chunk addition test")
    rag.retrieve(dataset_ids=[ds.id], document_ids=[doc.id])
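    # A stronger check (hypothetical extension, not part of the current test):
    # query with the added chunk's text and assert it comes back, assuming
    # retrieve() accepts a `question` and returns chunk objects with `content`:
    #     chunks = rag.retrieve(question="This is a chunk addition test",
    #                           dataset_ids=[ds.id], document_ids=[doc.id])
    #     assert any("chunk addition test" in c.content for c in chunks)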