John 3843af666e
feat: Enable remote chunking via unstructured-ingest (#2905)
Update: The CLI shell script works when sending documents to the free
API, but the paid API is down, so testing against it is still pending.

- The first commit adds docstrings and fixes type hints.
- The second commit reorganizes `test_unstructured_ingest` so it matches
the structure of `unstructured/ingest`.
- The third commit contains the primary changes for this PR.
- The `.chunk()` method responsible for sending elements to the correct
method is moved from `ChunkingConfig` to `Chunker` so that
`ChunkingConfig` acts as a config object instead of containing
implementation logic. `Chunker.chunk()` also now takes a JSON file
instead of a list of elements. This is done to avoid redundant
serialization if the file is to be sent to the API for chunking.

---------

Co-authored-by: Ahmet Melek <39141206+ahmetmeleq@users.noreply.github.com>
2024-04-25 00:24:58 +00:00

60 lines
2.2 KiB
Python

from datetime import datetime
from unittest.mock import MagicMock
import pytest
from unstructured.ingest.connector.sharepoint import SharepointIngestDoc
from unstructured.ingest.interfaces import ProcessorConfig, ReadConfig
@pytest.mark.parametrize(
    ("time_created", "time_last_modified", "expected_created", "expected_modified"),
    [
        (
            "2023-06-16T05:05:05+00:00",
            datetime(2023, 6, 16, 5, 5, 5),
            "2023-06-16T05:05:05+00:00",
            "2023-06-16T05:05:05",
        ),
        (
            "2023-06-16 05:05:05",
            "2023-06-16",
            "2023-06-16T05:05:05",
            "2023-06-16T00:00:00",
        ),
        # Further cases go here, each as
        # (time_created, time_last_modified, expected_created, expected_modified).
    ],
)
def test_datetime_handling_in_update_source_metadata(
    mocker, time_created, time_last_modified, expected_created, expected_modified
):
    """Check the date strings produced by ``update_source_metadata``.

    Each parametrized case supplies ``time_created`` / ``time_last_modified``
    values on a mocked ``_fetch_file`` result, runs ``update_source_metadata``
    and asserts that the resulting ``date_created`` / ``date_modified`` strings
    start with the expected prefixes.
    """
    # Stand-in for the object returned by ``_fetch_file``; only the two
    # timestamp attributes are given explicit values.
    fake_file = mocker.MagicMock(
        time_created=time_created,
        time_last_modified=time_last_modified,
    )

    # Replace both fetch helpers so the test never calls the real methods.
    stubbed_returns = (
        (
            "unstructured.ingest.connector.sharepoint.SharepointIngestDoc._fetch_file",
            fake_file,
        ),
        (
            "unstructured.ingest.connector.sharepoint.SharepointIngestDoc._fetch_page",
            None,
        ),
    )
    for target, result in stubbed_returns:
        mocker.patch(target, return_value=result)

    # Build a document from placeholder values.
    doc = SharepointIngestDoc(
        connector_config=MagicMock(),
        site_url="dummy_url",
        server_path="dummy_path",
        is_page=False,
        file_path="dummy_path.html",
        processor_config=ProcessorConfig(),
        read_config=ReadConfig(),
    )

    # Exercise the method under test.
    doc.update_source_metadata()

    # Verify the metadata exists and both dates carry the expected prefixes.
    metadata = doc.source_metadata
    assert metadata is not None
    assert metadata.date_created.startswith(expected_created)
    assert metadata.date_modified.startswith(expected_modified)