Mirror of https://github.com/Unstructured-IO/unstructured.git, synced 2025-08-27 10:16:31 +00:00
roman/fsspec compression support (#1730)

### Description
Opened to replace the original PR: [1443](https://github.com/Unstructured-IO/unstructured/pull/1443)
This commit is contained in:
parent 282b8f700d
commit 9c7ee8921a

CHANGELOG.md (12 lines changed)
```diff
@@ -1,3 +1,13 @@
+## 0.10.24-dev0
+
+### Enhancements
+
+* **Ingest compression utilities and fsspec connector support** Adds generic utility code to handle tar- and zip-compressed files pulled from a source connector: archives are uncompressed locally and their contents are then processed with a local source connector. This functionality is currently wired into the fsspec connector and everything inheriting from it (Azure Blob Storage, Google Cloud Storage, S3, Box, and Dropbox).
+
+### Features
+
+### Fixes
+
 ## 0.10.23
 
 ### Enhancements
@@ -1505,4 +1515,4 @@ of an email.
 
 ## 0.2.0
 
 * Initial release of unstructured
```

test_unstructured_ingest/test-ingest-s3-compression.sh (new executable file, 37 lines)

```bash
#!/usr/bin/env bash

set -e

SCRIPT_DIR=$(dirname "$(realpath "$0")")
cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=s3-compression
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}

# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
function cleanup() {
  cleanup_dir "$OUTPUT_DIR"
  cleanup_dir "$WORK_DIR"
}
trap cleanup EXIT

PYTHONPATH=. ./unstructured/ingest/main.py \
  s3 \
  --num-processes "$max_processes" \
  --download-dir "$DOWNLOAD_DIR" \
  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
  --strategy fast \
  --preserve-downloads \
  --reprocess \
  --output-dir "$OUTPUT_DIR" \
  --verbose \
  --remote-url s3://utic-dev-tech-fixtures/small-pdf-set-w-compression/ \
  --anonymous \
  --work-dir "$WORK_DIR" \
  --uncompress

"$SCRIPT_DIR"/check-num-files-output.sh 12 $OUTPUT_FOLDER_NAME
```
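
For reference, a rough Python-level equivalent of the invocation above (a sketch, not part of this commit: only the configs touched by this diff are shown, field names come from the dataclasses below, and a real run would also need the partitioning and processing options the CLI normally fills in):

```python
from unstructured.ingest.interfaces import FsspecConfig, ProcessorConfig, ReadConfig
from unstructured.ingest.runner import S3Runner

# Hypothetical programmatic version of the shell test above.
runner = S3Runner(
    processor_config=ProcessorConfig(output_dir="structured-output/s3-compression", verbose=True),
    read_config=ReadConfig(download_dir="download/s3-compression", preserve_downloads=True),
    fsspec_config=FsspecConfig(
        remote_url="s3://utic-dev-tech-fixtures/small-pdf-set-w-compression/",
        uncompress=True,  # mirrors the new --uncompress flag
    ),
)
runner.run(anonymous=True)
```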

unstructured/__version__.py

```diff
@@ -1 +1 @@
-__version__ = "0.10.23"  # pragma: no cover
+__version__ = "0.10.24-dev0"  # pragma: no cover
```

unstructured/ingest/cli/cmds/azure.py

```diff
@@ -8,12 +8,11 @@ from unstructured.ingest.cli.common import (
     log_options,
 )
 from unstructured.ingest.cli.interfaces import (
+    CliFilesStorageConfig,
     CliMixin,
-    CliRecursiveConfig,
-    CliRemoteUrlConfig,
 )
 from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs
-from unstructured.ingest.interfaces import BaseConfig
+from unstructured.ingest.interfaces import BaseConfig, FsspecConfig
 from unstructured.ingest.logger import ingest_log_streaming_init, logger
 from unstructured.ingest.runner import AzureRunner
 
@@ -58,7 +57,11 @@ def azure_source(ctx: click.Context, **options):
     ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO)
     log_options(options, verbose=verbose)
     try:
-        configs = extract_configs(options, validate=[AzureCliConfig])
+        configs = extract_configs(
+            options,
+            validate=[AzureCliConfig],
+            extras={"fsspec_config": FsspecConfig},
+        )
         runner = AzureRunner(
             **configs,  # type: ignore
         )
@@ -70,5 +73,5 @@ def azure_source(ctx: click.Context, **options):
 
 def get_source_cmd() -> click.Group:
     cmd = azure_source
-    add_options(cmd, extras=[AzureCliConfig, CliRemoteUrlConfig, CliRecursiveConfig])
+    add_options(cmd, extras=[AzureCliConfig, CliFilesStorageConfig])
     return cmd
```

unstructured/ingest/cli/cmds/azure_cognitive_search.py

```diff
@@ -70,7 +70,7 @@ def azure_cognitive_search_dest(ctx: click.Context, **options):
         configs = extract_configs(options, validate=[AzureCognitiveSearchCliWriteConfig])
         runner_cls = runner_map[source_cmd]
         runner = runner_cls(
-            **configs,
+            **configs,  # type: ignore
             writer_type="azure_cognitive_search",
             writer_kwargs=options,
         )
```

unstructured/ingest/cli/cmds/box.py

```diff
@@ -8,12 +8,11 @@ from unstructured.ingest.cli.common import (
     log_options,
 )
 from unstructured.ingest.cli.interfaces import (
+    CliFilesStorageConfig,
     CliMixin,
-    CliRecursiveConfig,
-    CliRemoteUrlConfig,
 )
 from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs
-from unstructured.ingest.interfaces import BaseConfig
+from unstructured.ingest.interfaces import BaseConfig, FsspecConfig
 from unstructured.ingest.logger import ingest_log_streaming_init, logger
 from unstructured.ingest.runner import BoxRunner
 
@@ -45,7 +44,11 @@ def box_source(ctx: click.Context, **options):
     ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO)
     log_options(options, verbose=verbose)
     try:
-        configs = extract_configs(options, validate=[BoxCliConfig])
+        configs = extract_configs(
+            options,
+            validate=[BoxCliConfig],
+            extras={"fsspec_config": FsspecConfig},
+        )
         runner = BoxRunner(
             **configs,  # type: ignore
         )
@@ -57,5 +60,5 @@ def box_source(ctx: click.Context, **options):
 
 def get_source_cmd() -> click.Group:
     cmd = box_source
-    add_options(cmd, extras=[BoxCliConfig, CliRemoteUrlConfig, CliRecursiveConfig])
+    add_options(cmd, extras=[BoxCliConfig, CliFilesStorageConfig])
     return cmd
```

unstructured/ingest/cli/cmds/delta_table.py

```diff
@@ -123,7 +123,7 @@ def delta_table_dest(ctx: click.Context, **options):
         DeltaTableCliWriteConfig.from_dict(options)
         runner_cls = runner_map[source_cmd]
         runner = runner_cls(
-            **configs,
+            **configs,  # type: ignore
             writer_type="delta_table",
             writer_kwargs=options,
         )
```

unstructured/ingest/cli/cmds/dropbox.py

```diff
@@ -7,12 +7,11 @@ from unstructured.ingest.cli.common import (
     log_options,
 )
 from unstructured.ingest.cli.interfaces import (
+    CliFilesStorageConfig,
     CliMixin,
-    CliRecursiveConfig,
-    CliRemoteUrlConfig,
 )
 from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs
-from unstructured.ingest.interfaces import BaseConfig
+from unstructured.ingest.interfaces import BaseConfig, FsspecConfig
 from unstructured.ingest.logger import ingest_log_streaming_init, logger
 from unstructured.ingest.runner import DropboxRunner
 
@@ -44,7 +43,11 @@ def dropbox_source(ctx: click.Context, **options):
     ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO)
     log_options(options, verbose=verbose)
     try:
-        configs = extract_configs(options, validate=[DropboxCliConfig])
+        configs = extract_configs(
+            options,
+            validate=[DropboxCliConfig],
+            extras={"fsspec_config": FsspecConfig},
+        )
         runner = DropboxRunner(
             **configs,  # type: ignore
         )
@@ -56,5 +59,5 @@ def dropbox_source(ctx: click.Context, **options):
 
 def get_source_cmd() -> click.Group:
     cmd = dropbox_source
-    add_options(cmd, extras=[DropboxCliConfig, CliRemoteUrlConfig, CliRecursiveConfig])
+    add_options(cmd, extras=[DropboxCliConfig, CliFilesStorageConfig])
     return cmd
```

unstructured/ingest/cli/cmds/fsspec.py

```diff
@@ -6,10 +6,10 @@ from unstructured.ingest.cli.common import (
     log_options,
 )
 from unstructured.ingest.cli.interfaces import (
-    CliRecursiveConfig,
-    CliRemoteUrlConfig,
+    CliFilesStorageConfig,
 )
 from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs
+from unstructured.ingest.interfaces import FsspecConfig
 from unstructured.ingest.logger import ingest_log_streaming_init, logger
 from unstructured.ingest.runner import FsspecRunner
 
@@ -25,7 +25,7 @@ def fsspec_source(ctx: click.Context, **options):
     ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO)
     log_options(options, verbose=verbose)
     try:
-        configs = extract_configs(options)
+        configs = extract_configs(options, extras={"fsspec_config": FsspecConfig})
         runner = FsspecRunner(
             **configs,  # type: ignore
         )
@@ -37,5 +37,5 @@ def fsspec_source(ctx: click.Context, **options):
 
 def get_source_cmd() -> click.Group:
     cmd = fsspec_source
-    add_options(cmd, extras=[CliRemoteUrlConfig, CliRecursiveConfig])
+    add_options(cmd, extras=[CliFilesStorageConfig])
     return cmd
```

unstructured/ingest/cli/cmds/gcs.py

```diff
@@ -8,12 +8,11 @@ from unstructured.ingest.cli.common import (
     log_options,
 )
 from unstructured.ingest.cli.interfaces import (
+    CliFilesStorageConfig,
     CliMixin,
-    CliRecursiveConfig,
-    CliRemoteUrlConfig,
 )
 from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs
-from unstructured.ingest.interfaces import BaseConfig
+from unstructured.ingest.interfaces import BaseConfig, FsspecConfig
 from unstructured.ingest.logger import ingest_log_streaming_init, logger
 from unstructured.ingest.runner import GCSRunner
 
@@ -47,7 +46,11 @@ def gcs_source(ctx: click.Context, **options):
     ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO)
     log_options(options, verbose=verbose)
     try:
-        configs = extract_configs(options, validate=([GcsCliConfig]))
+        configs = extract_configs(
+            options,
+            validate=([GcsCliConfig]),
+            extras={"fsspec_config": FsspecConfig},
+        )
         runner = GCSRunner(
             **configs,  # type: ignore
         )
@@ -59,5 +62,5 @@ def gcs_source(ctx: click.Context, **options):
 
 def get_source_cmd() -> click.Group:
     cmd = gcs_source
-    add_options(cmd, extras=[GcsCliConfig, CliRemoteUrlConfig, CliRecursiveConfig])
+    add_options(cmd, extras=[GcsCliConfig, CliFilesStorageConfig])
     return cmd
```

unstructured/ingest/cli/cmds/s3.py

```diff
@@ -8,14 +8,13 @@ from unstructured.ingest.cli.common import (
     log_options,
 )
 from unstructured.ingest.cli.interfaces import (
+    CliFilesStorageConfig,
     CliMixin,
-    CliRecursiveConfig,
-    CliRemoteUrlConfig,
 )
 from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs
-from unstructured.ingest.interfaces import BaseConfig
+from unstructured.ingest.interfaces import BaseConfig, FsspecConfig
 from unstructured.ingest.logger import ingest_log_streaming_init, logger
-from unstructured.ingest.runner import S3Runner, runner_map
+from unstructured.ingest.runner import FsspecBaseRunner, S3Runner, runner_map
 
 
 @dataclass
@@ -57,7 +56,11 @@ def s3_source(ctx: click.Context, **options):
     ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO)
     log_options(options, verbose=verbose)
     try:
-        configs = extract_configs(options, validate=[S3CliConfig])
+        configs = extract_configs(
+            options,
+            validate=[S3CliConfig],
+            extras={"fsspec_config": FsspecConfig},
+        )
         s3_runner = S3Runner(
             **configs,  # type: ignore
         )
@@ -82,10 +85,16 @@ def s3_dest(ctx: click.Context, **options):
     log_options(parent_options, verbose=verbose)
     log_options(options, verbose=verbose)
     try:
-        configs = extract_configs(options, validate=[S3CliConfig])
         runner_cls = runner_map[source_cmd]
+        configs = extract_configs(
+            options,
+            validate=[S3CliConfig],
+            extras={"fsspec_config": FsspecConfig}
+            if issubclass(runner_cls, FsspecBaseRunner)
+            else None,
+        )
         runner = runner_cls(
-            **configs,
+            **configs,  # type: ignore
             writer_type="s3",
             writer_kwargs=options,
         )
@@ -100,11 +109,11 @@ def s3_dest(ctx: click.Context, **options):
 def get_dest_cmd() -> click.Command:
     cmd = s3_dest
     S3CliConfig.add_cli_options(cmd)
-    CliRemoteUrlConfig.add_cli_options(cmd)
+    CliFilesStorageConfig.add_cli_options(cmd)
     return cmd
 
 
 def get_source_cmd() -> click.Group:
     cmd = s3_source
-    add_options(cmd, extras=[S3CliConfig, CliRemoteUrlConfig, CliRecursiveConfig])
+    add_options(cmd, extras=[S3CliConfig, CliFilesStorageConfig])
     return cmd
```

unstructured/ingest/cli/interfaces.py

```diff
@@ -10,6 +10,7 @@ from unstructured.ingest.interfaces import (
     BaseConfig,
     ChunkingConfig,
     EmbeddingConfig,
+    FileStorageConfig,
     PartitionConfig,
     PermissionsConfig,
     ProcessorConfig,
@@ -248,9 +249,7 @@ class CliRecursiveConfig(BaseConfig, CliMixin):
         cmd.params.extend(options)
 
 
-class CliRemoteUrlConfig(BaseConfig, CliMixin):
-    remote_url: str
-
+class CliFilesStorageConfig(FileStorageConfig, CliMixin):
     @staticmethod
     def add_cli_options(cmd: click.Command) -> None:
         options = [
@@ -259,6 +258,21 @@ class CliRemoteUrlConfig(BaseConfig, CliMixin):
                 required=True,
                 help="Remote fsspec URL formatted as `protocol://dir/path`",
             ),
+            click.Option(
+                ["--uncompress"],
+                type=bool,
+                default=False,
+                is_flag=True,
+                help="Uncompress any archived files. Currently supporting zip and tar "
+                "files based on file extension.",
+            ),
+            click.Option(
+                ["--recursive"],
+                is_flag=True,
+                default=False,
+                help="Recursively download files in their respective folders "
+                "otherwise stop at the files in provided folder level.",
+            ),
         ]
         cmd.params.extend(options)
 
```

unstructured/ingest/cli/utils.py

```diff
@@ -26,6 +26,7 @@ def conform_click_options(options: dict):
 
 def extract_configs(
     data: dict,
+    extras: t.Optional[t.Dict[str, t.Type[BaseConfig]]] = None,
     validate: t.Optional[t.List[t.Type[BaseConfig]]] = None,
 ) -> t.Dict[str, BaseConfig]:
     """
@@ -42,6 +43,9 @@ def extract_configs(
         "processor_config": CliProcessorConfig.from_dict(data),
         "permissions_config": CliPermissionsConfig.from_dict(data),
     }
+    if extras:
+        for k, conf in extras.items():
+            res[k] = conf.from_dict(data)
    for v in validate:
         v.from_dict(data)
     return res
```
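
In isolation, the new `extras` hook reduces to the loop below: each extra config class builds itself from the same flat CLI-options dict and lands in the result under its key. A sketch with made-up option values; it assumes `BaseConfig.from_dict` tolerates the flat dict the CLI layer passes around:

```python
from unstructured.ingest.interfaces import FsspecConfig

options = {"remote_url": "s3://example-bucket/docs", "uncompress": True, "recursive": False}

# What extract_configs(options, extras={"fsspec_config": FsspecConfig}) adds
# on top of the standard configs:
res = {}
for k, conf in {"fsspec_config": FsspecConfig}.items():
    res[k] = conf.from_dict(options)

print(res["fsspec_config"].protocol)    # "s3"
print(res["fsspec_config"].uncompress)  # True
```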

unstructured/ingest/compression_support.py (new file, 105 lines)

```python
import copy
import os
import tarfile
import zipfile
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional

from unstructured.ingest.connector.local import LocalSourceConnector, SimpleLocalConfig
from unstructured.ingest.interfaces import (
    BaseConnectorConfig,
    BaseIngestDoc,
    ProcessorConfig,
    ReadConfig,
)
from unstructured.ingest.logger import logger

ZIP_FILE_EXT = [".zip"]
TAR_FILE_EXT = [".tar", ".tar.gz", ".tgz"]


def uncompress_file(filename: str, path: Optional[str] = None) -> str:
    """
    Takes in a compressed zip or tar file and uncompresses it
    """
    # Create path if it doesn't already exist
    if path:
        Path(path).mkdir(parents=True, exist_ok=True)

    if any(filename.endswith(ext) for ext in ZIP_FILE_EXT):
        return uncompress_zip_file(zip_filename=filename, path=path)
    elif any(filename.endswith(ext) for ext in TAR_FILE_EXT):
        return uncompress_tar_file(tar_filename=filename, path=path)
    else:
        raise ValueError(
            "filename {} not a recognized compressed extension: {}".format(
                filename,
                ", ".join(ZIP_FILE_EXT + TAR_FILE_EXT),
            ),
        )


def uncompress_zip_file(zip_filename: str, path: Optional[str] = None) -> str:
    head, tail = os.path.split(zip_filename)
    for ext in ZIP_FILE_EXT:
        if tail.endswith(ext):
            tail = tail[: -(len(ext))]
            break
    path = path if path else os.path.join(head, f"{tail}-zip-uncompressed")
    logger.info(f"extracting zip {zip_filename} -> {path}")
    with zipfile.ZipFile(zip_filename) as zfile:
        zfile.extractall(path=path)
    return path


def uncompress_tar_file(tar_filename: str, path: Optional[str] = None) -> str:
    head, tail = os.path.split(tar_filename)
    for ext in TAR_FILE_EXT:
        if tail.endswith(ext):
            tail = tail[: -(len(ext))]
            break

    path = path if path else os.path.join(head, f"{tail}-tar-uncompressed")
    logger.info(f"extracting tar {tar_filename} -> {path}")
    with tarfile.open(tar_filename, "r:gz") as tfile:
        tfile.extractall(path=path)
    return path


@dataclass
class CompressionSourceConnectorMixin:
    processor_config: ProcessorConfig
    read_config: ReadConfig
    connector_config: BaseConnectorConfig

    def process_compressed_doc(self, doc: BaseIngestDoc) -> List[BaseIngestDoc]:
        """
        Utility function which helps process compressed files. Extracts the contents and returns
        generated ingest docs via local source connector
        """
        # Download the raw file to local
        doc.get_file()
        path = uncompress_file(filename=str(doc.filename))
        new_read_configs = copy.copy(self.read_config)
        new_process_configs = copy.copy(self.processor_config)
        relative_path = path.replace(self.read_config.download_dir, "")

        if self.processor_config.output_dir.endswith(os.sep):
            new_process_configs.output_dir = f"{self.processor_config.output_dir}{relative_path}"
        else:
            new_process_configs.output_dir = (
                f"{self.processor_config.output_dir}{os.sep}{relative_path}"
            )

        local_connector = LocalSourceConnector(
            connector_config=SimpleLocalConfig(
                input_path=path,
                recursive=True,
            ),
            read_config=new_read_configs,
            processor_config=new_process_configs,
        )
        logger.info(f"Created local source connector: {local_connector.to_json()}")
        local_connector.initialize()
        return local_connector.get_ingest_docs()
```
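
The helpers above pick the destination directory from the archive name when no target path is given; a quick illustration with hypothetical file names:

```python
from unstructured.ingest.compression_support import uncompress_file

# "downloads/batch.zip"    -> extracted to "downloads/batch-zip-uncompressed"
# "downloads/batch.tar.gz" -> extracted to "downloads/batch-tar-uncompressed"
extracted = uncompress_file("downloads/batch.zip")
print(extracted)  # downloads/batch-zip-uncompressed

# Anything without a recognized extension raises:
# uncompress_file("downloads/report.pdf")  # ValueError: not a recognized compressed extension
```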

unstructured/ingest/connector/dropbox.py

```diff
@@ -100,7 +100,7 @@ class DropboxSourceConnector(FsspecSourceConnector):
             return
         elif ls_output:
             raise ValueError(
-                f"No objects found in {self.connector_config.path}.",
+                f"No objects found in {self.connector_config.remote_url}.",
             )
         else:
             raise MissingFolderError(
```

unstructured/ingest/connector/fsspec.py

```diff
@@ -1,16 +1,21 @@
 import os
-import re
 import typing as t
 from contextlib import suppress
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from pathlib import Path, PurePath
 
+from unstructured.ingest.compression_support import (
+    TAR_FILE_EXT,
+    ZIP_FILE_EXT,
+    CompressionSourceConnectorMixin,
+)
 from unstructured.ingest.error import SourceConnectionError
 from unstructured.ingest.interfaces import (
     BaseConnectorConfig,
     BaseDestinationConnector,
     BaseIngestDoc,
     BaseSourceConnector,
+    FsspecConfig,
     IngestDocCleanupMixin,
     SourceConnectorCleanupMixin,
     SourceMetadata,
@@ -33,49 +38,8 @@ SUPPORTED_REMOTE_FSSPEC_PROTOCOLS = [
 
 
 @dataclass
-class SimpleFsspecConfig(BaseConnectorConfig):
-    # fsspec specific options
-    path: str
-    recursive: bool = False
-    access_kwargs: dict = field(default_factory=dict)
-    protocol: str = field(init=False)
-    path_without_protocol: str = field(init=False)
-    dir_path: str = field(init=False)
-    file_path: str = field(init=False)
-
-    def get_access_kwargs(self) -> dict:
-        return self.access_kwargs
-
-    def __post_init__(self):
-        self.protocol, self.path_without_protocol = self.path.split("://")
-        if self.protocol not in SUPPORTED_REMOTE_FSSPEC_PROTOCOLS:
-            raise ValueError(
-                f"Protocol {self.protocol} not supported yet, only "
-                f"{SUPPORTED_REMOTE_FSSPEC_PROTOCOLS} are supported.",
-            )
-
-        # dropbox root is an empty string
-        match = re.match(rf"{self.protocol}://([\s])/", self.path)
-        if match and self.protocol == "dropbox":
-            self.dir_path = " "
-            self.file_path = ""
-            return
-
-        # just a path with no trailing prefix
-        match = re.match(rf"{self.protocol}://([^/\s]+?)(/*)$", self.path)
-        if match:
-            self.dir_path = match.group(1)
-            self.file_path = ""
-            return
-
-        # valid path with a dir and/or file
-        match = re.match(rf"{self.protocol}://([^/\s]+?)/([^\s]*)", self.path)
-        if not match:
-            raise ValueError(
-                f"Invalid path {self.path}. Expected <protocol>://<dir-path>/<file-or-dir-path>.",
-            )
-        self.dir_path = match.group(1)
-        self.file_path = match.group(2) or ""
+class SimpleFsspecConfig(FsspecConfig, BaseConnectorConfig):
+    pass
 
 
 @dataclass
@@ -167,7 +131,11 @@ class FsspecIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
 
 
 @dataclass
-class FsspecSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
+class FsspecSourceConnector(
+    SourceConnectorCleanupMixin,
+    CompressionSourceConnectorMixin,
+    BaseSourceConnector,
+):
     """Objects of this class support fetching document(s) from"""
 
     connector_config: SimpleFsspecConfig
@@ -186,7 +154,7 @@ class FsspecSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
         ls_output = self.fs.ls(self.connector_config.path_without_protocol)
         if len(ls_output) < 1:
             raise ValueError(
-                f"No objects found in {self.connector_config.path}.",
+                f"No objects found in {self.connector_config.remote_url}.",
             )
 
     def _list_files(self):
@@ -212,15 +180,44 @@ class FsspecSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
         ]
 
     def get_ingest_docs(self):
-        return [
-            self.ingest_doc_cls(
-                processor_config=self.processor_config,
-                read_config=self.read_config,
-                connector_config=self.connector_config,
-                remote_file_path=file,
-            )
-            for file in self._list_files()
-        ]
+        files = self._list_files()
+        # remove compressed files
+        compressed_file_ext = TAR_FILE_EXT + ZIP_FILE_EXT
+        compressed_files = []
+        uncompressed_files = []
+        docs: t.List[BaseIngestDoc] = []
+        for file in files:
+            if any(file.endswith(ext) for ext in compressed_file_ext):
+                compressed_files.append(file)
+            else:
+                uncompressed_files.append(file)
+        docs.extend(
+            [
+                self.ingest_doc_cls(
+                    read_config=self.read_config,
+                    connector_config=self.connector_config,
+                    processor_config=self.processor_config,
+                    remote_file_path=file,
+                )
+                for file in uncompressed_files
+            ],
+        )
+        if not self.connector_config.uncompress:
+            return docs
+        for compressed_file in compressed_files:
+            compressed_doc = self.ingest_doc_cls(
+                read_config=self.read_config,
+                processor_config=self.processor_config,
+                connector_config=self.connector_config,
+                remote_file_path=compressed_file,
+            )
+            try:
+                local_ingest_docs = self.process_compressed_doc(doc=compressed_doc)
+                logger.info(f"adding {len(local_ingest_docs)} from {compressed_file}")
+                docs.extend(local_ingest_docs)
+            finally:
+                compressed_doc.cleanup_file()
+        return docs
 
 
 @dataclass
@@ -245,7 +242,7 @@ class FsspecDestinationConnector(BaseDestinationConnector):
 
         for doc in docs:
            s3_file_path = doc.base_filename
-            s3_folder = self.connector_config.path
+            s3_folder = self.connector_config.remote_url
 
             s3_output_path = str(PurePath(s3_folder, s3_file_path)) if s3_file_path else s3_folder
             logger.debug(f"Uploading {doc._output_filename} -> {s3_output_path}")
```

unstructured/ingest/connector/git.py

```diff
@@ -57,7 +57,6 @@ class GitIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
     @SourceConnectionError.wrap
     @BaseIngestDoc.skip_if_file_exists
     def get_file(self):
-        print(self)
         """Fetches the "remote" doc and stores it locally on the filesystem."""
         self._create_full_tmp_dir_path()
         logger.debug(f"Fetching {self} - PID: {os.getpid()}")
```

unstructured/ingest/connector/local.py

```diff
@@ -70,7 +70,9 @@ class LocalSourceConnector(BaseSourceConnector):
     """Objects of this class support fetching document(s) from local file system"""
 
     connector_config: SimpleLocalConfig
-    ingest_doc_cls: t.Type[LocalIngestDoc] = LocalIngestDoc
+
+    def __post_init__(self):
+        self.ingest_doc_cls: t.Type[LocalIngestDoc] = LocalIngestDoc
 
     def cleanup(self, cur_dir=None):
         """Not applicable to local file system"""
```

unstructured/ingest/interfaces.py

```diff
@@ -4,6 +4,7 @@ through Unstructured."""
 import functools
 import json
 import os
+import re
 import typing as t
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
@@ -23,6 +24,17 @@ from unstructured.ingest.logger import logger
 from unstructured.partition.auto import partition
 from unstructured.staging.base import convert_to_dict, elements_from_json
 
+SUPPORTED_REMOTE_FSSPEC_PROTOCOLS = [
+    "s3",
+    "s3a",
+    "abfs",
+    "az",
+    "gs",
+    "gcs",
+    "box",
+    "dropbox",
+]
+
 
 @dataclass
 class BaseSessionHandle(ABC):
@@ -63,6 +75,57 @@ class ProcessorConfig(BaseConfig):
     raise_on_error: bool = False
 
 
+@dataclass
+class FileStorageConfig(BaseConfig):
+    remote_url: str
+    uncompress: bool = False
+    recursive: bool = False
+
+
+@dataclass
+class FsspecConfig(FileStorageConfig):
+    access_kwargs: dict = field(default_factory=dict)
+    protocol: str = field(init=False)
+    path_without_protocol: str = field(init=False)
+    dir_path: str = field(init=False)
+    file_path: str = field(init=False)
+
+    def get_access_kwargs(self) -> dict:
+        return self.access_kwargs
+
+    def __post_init__(self):
+        self.protocol, self.path_without_protocol = self.remote_url.split("://")
+        if self.protocol not in SUPPORTED_REMOTE_FSSPEC_PROTOCOLS:
+            raise ValueError(
+                f"Protocol {self.protocol} not supported yet, only "
+                f"{SUPPORTED_REMOTE_FSSPEC_PROTOCOLS} are supported.",
+            )
+
+        # dropbox root is an empty string
+        match = re.match(rf"{self.protocol}://([\s])/", self.remote_url)
+        if match and self.protocol == "dropbox":
+            self.dir_path = " "
+            self.file_path = ""
+            return
+
+        # just a path with no trailing prefix
+        match = re.match(rf"{self.protocol}://([^/\s]+?)(/*)$", self.remote_url)
+        if match:
+            self.dir_path = match.group(1)
+            self.file_path = ""
+            return
+
+        # valid path with a dir and/or file
+        match = re.match(rf"{self.protocol}://([^/\s]+?)/([^\s]*)", self.remote_url)
+        if not match:
+            raise ValueError(
+                f"Invalid path {self.remote_url}. "
+                f"Expected <protocol>://<dir-path>/<file-or-dir-path>.",
+            )
+        self.dir_path = match.group(1)
+        self.file_path = match.group(2) or ""
+
+
 @dataclass
 class ReadConfig(BaseConfig):
     # where raw documents are stored for processing, and then removed if not preserve_downloads
```
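
To make the `__post_init__` parsing above concrete, here is how a couple of remote URLs decompose (illustrative bucket names):

```python
from unstructured.ingest.interfaces import FsspecConfig

config = FsspecConfig(remote_url="s3://example-bucket/docs/archive.zip")
assert config.protocol == "s3"
assert config.path_without_protocol == "example-bucket/docs/archive.zip"
assert config.dir_path == "example-bucket"
assert config.file_path == "docs/archive.zip"

# A bare bucket with no trailing path leaves file_path empty:
config = FsspecConfig(remote_url="gs://example-bucket")
assert config.dir_path == "example-bucket"
assert config.file_path == ""
```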
|
@ -33,7 +33,7 @@ class Embedder(ReformatNode):
|
|||||||
filename_ext = os.path.basename(elements_json_filename)
|
filename_ext = os.path.basename(elements_json_filename)
|
||||||
filename = os.path.splitext(filename_ext)[0]
|
filename = os.path.splitext(filename_ext)[0]
|
||||||
hashed_filename = hashlib.sha256(
|
hashed_filename = hashlib.sha256(
|
||||||
f"{self.create_hash()}{filename}".encode()
|
f"{self.create_hash()}{filename}".encode(),
|
||||||
).hexdigest()[:32]
|
).hexdigest()[:32]
|
||||||
json_filename = f"{hashed_filename}.json"
|
json_filename = f"{hashed_filename}.json"
|
||||||
json_path = (Path(self.get_path()) / json_filename).resolve()
|
json_path = (Path(self.get_path()) / json_filename).resolve()
|
||||||
|

unstructured/ingest/runner/__init__.py

```diff
@@ -1,7 +1,9 @@
 import typing as t
+from typing import Type
 
 from .airtable import AirtableRunner
 from .azure import AzureRunner
+from .base_runner import FsspecBaseRunner, Runner
 from .biomed import BiomedRunner
 from .box import BoxRunner
 from .confluence import ConfluenceRunner
@@ -26,7 +28,7 @@ from .sharepoint import SharePointRunner
 from .slack import SlackRunner
 from .wikipedia import WikipediaRunner
 
-runner_map: t.Dict[str, t.Callable] = {
+runner_map: t.Dict[str, Type[Runner]] = {
     "airtable": AirtableRunner,
     "azure": AzureRunner,
     "biomed": BiomedRunner,
@@ -82,4 +84,6 @@ __all__ = [
     "SlackRunner",
     "WikipediaRunner",
     "runner_map",
+    "Runner",
+    "FsspecBaseRunner",
 ]
```

unstructured/ingest/runner/azure.py

```diff
@@ -2,18 +2,16 @@ import logging
 import typing as t
 
 from unstructured.ingest.logger import ingest_log_streaming_init, logger
-from unstructured.ingest.runner.base_runner import Runner
+from unstructured.ingest.runner.base_runner import FsspecBaseRunner
 from unstructured.ingest.runner.utils import update_download_dir_remote_url
 
 
-class AzureRunner(Runner):
+class AzureRunner(FsspecBaseRunner):
     def run(
         self,
         account_name: t.Optional[str],
         account_key: t.Optional[str],
         connection_string: t.Optional[str],
-        remote_url: str,
-        recursive: bool = False,
         **kwargs,
     ):
         ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO)
@@ -26,7 +24,7 @@ class AzureRunner(Runner):
         self.read_config.download_dir = update_download_dir_remote_url(
             connector_name="azure",
             read_config=self.read_config,
-            remote_url=remote_url,
+            remote_url=self.fsspec_config.remote_url,  # type: ignore
             logger=logger,
         )
 
@@ -44,13 +42,13 @@ class AzureRunner(Runner):
             access_kwargs = {"connection_string": connection_string}
         else:
             access_kwargs = {}
+        connector_config = SimpleAzureBlobStorageConfig.from_dict(
+            self.fsspec_config.to_dict(),  # type: ignore
+        )
+        connector_config.access_kwargs = access_kwargs
         source_doc_connector = AzureBlobStorageSourceConnector(  # type: ignore
             processor_config=self.processor_config,
-            connector_config=SimpleAzureBlobStorageConfig(
-                path=remote_url,
-                recursive=recursive,
-                access_kwargs=access_kwargs,
-            ),
+            connector_config=connector_config,
             read_config=self.read_config,
         )
 
```

unstructured/ingest/runner/base_runner.py

```diff
@@ -7,6 +7,7 @@ from unstructured.ingest.interfaces import (
     BaseSourceConnector,
     ChunkingConfig,
     EmbeddingConfig,
+    FsspecConfig,
     PartitionConfig,
     PermissionsConfig,
     ProcessorConfig,
@@ -60,3 +61,15 @@ class Runner(ABC):
             chunking_config=self.chunking_config,
             permissions_config=self.get_permissions_config(),
         )
+
+
+@dataclass
+class FsspecBaseRunner(Runner):
+    # TODO make this field required when python3.8 no longer supported
+    # python3.8 dataclass doesn't support default values in child classes, but this
+    # fsspec_config should be required in this class.
+    fsspec_config: t.Optional[FsspecConfig] = None
+
+    def __post_init__(self):
+        if self.fsspec_config is None:
+            raise ValueError("fsspec_config must exist")
```
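
The TODO above points at a python3.8 dataclass limitation: a parent field with a default (`fsspec_config`) would forbid required fields in child dataclasses, so the field stays optional and the runtime guard enforces it. A standalone sketch of the pattern (simplified stand-in types, not the real runner classes):

```python
import typing as t
from dataclasses import dataclass


@dataclass
class Base:
    fsspec_config: t.Optional[str] = None  # stand-in for FsspecConfig

    def __post_init__(self):
        if self.fsspec_config is None:
            raise ValueError("fsspec_config must exist")


@dataclass
class Child(Base):
    endpoint: str = "default"  # child defaults stay legal on python3.8


Child(fsspec_config="s3://bucket")  # constructs fine
try:
    Child()  # missing fsspec_config fails fast at construction time
except ValueError as err:
    print(err)
```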

unstructured/ingest/runner/box.py

```diff
@@ -2,15 +2,13 @@ import logging
 import typing as t
 
 from unstructured.ingest.logger import ingest_log_streaming_init, logger
-from unstructured.ingest.runner.base_runner import Runner
+from unstructured.ingest.runner.base_runner import FsspecBaseRunner
 from unstructured.ingest.runner.utils import update_download_dir_remote_url
 
 
-class BoxRunner(Runner):
+class BoxRunner(FsspecBaseRunner):
     def run(
         self,
-        remote_url: str,
-        recursive: bool = False,
         box_app_config: t.Optional[str] = None,
         **kwargs,
     ):
@@ -19,19 +17,17 @@ class BoxRunner(Runner):
         self.read_config.download_dir = update_download_dir_remote_url(
             connector_name="box",
             read_config=self.read_config,
-            remote_url=remote_url,
+            remote_url=self.fsspec_config.remote_url,  # type: ignore
             logger=logger,
         )
 
         from unstructured.ingest.connector.box import BoxSourceConnector, SimpleBoxConfig
 
+        connector_config = SimpleBoxConfig.from_dict(self.fsspec_config.to_dict())  # type: ignore
+        connector_config.access_kwargs = {"box_app_config": box_app_config}
         source_doc_connector = BoxSourceConnector(  # type: ignore
             read_config=self.read_config,
-            connector_config=SimpleBoxConfig(
-                path=remote_url,
-                recursive=recursive,
-                access_kwargs={"box_app_config": box_app_config},
-            ),
+            connector_config=connector_config,
             processor_config=self.processor_config,
         )
 
```

unstructured/ingest/runner/dropbox.py

```diff
@@ -2,15 +2,13 @@ import logging
 import typing as t
 
 from unstructured.ingest.logger import ingest_log_streaming_init, logger
-from unstructured.ingest.runner.base_runner import Runner
+from unstructured.ingest.runner.base_runner import FsspecBaseRunner
 from unstructured.ingest.runner.utils import update_download_dir_remote_url
 
 
-class DropboxRunner(Runner):
+class DropboxRunner(FsspecBaseRunner):
     def run(
         self,
-        remote_url: str,
-        recursive: bool = False,
         token: t.Optional[str] = None,
         **kwargs,
     ):
@@ -19,7 +17,7 @@ class DropboxRunner(Runner):
         self.read_config.download_dir = update_download_dir_remote_url(
             connector_name="dropbox",
             read_config=self.read_config,
-            remote_url=remote_url,
+            remote_url=self.fsspec_config.remote_url,  # type: ignore
             logger=logger,
         )
 
@@ -28,13 +26,13 @@ class DropboxRunner(Runner):
             SimpleDropboxConfig,
         )
 
+        connector_config = SimpleDropboxConfig.from_dict(
+            self.fsspec_config.to_dict(),  # type: ignore
+        )
+        connector_config.access_kwargs = {"token": token}
         source_doc_connector = DropboxSourceConnector(  # type: ignore
             read_config=self.read_config,
-            connector_config=SimpleDropboxConfig(
-                path=remote_url,
-                recursive=recursive,
-                access_kwargs={"token": token},
-            ),
+            connector_config=connector_config,
             processor_config=self.processor_config,
         )
 
```

unstructured/ingest/runner/fsspec.py

```diff
@@ -3,15 +3,13 @@ import warnings
 from urllib.parse import urlparse
 
 from unstructured.ingest.logger import ingest_log_streaming_init, logger
-from unstructured.ingest.runner.base_runner import Runner
+from unstructured.ingest.runner.base_runner import FsspecBaseRunner
 from unstructured.ingest.runner.utils import update_download_dir_remote_url
 
 
-class FsspecRunner(Runner):
+class FsspecRunner(FsspecBaseRunner):
     def run(
         self,
-        remote_url: str,
-        recursive: bool = False,
         **kwargs,
     ):
         ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO)
@@ -19,11 +17,11 @@ class FsspecRunner(Runner):
         self.read_config.download_dir = update_download_dir_remote_url(
             connector_name="fsspec",
             read_config=self.read_config,
-            remote_url=remote_url,
+            remote_url=self.fsspec_config.remote_url,  # type: ignore
             logger=logger,
         )
 
-        protocol = urlparse(remote_url).scheme
+        protocol = urlparse(self.fsspec_config.remote_url).scheme  # type: ignore
         warnings.warn(
             f"`fsspec` protocol {protocol} is not directly supported by `unstructured`,"
             " so use it at your own risk. Supported protocols are `gcs`, `gs`, `s3`, `s3a`,"
@@ -36,11 +34,11 @@ class FsspecRunner(Runner):
             SimpleFsspecConfig,
         )
 
+        connector_config = SimpleFsspecConfig.from_dict(
+            self.fsspec_config.to_dict(),  # type: ignore
+        )
         source_doc_connector = FsspecSourceConnector(  # type: ignore
-            connector_config=SimpleFsspecConfig(
-                path=remote_url,
-                recursive=recursive,
-            ),
+            connector_config=connector_config,
             read_config=self.read_config,
             processor_config=self.processor_config,
         )
```

unstructured/ingest/runner/gcs.py

```diff
@@ -2,15 +2,13 @@ import logging
 import typing as t
 
 from unstructured.ingest.logger import ingest_log_streaming_init, logger
-from unstructured.ingest.runner.base_runner import Runner
+from unstructured.ingest.runner.base_runner import FsspecBaseRunner
 from unstructured.ingest.runner.utils import update_download_dir_remote_url
 
 
-class GCSRunner(Runner):
+class GCSRunner(FsspecBaseRunner):
     def run(
         self,
-        remote_url: str,
-        recursive: bool = False,
         token: t.Optional[str] = None,
         **kwargs,
     ):
@@ -19,18 +17,17 @@ class GCSRunner(Runner):
         self.read_config.download_dir = update_download_dir_remote_url(
             connector_name="gcs",
             read_config=self.read_config,
-            remote_url=remote_url,
+            remote_url=self.fsspec_config.remote_url,  # type: ignore
             logger=logger,
         )
 
         from unstructured.ingest.connector.gcs import GcsSourceConnector, SimpleGcsConfig
 
+        connector_config = SimpleGcsConfig.from_dict(self.fsspec_config.to_dict())  # type: ignore
+        connector_config.access_kwargs = {"token": token}
+
         source_doc_connector = GcsSourceConnector(  # type: ignore
-            connector_config=SimpleGcsConfig(
-                path=remote_url,
-                recursive=recursive,
-                access_kwargs={"token": token},
-            ),
+            connector_config=connector_config,
             read_config=self.read_config,
             processor_config=self.processor_config,
         )
```

unstructured/ingest/runner/s3.py

```diff
@@ -2,15 +2,13 @@ import logging
 import typing as t
 
 from unstructured.ingest.logger import ingest_log_streaming_init, logger
-from unstructured.ingest.runner.base_runner import Runner
+from unstructured.ingest.runner.base_runner import FsspecBaseRunner
 from unstructured.ingest.runner.utils import update_download_dir_remote_url
 
 
-class S3Runner(Runner):
+class S3Runner(FsspecBaseRunner):
     def run(
         self,
-        remote_url: str,
-        recursive: bool = False,
         anonymous: bool = False,
         endpoint_url: t.Optional[str] = None,
         **kwargs,
@@ -20,7 +18,7 @@ class S3Runner(Runner):
         self.read_config.download_dir = update_download_dir_remote_url(
             connector_name="s3",
             read_config=self.read_config,
-            remote_url=remote_url,
+            remote_url=self.fsspec_config.remote_url,  # type: ignore
             logger=logger,
         )
 
@@ -29,12 +27,11 @@ class S3Runner(Runner):
         access_kwargs: t.Dict[str, t.Any] = {"anon": anonymous}
         if endpoint_url:
             access_kwargs["endpoint_url"] = endpoint_url
 
+        connector_config = SimpleS3Config.from_dict(self.fsspec_config.to_dict())  # type: ignore
+        connector_config.access_kwargs = access_kwargs
         source_doc_connector = S3SourceConnector(  # type: ignore
-            connector_config=SimpleS3Config(
-                path=remote_url,
-                recursive=recursive,
-                access_kwargs=access_kwargs,
-            ),
+            connector_config=connector_config,
             read_config=self.read_config,
             processor_config=self.processor_config,
         )
```

unstructured/ingest/runner/writers.py

```diff
@@ -25,7 +25,7 @@ def s3_writer(
     return S3DestinationConnector(
         write_config=WriteConfig(),
         connector_config=SimpleS3Config(
-            path=remote_url,
+            remote_url=remote_url,
             access_kwargs=access_kwargs,
         ),
     )
```