mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-25 22:23:24 +00:00
update ingest python doc (#1446)
### Description Updates the Python version of the example docs to show how to run the same code that the CLI runs, but from Python. Rather than copying the command that would be run in the terminal and using the subprocess library to run it, the examples now use the supported code exposed in the ingest directory. For now, only the Wikipedia one has been updated, to get some opinions on this before updating all other connector docs. Would close out https://github.com/Unstructured-IO/unstructured/issues/1445
This commit is contained in:
parent
89bd2faaf7
commit
9d81971fcb
@ -1,8 +1,15 @@
|
||||
## 0.10.19-dev4
|
||||
## 0.10.19-dev5
|
||||
|
||||
### Enhancements
|
||||
|
||||
* **bump `unstructured-inference` to `0.6.6`** The updated version of `unstructured-inference` makes table extraction in `hi_res` mode configurable to fine-tune table extraction performance; it also improves element detection by adding a deduplication post-processing step in the `hi_res` partitioning of PDFs and images.
|
||||
* **Update Python-based docs** Refactor docs to use the actual `unstructured` code rather than using the subprocess library to run the CLI command itself.
|
||||
|
||||
## 0.10.17-dev3
|
||||
|
||||
### Enhancements
|
||||
|
||||
* **Adds data source properties to SharePoint, Outlook, Onedrive, Reddit, and Slack connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc.
|
||||
|
||||
### Features
|
||||
|
||||
|
||||
@ -29,29 +29,21 @@ Run Locally
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"airtable",
|
||||
"--metadata-exclude", "filename,file_directory,metadata.data_source.date_processed",
|
||||
"--personal-access-token", "$AIRTABLE_PERSONAL_ACCESS_TOKEN",
|
||||
"--output-dir", "airtable-ingest-output"
|
||||
"--num-processes", "2",
|
||||
"--reprocess",
|
||||
]
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.airtable import airtable
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
airtable(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="airtable-ingest-output",
|
||||
num_processes=2,
|
||||
),
|
||||
personal_access_token=os.getenv("AIRTABLE_PERSONAL_ACCESS_TOKEN"),
|
||||
)
|
||||
|
||||
Run via the API
|
||||
---------------
|
||||
@ -78,31 +70,23 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"airtable",
|
||||
"--metadata-exclude", "filename,file_directory,metadata.data_source.date_processed",
|
||||
"--personal-access-token", "$AIRTABLE_PERSONAL_ACCESS_TOKEN",
|
||||
"--output-dir", "airtable-ingest-output"
|
||||
"--num-processes", "2",
|
||||
"--reprocess",
|
||||
"--partition-by-api",
|
||||
"--api-key", "<UNSTRUCTURED-API-KEY>",
|
||||
]
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.airtable import airtable
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
airtable(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="airtable-ingest-output",
|
||||
num_processes=2,
|
||||
partition_by_api=True,
|
||||
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
|
||||
),
|
||||
personal_access_token=os.getenv("AIRTABLE_PERSONAL_ACCESS_TOKEN"),
|
||||
)
|
||||
|
||||
Additionally, you will need to pass the ``--partition-endpoint`` flag if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.
|
||||
|
||||
|
||||
@ -28,28 +28,20 @@ Run Locally
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.azure import azure
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"azure",
|
||||
"--remote-url", "abfs://container1/",
|
||||
"--account-name", "azureunstructured1"
|
||||
"--output-dir", "/Output/Path/To/Files",
|
||||
"--num-processes", "2",
|
||||
]
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
azure(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="azure-ingest-output",
|
||||
num_processes=2,
|
||||
),
|
||||
remote_url="abfs://container1/",
|
||||
account_name="azureunstructured1",
|
||||
)
|
||||
|
||||
Run via the API
|
||||
---------------
|
||||
@ -62,43 +54,24 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
|
||||
|
||||
.. code:: shell
|
||||
|
||||
unstructured-ingest \
|
||||
azure \
|
||||
--remote-url abfs://container1/ \
|
||||
--account-name azureunstructured1 \
|
||||
--output-dir azure-ingest-output \
|
||||
--num-processes 2 \
|
||||
--partition-by-api \
|
||||
--api-key "<UNSTRUCTURED-API-KEY>"
|
||||
import os
|
||||
|
||||
.. tab:: Python
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.azure import azure
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"azure",
|
||||
"--remote-url", "abfs://container1/",
|
||||
"--account-name", "azureunstructured1"
|
||||
"--output-dir", "/Output/Path/To/Files",
|
||||
"--num-processes", "2",
|
||||
"--partition-by-api",
|
||||
"--api-key", "<UNSTRUCTURED-API-KEY>",
|
||||
]
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
azure(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="azure-ingest-output",
|
||||
num_processes=2,
|
||||
partition_by_api=True,
|
||||
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
|
||||
),
|
||||
remote_url="abfs://container1/",
|
||||
account_name="azureunstructured1",
|
||||
)
|
||||
|
||||
Additionally, you will need to pass the ``--partition-endpoint`` flag if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.
|
||||
|
||||
|
||||
@ -29,29 +29,21 @@ Run Locally
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.biomed import biomed
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"biomed",
|
||||
"--path", "oa_pdf/07/07/sbaa031.073.PMC7234218.pdf",
|
||||
"--output-dir", "/Output/Path/To/Files",
|
||||
"--num-processes", "2",
|
||||
"--verbose",
|
||||
"--preserve-downloads",
|
||||
]
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
biomed(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(
|
||||
preserve_downloads=True,
|
||||
),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="biomed-ingest-output-path",
|
||||
num_processes=2,
|
||||
),
|
||||
path="oa_pdf/07/07/sbaa031.073.PMC7234218.pdf",
|
||||
)
|
||||
|
||||
Run via the API
|
||||
---------------
|
||||
@ -78,31 +70,25 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"biomed",
|
||||
"--path", "oa_pdf/07/07/sbaa031.073.PMC7234218.pdf",
|
||||
"--output-dir", "/Output/Path/To/Files",
|
||||
"--num-processes", "2",
|
||||
"--verbose",
|
||||
"--preserve-downloads",
|
||||
"--partition-by-api",
|
||||
"--api-key", "<UNSTRUCTURED-API-KEY>",
|
||||
]
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.biomed import biomed
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
biomed(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(
|
||||
preserve_downloads=True,
|
||||
),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="biomed-ingest-output-path",
|
||||
num_processes=2,
|
||||
partition_by_api=True,
|
||||
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
|
||||
),
|
||||
path="oa_pdf/07/07/sbaa031.073.PMC7234218.pdf",
|
||||
)
|
||||
|
||||
Additionally, you will need to pass the ``--partition-endpoint`` flag if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.
|
||||
|
||||
|
||||
@ -30,30 +30,23 @@ Run Locally
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"box",
|
||||
"--box_app_config", "$BOX_APP_CONFIG_PATH"
|
||||
"--remote-url", "box://utic-test-ingest-fixtures"
|
||||
"--output-dir", "box-output"
|
||||
"--num-processes", "2"
|
||||
"--recursive",
|
||||
"--verbose",
|
||||
]
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.box import box
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
box(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="box-output",
|
||||
num_processes=2,
|
||||
),
|
||||
box_app_config=os.getenv("BOX_APP_CONFIG_PATH"),
|
||||
recursive=True,
|
||||
remote_url="box://utic-test-ingest-fixtures",
|
||||
)
|
||||
|
||||
Run via the API
|
||||
---------------
|
||||
@ -81,32 +74,25 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"box",
|
||||
"--box_app_config", "$BOX_APP_CONFIG_PATH"
|
||||
"--remote-url", "box://utic-test-ingest-fixtures"
|
||||
"--output-dir", "box-output"
|
||||
"--num-processes", "2"
|
||||
"--recursive",
|
||||
"--verbose",
|
||||
"--partition-by-api",
|
||||
"--api-key", "<UNSTRUCTURED-API-KEY>",
|
||||
]
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.box import box
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
box(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="box-output",
|
||||
num_processes=2,
|
||||
partition_by_api=True,
|
||||
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
|
||||
),
|
||||
box_app_config=os.getenv("BOX_APP_CONFIG_PATH"),
|
||||
recursive=True,
|
||||
remote_url="box://utic-test-ingest-fixtures",
|
||||
)
|
||||
|
||||
Additionally, you will need to pass the ``--partition-endpoint`` flag if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.
|
||||
|
||||
|
||||
@ -30,30 +30,22 @@ Run Locally
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.confluence import confluence
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"confluence",
|
||||
"--metadata-exclude", "filename,file_directory,metadata.data_source.date_processed",
|
||||
"--url", "https://unstructured-ingest-test.atlassian.net",
|
||||
"--user-email", "12345678@unstructured.io",
|
||||
"--api-token", "ABCDE1234ABDE1234ABCDE1234",
|
||||
"--output-dir", "confluence-ingest-output",
|
||||
"--num-processes", "2",
|
||||
]
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
confluence(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="confluence-ingest-output",
|
||||
num_processes=2,
|
||||
metadata_exclude=["filename", "file_directory", "metadata.data_source.date_processed"],
|
||||
),
|
||||
url="https://unstructured-ingest-test.atlassian.net",
|
||||
user_email="12345678@unstructured.io",
|
||||
api_token="ABCDE1234ABDE1234ABCDE1234",
|
||||
)
|
||||
|
||||
Run via the API
|
||||
---------------
|
||||
@ -81,32 +73,26 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"confluence",
|
||||
"--metadata-exclude", "filename,file_directory,metadata.data_source.date_processed",
|
||||
"--url", "https://unstructured-ingest-test.atlassian.net",
|
||||
"--user-email", "12345678@unstructured.io",
|
||||
"--api-token", "ABCDE1234ABDE1234ABCDE1234",
|
||||
"--output-dir", "confluence-ingest-output",
|
||||
"--num-processes", "2",
|
||||
"--partition-by-api",
|
||||
"--api-key", "<UNSTRUCTURED-API-KEY>",
|
||||
]
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.confluence import confluence
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
confluence(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="confluence-ingest-output",
|
||||
num_processes=2,
|
||||
metadata_exclude=["filename", "file_directory", "metadata.data_source.date_processed"],
|
||||
partition_by_api=True,
|
||||
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
|
||||
),
|
||||
url="https://unstructured-ingest-test.atlassian.net",
|
||||
user_email="12345678@unstructured.io",
|
||||
api_token="ABCDE1234ABDE1234ABCDE1234",
|
||||
)
|
||||
|
||||
Additionally, you will need to pass the ``--partition-endpoint`` flag if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.
|
||||
|
||||
|
||||
@ -29,30 +29,20 @@ Run Locally
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.delta_table import delta_table
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"delta-table",
|
||||
"--table-uri", "s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/",
|
||||
"--download-dir", "delta-table-ingest-download",
|
||||
"--output-dir", "delta-table-example",
|
||||
"--preserve-downloads",
|
||||
"--storage_options", "AWS_REGION=us-east-2,AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY",
|
||||
"--verbose",
|
||||
]
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
delta_table(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="delta-table-example",
|
||||
num_processes=2,
|
||||
),
|
||||
table_uri="s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/",
|
||||
storage_options="AWS_REGION=us-east-2,AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY"
|
||||
)
|
||||
|
||||
|
||||
Run via the API
|
||||
@ -79,32 +69,24 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"delta-table",
|
||||
"--table-uri", "s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/",
|
||||
"--download-dir", "delta-table-ingest-download",
|
||||
"--output-dir", "delta-table-example",
|
||||
"--preserve-downloads",
|
||||
"--storage_options", "AWS_REGION=us-east-2,AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY",
|
||||
"--verbose",
|
||||
"--partition-by-api",
|
||||
"--api-key", "<UNSTRUCTURED-API-KEY>",
|
||||
]
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.delta_table import delta_table
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
delta_table(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="delta-table-example",
|
||||
num_processes=2,
|
||||
partition_by_api=True,
|
||||
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
|
||||
),
|
||||
table_uri="s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/",
|
||||
storage_options="AWS_REGION=us-east-2,AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY"
|
||||
)
|
||||
|
||||
Additionally, you will need to pass the ``--partition-endpoint`` flag if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.
|
||||
|
||||
|
||||
@ -30,30 +30,26 @@ Run Locally
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"discord",
|
||||
"--channels", "12345678",
|
||||
"--token", "$DISCORD_TOKEN",
|
||||
"--download-dir", "discord-ingest-download",
|
||||
"--output-dir", "discord-example",
|
||||
"--preserve-downloads",
|
||||
"--verbose",
|
||||
]
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.discord import discord
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
discord(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(
|
||||
download_dir="discord-ingest-download",
|
||||
preserve_downloads=True,
|
||||
),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="discord-example",
|
||||
num_processes=2,
|
||||
),
|
||||
channels=["12345678"],
|
||||
token=os.getenv("DISCORD_TOKEN"),
|
||||
period=None,
|
||||
)
|
||||
|
||||
Run via the API
|
||||
---------------
|
||||
@ -81,32 +77,28 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"discord",
|
||||
"--channels", "12345678",
|
||||
"--token", "$DISCORD_TOKEN",
|
||||
"--download-dir", "discord-ingest-download",
|
||||
"--output-dir", "discord-example",
|
||||
"--preserve-downloads",
|
||||
"--verbose",
|
||||
"--partition-by-api",
|
||||
"--api-key", "<UNSTRUCTURED-API-KEY>",
|
||||
]
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.discord import discord
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
discord(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(
|
||||
download_dir="discord-ingest-download",
|
||||
preserve_downloads=True,
|
||||
),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="discord-example",
|
||||
num_processes=2,
|
||||
partition_by_api=True,
|
||||
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
|
||||
),
|
||||
channels=["12345678"],
|
||||
token=os.getenv("DISCORD_TOKEN"),
|
||||
period=None,
|
||||
)
|
||||
|
||||
Additionally, you will need to pass the ``--partition-endpoint`` flag if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.
|
||||
|
||||
|
||||
@ -30,30 +30,23 @@ Run Locally
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"dropbox",
|
||||
"--remote-url", "dropbox:// /",
|
||||
"--output-dir", "dropbox-output",
|
||||
"--token", "$DROPBOX_TOKEN",
|
||||
"--num-processes", "2",
|
||||
"--recursive",
|
||||
"--verbose",
|
||||
]
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.dropbox import dropbox
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
dropbox(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="dropbox-output",
|
||||
num_processes=2,
|
||||
),
|
||||
remote_url="dropbox:// /",
|
||||
token=os.getenv("DROPBOX_TOKEN"),
|
||||
recursive=True,
|
||||
)
|
||||
|
||||
Run via the API
|
||||
---------------
|
||||
@ -81,32 +74,25 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"dropbox",
|
||||
"--remote-url", "dropbox:// /",
|
||||
"--output-dir", "dropbox-output",
|
||||
"--token", "$DROPBOX_TOKEN",
|
||||
"--num-processes", "2",
|
||||
"--recursive",
|
||||
"--verbose",
|
||||
"--partition-by-api",
|
||||
"--api-key", "<UNSTRUCTURED-API-KEY>",
|
||||
]
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.dropbox import dropbox
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
dropbox(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="dropbox-output",
|
||||
num_processes=2,
|
||||
partition_by_api=True,
|
||||
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
|
||||
),
|
||||
remote_url="dropbox:// /",
|
||||
token=os.getenv("DROPBOX_TOKEN"),
|
||||
recursive=True,
|
||||
)
|
||||
|
||||
Additionally, you will need to pass the ``--partition-endpoint`` flag if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.
|
||||
|
||||
|
||||
@ -30,30 +30,22 @@ Run Locally
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.elasticsearch import elasticsearch
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"elasticsearch",
|
||||
"--metadata-exclude", "filename,file_directory,metadata.data_source.date_processed",
|
||||
"--url", "http://localhost:9200",
|
||||
"--index-name", "movies",
|
||||
"--jq-query", "{ethnicity, director, plot}",
|
||||
"--output-dir", "elasticsearch-ingest-output",
|
||||
"--num-processes", "2"
|
||||
]
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
elasticsearch(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="elasticsearch-ingest-output",
|
||||
num_processes=2,
|
||||
metadata_exclude=["filename", "file_directory", "metadata.data_source.date_processed"],
|
||||
),
|
||||
url="http://localhost:9200",
|
||||
index_name="movies",
|
||||
jq_query="{ethnicity, director, plot}",
|
||||
)
|
||||
|
||||
Run via the API
|
||||
---------------
|
||||
@ -81,32 +73,26 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"elasticsearch",
|
||||
"--metadata-exclude", "filename,file_directory,metadata.data_source.date_processed",
|
||||
"--url", "http://localhost:9200",
|
||||
"--index-name", "movies",
|
||||
"--jq-query", "{ethnicity, director, plot}",
|
||||
"--output-dir", "elasticsearch-ingest-output",
|
||||
"--num-processes", "2",
|
||||
"--partition-by-api",
|
||||
"--api-key", "<UNSTRUCTURED-API-KEY>",
|
||||
]
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.elasticsearch import elasticsearch
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
elasticsearch(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="elasticsearch-ingest-output",
|
||||
num_processes=2,
|
||||
metadata_exclude=["filename", "file_directory", "metadata.data_source.date_processed"],
|
||||
partition_by_api=True,
|
||||
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
|
||||
),
|
||||
url="http://localhost:9200",
|
||||
index_name="movies",
|
||||
jq_query="{ethnicity, director, plot}",
|
||||
)
|
||||
|
||||
Additionally, you will need to pass the ``--partition-endpoint`` flag if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.
|
||||
|
||||
|
||||
@ -29,29 +29,20 @@ Run Locally
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.github import github
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"github",
|
||||
"--url", "Unstructured-IO/unstructured",
|
||||
"--git-branch", "main",
|
||||
"--output-dir", "github-ingest-output",
|
||||
"--num-processes", "2",
|
||||
"--verbose",
|
||||
]
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
github(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="github-ingest-output",
|
||||
num_processes=2,
|
||||
),
|
||||
url="Unstructured-IO/unstructured",
|
||||
git_branch="main",
|
||||
)
|
||||
|
||||
Run via the API
|
||||
---------------
|
||||
@ -78,31 +69,24 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"github",
|
||||
"--url", "Unstructured-IO/unstructured",
|
||||
"--git-branch", "main",
|
||||
"--output-dir", "github-ingest-output",
|
||||
"--num-processes", "2",
|
||||
"--verbose",
|
||||
"--partition-by-api",
|
||||
"--api-key", "<UNSTRUCTURED-API-KEY>",
|
||||
]
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.github import github
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
github(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="github-ingest-output",
|
||||
num_processes=2,
|
||||
partition_by_api=True,
|
||||
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
|
||||
),
|
||||
url="Unstructured-IO/unstructured",
|
||||
git_branch="main",
|
||||
)
|
||||
|
||||
Additionally, you will need to pass the ``--partition-endpoint`` flag if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.
|
||||
|
||||
|
||||
@ -29,29 +29,20 @@ Run Locally
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.gitlab import gitlab
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"gitlab",
|
||||
"--url", "Unstructured-IO/unstructured",
|
||||
"--git-branch", "v0.0.7",
|
||||
"--output-dir", "gitlab-ingest-output",
|
||||
"--num-processes", "2",
|
||||
"--verbose",
|
||||
]
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
gitlab(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="gitlab-ingest-output",
|
||||
num_processes=2,
|
||||
),
|
||||
url="https://gitlab.com/gitlab-com/content-sites/docsy-gitlab",
|
||||
git_branch="v0.0.7",
|
||||
)
|
||||
|
||||
Run via the API
|
||||
---------------
|
||||
@ -78,31 +69,24 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"gitlab",
|
||||
"--url", "Unstructured-IO/unstructured",
|
||||
"--git-branch", "v0.0.7",
|
||||
"--output-dir", "gitlab-ingest-output",
|
||||
"--num-processes", "2",
|
||||
"--verbose",
|
||||
"--partition-by-api",
|
||||
"--api-key", "<UNSTRUCTURED-API-KEY>",
|
||||
]
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.gitlab import gitlab
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
gitlab(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="gitlab-ingest-output",
|
||||
num_processes=2,
|
||||
partition_by_api=True,
|
||||
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
|
||||
),
|
||||
url="https://gitlab.com/gitlab-com/content-sites/docsy-gitlab",
|
||||
git_branch="v0.0.7",
|
||||
)
|
||||
|
||||
Additionally, you will need to pass the ``--partition-endpoint`` flag if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.
|
||||
|
||||
|
||||
@ -29,29 +29,20 @@ Run Locally
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.gcs import gcs
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"gcs",
|
||||
"--remote-url", "gs://utic-test-ingest-fixtures-public/",
|
||||
"--output-dir", "dropbox-output",
|
||||
"--num-processes", "2",
|
||||
"--recursive",
|
||||
"--verbose",
|
||||
]
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
gcs(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="gcs-output",
|
||||
num_processes=2,
|
||||
),
|
||||
remote_url="gs://utic-test-ingest-fixtures-public/",
|
||||
recursive=True,
|
||||
)
|
||||
|
||||
Run via the API
|
||||
---------------
|
||||
@ -76,29 +67,24 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"gcs",
|
||||
"--remote-url", "gs://utic-test-ingest-fixtures-public/",
|
||||
"--output-dir", "dropbox-output",
|
||||
"--num-processes", "2",
|
||||
"--recursive",
|
||||
"--verbose",
|
||||
]
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.gcs import gcs
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
gcs(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="gcs-output",
|
||||
num_processes=2,
|
||||
partition_by_api=True,
|
||||
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
|
||||
),
|
||||
remote_url="gs://utic-test-ingest-fixtures-public/",
|
||||
recursive=True,
|
||||
)
|
||||
|
||||
Additionally, you will need to pass the ``--partition-endpoint`` flag if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.
|
||||
|
||||
|
||||
@ -30,28 +30,21 @@ Run Locally
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.google_drive import gdrive
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"google-drive",
|
||||
"--drive-id", "<file or folder id>",
|
||||
"--service-account-key",, "Path/To/Your/Service/Account/Key"
|
||||
"--output-dir", "/Output/Path/To/Files",
|
||||
"--num-processes", "2",
|
||||
]
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
gdrive(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="google-drive-ingest-output",
|
||||
num_processes=2,
|
||||
),
|
||||
drive_id="POPULATE WITH FILE OR FOLDER ID",
|
||||
service_account_key="POPULATE WITH DRIVE SERVICE ACCOUNT KEY",
|
||||
recursive=True,
|
||||
)
|
||||
|
||||
Run via the API
|
||||
---------------
|
||||
@ -79,30 +72,25 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"google-drive",
|
||||
"--drive-id", "<file or folder id>",
|
||||
"--service-account-key",, "Path/To/Your/Service/Account/Key"
|
||||
"--output-dir", "/Output/Path/To/Files",
|
||||
"--num-processes", "2",
|
||||
"--partition-by-api",
|
||||
"--api-key", "<UNSTRUCTURED-API-KEY>",
|
||||
]
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.google_drive import gdrive
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
gdrive(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="google-drive-ingest-output",
|
||||
num_processes=2,
|
||||
partition_by_api=True,
|
||||
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
|
||||
),
|
||||
drive_id="POPULATE WITH FILE OR FOLDER ID",
|
||||
service_account_key="POPULATE WITH DRIVE SERVICE ACCOUNT KEY",
|
||||
recursive=True,
|
||||
)
|
||||
|
||||
Additionally, you will need to pass the ``--partition-endpoint`` flag if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.
|
||||
|
||||
|
||||
@ -31,30 +31,22 @@ Run Locally
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.jira import jira
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"jira",
|
||||
"--metadata-exclude", "filename,file_directory,metadata.data_source.date_processed",
|
||||
"--url", "https://unstructured-jira-connector-test.atlassian.net",
|
||||
"--user-email", "12345678@unstructured.io",
|
||||
"--api-token", "ABCDE1234ABDE1234ABCDE1234",
|
||||
"--output-dir", "jira-ingest-output",
|
||||
"--num-processes", "2",
|
||||
]
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
jira(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="jira-ingest-output",
|
||||
num_processes=2,
|
||||
metadata_exclude=["filename", "file_directory", "metadata.data_source.date_processed"],
|
||||
),
|
||||
url="https://unstructured-jira-connector-test.atlassian.net",
|
||||
user_email="12345678@unstructured.io",
|
||||
api_token="ABCDE1234ABDE1234ABCDE1234",
|
||||
)
|
||||
|
||||
Run via the API
|
||||
---------------
|
||||
@ -82,32 +74,26 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"jira",
|
||||
"--metadata-exclude", "filename,file_directory,metadata.data_source.date_processed",
|
||||
"--url", "https://unstructured-jira-connector-test.atlassian.net",
|
||||
"--user-email", "12345678@unstructured.io",
|
||||
"--api-token", "ABCDE1234ABDE1234ABCDE1234",
|
||||
"--output-dir", "jira-ingest-output",
|
||||
"--num-processes", "2",
|
||||
"--partition-by-api",
|
||||
"--api-key", "<UNSTRUCTURED-API-KEY>",
|
||||
]
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.jira import jira
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
jira(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="jira-ingest-output",
|
||||
num_processes=2,
|
||||
metadata_exclude=["filename", "file_directory", "metadata.data_source.date_processed"],
|
||||
partition_by_api=True,
|
||||
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
|
||||
),
|
||||
url="https://unstructured-jira-connector-test.atlassian.net",
|
||||
user_email="12345678@unstructured.io",
|
||||
api_token="ABCDE1234ABDE1234ABCDE1234",
|
||||
)
|
||||
|
||||
Additionally, you will need to pass the ``--partition-endpoint`` flag if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.
|
||||
|
||||
|
||||
@ -23,29 +23,20 @@ Run Locally
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.local import local
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"local",
|
||||
"--input-path", "example-docs",
|
||||
"--output-dir", "dropbox-output",
|
||||
"--num-processes", "2",
|
||||
"--recursive",
|
||||
"--verbose",
|
||||
]
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
local(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="local-ingest-output",
|
||||
num_processes=2,
|
||||
),
|
||||
input_path="example-docs",
|
||||
recursive=True,
|
||||
)
|
||||
|
||||
Run via the API
|
||||
---------------
|
||||
@ -72,31 +63,24 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"local",
|
||||
"--input-path", "example-docs",
|
||||
"--output-dir", "dropbox-output",
|
||||
"--num-processes", "2",
|
||||
"--recursive",
|
||||
"--verbose",
|
||||
"--partition-by-api",
|
||||
"--api-key", "<UNSTRUCTURED-API-KEY>",
|
||||
]
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.local import local
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
local(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="local-ingest-output",
|
||||
num_processes=2,
|
||||
partition_by_api=True,
|
||||
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
|
||||
),
|
||||
input_path="example-docs",
|
||||
recursive=True,
|
||||
)
|
||||
|
||||
Additionally, you will need to pass the ``--partition-endpoint`` flag if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.
|
||||
|
||||
|
||||
@ -30,30 +30,22 @@ Run Locally
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.notion import notion
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"notion",
|
||||
"--api-key", "<Notion api key>",
|
||||
"--output-dir", "notion-ingest-output",
|
||||
"--page-ids", "<Comma delimited list of page ids to process>",
|
||||
"--database-ids", ""<Comma delimited list of database ids to process>"",
|
||||
"--num-processes", "2",
|
||||
"--verbose",
|
||||
]
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
notion(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="notion-ingest-output",
|
||||
num_processes=2,
|
||||
),
|
||||
api_key="POPULATE API KEY",
|
||||
page_ids=["LIST", "OF", "PAGE", "IDS"],
|
||||
database_ids=["LIST", "OF", "DATABASE", "IDS"],
|
||||
recursive=False,
|
||||
)
|
||||
|
||||
Run via the API
|
||||
---------------
|
||||
@ -81,32 +73,26 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"notion",
|
||||
"--api-key", "<Notion api key>",
|
||||
"--output-dir", "notion-ingest-output",
|
||||
"--page-ids", "<Comma delimited list of page ids to process>",
|
||||
"--database-ids", ""<Comma delimited list of database ids to process>"",
|
||||
"--num-processes", "2",
|
||||
"--verbose",
|
||||
"--partition-by-api",
|
||||
"--api-key", "<UNSTRUCTURED-API-KEY>",
|
||||
]
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.notion import notion
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
notion(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="notion-ingest-output",
|
||||
num_processes=2,
|
||||
partition_by_api=True,
|
||||
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
|
||||
),
|
||||
api_key="POPULATE API KEY",
|
||||
page_ids=["LIST", "OF", "PAGE", "IDS"],
|
||||
database_ids=["LIST", "OF", "DATABASE", "IDS"],
|
||||
recursive=False,
|
||||
)
|
||||
|
||||
Additionally, you will need to pass the ``--partition-endpoint`` flag if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.
|
||||
|
||||
|
||||
@ -33,33 +33,25 @@ Run Locally
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.onedrive import onedrive
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"onedrive",
|
||||
"--client-id", "<Azure AD app client-id>",
|
||||
"--client-cred", "<Azure AD app client-secret>",
|
||||
"--authority-url", "<Authority URL, default is https://login.microsoftonline.com>",
|
||||
"--tenant", "<Azure AD tenant_id, default is 'common'>",
|
||||
"--user-pname", "<Azure AD principal name, in most cases is the email linked to the drive>",
|
||||
"--path", "<Path to start parsing files from>",
|
||||
"--output-dir", "onedrive-ingest-output",
|
||||
"--num-processes", "2",
|
||||
"--verbose"
|
||||
]
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
onedrive(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="onedrive-ingest-output",
|
||||
num_processes=2,
|
||||
),
|
||||
client_id="<Azure AD app client-id>",
|
||||
client_cred="<Azure AD app client-secret>",
|
||||
authority_url="<Authority URL, default is https://login.microsoftonline.com>",
|
||||
tenant="<Azure AD tenant_id, default is 'common'>",
|
||||
user_pname="<Azure AD principal name, in most cases is the email linked to the drive>",
|
||||
path="<Path to start parsing files from>",
|
||||
recursive=False,
|
||||
)
|
||||
|
||||
Run via the API
|
||||
---------------
|
||||
@ -90,35 +82,29 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"onedrive",
|
||||
"--client-id", "<Azure AD app client-id>",
|
||||
"--client-cred", "<Azure AD app client-secret>",
|
||||
"--authority-url", "<Authority URL, default is https://login.microsoftonline.com>",
|
||||
"--tenant", "<Azure AD tenant_id, default is 'common'>",
|
||||
"--user-pname", "<Azure AD principal name, in most cases is the email linked to the drive>",
|
||||
"--path", "<Path to start parsing files from>",
|
||||
"--output-dir", "onedrive-ingest-output",
|
||||
"--num-processes", "2",
|
||||
"--verbose",
|
||||
"--partition-by-api",
|
||||
"--api-key", "<UNSTRUCTURED-API-KEY>",
|
||||
]
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.onedrive import onedrive
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
onedrive(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="onedrive-ingest-output",
|
||||
num_processes=2,
|
||||
partition_by_api=True,
|
||||
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
|
||||
),
|
||||
client_id="<Azure AD app client-id>",
|
||||
client_cred="<Azure AD app client-secret>",
|
||||
authority_url="<Authority URL, default is https://login.microsoftonline.com>",
|
||||
tenant="<Azure AD tenant_id, default is 'common'>",
|
||||
user_pname="<Azure AD principal name, in most cases is the email linked to the drive>",
|
||||
path="<Path to start parsing files from>",
|
||||
recursive=False,
|
||||
)
|
||||
|
||||
Additionally, you will need to pass the ``--partition-endpoint`` flag if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.
|
||||
|
||||
|
||||
@ -33,33 +33,26 @@ Run Locally
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"outlook",
|
||||
"--client-id", "$MS_CLIENT_ID",
|
||||
"--client-cred", "$MS_CLIENT_CRED",
|
||||
"--tenant", "<Azure AD tenant_id, default is 'common'>",
|
||||
"--user-email", "$MS_USER_EMAIL",
|
||||
"--outlook-folders", "Inbox,Sent Items",
|
||||
"--output-dir", "onedrive-ingest-output",
|
||||
"--num-processes", "2",
|
||||
"--recursive",
|
||||
"--verbose",
|
||||
]
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.outlook import outlook
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
outlook(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="outlook-output",
|
||||
num_processes=2,
|
||||
),
|
||||
client_id=os.getenv("MS_CLIENT_ID"),
|
||||
client_cred=os.getenv("MS_CLIENT_CRED"),
|
||||
tenant=os.getenv("MS_TENANT_ID"),
|
||||
user_email=os.getenv("MS_USER_EMAIL"),
|
||||
outlook_folders=["Inbox", "Sent Items"],
|
||||
recursive=True,
|
||||
)
|
||||
|
||||
Run via the API
|
||||
---------------
|
||||
@ -86,31 +79,28 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"airtable",
|
||||
"--metadata-exclude", "filename,file_directory,metadata.data_source.date_processed",
|
||||
"--personal-access-token", "$AIRTABLE_PERSONAL_ACCESS_TOKEN",
|
||||
"--output-dir", "airtable-ingest-output"
|
||||
"--num-processes", "2",
|
||||
"--reprocess",
|
||||
"--partition-by-api",
|
||||
"--api-key", "<UNSTRUCTURED-API-KEY>",
|
||||
]
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.outlook import outlook
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
outlook(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="outlook-output",
|
||||
num_processes=2,
|
||||
partition_by_api=True,
|
||||
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
|
||||
),
|
||||
client_id=os.getenv("MS_CLIENT_ID"),
|
||||
client_cred=os.getenv("MS_CLIENT_CRED"),
|
||||
tenant=os.getenv("MS_TENANT_ID"),
|
||||
user_email=os.getenv("MS_USER_EMAIL"),
|
||||
outlook_folders=["Inbox", "Sent Items"],
|
||||
recursive=True,
|
||||
)
|
||||
|
||||
Additionally, you will need to pass the ``--partition-endpoint`` flag if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.
|
||||
|
||||
|
||||
@ -33,33 +33,24 @@ Run Locally
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.reddit import reddit
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"reddit",
|
||||
"--subreddit-name", "machinelearning",
|
||||
"--client-id", "<client id here>",
|
||||
"--client-secret", "<client secret here>",
|
||||
"--user-agent", "Unstructured Ingest Subreddit fetcher by \\u\\...",
|
||||
"--search-query", "Unstructured",
|
||||
"--num-posts", "10",
|
||||
"--output-dir", "reddit-ingest-output",
|
||||
"--num-processes", "2",
|
||||
"--verbose"
|
||||
]
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
reddit(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="reddit-ingest-output",
|
||||
num_processes=2,
|
||||
),
|
||||
subreddit_name="machinelearning",
|
||||
client_id="<client id here>",
|
||||
client_secret="<client secret here>",
|
||||
user_agent=r"Unstructured Ingest Subreddit fetcher by \\u\...",
|
||||
search_query="Unstructured",
|
||||
num_posts=10,
|
||||
)
|
||||
|
||||
Run via the API
|
||||
---------------
|
||||
@ -90,35 +81,28 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"reddit",
|
||||
"--subreddit-name", "machinelearning",
|
||||
"--client-id", "<client id here>",
|
||||
"--client-secret", "<client secret here>",
|
||||
"--user-agent", "Unstructured Ingest Subreddit fetcher by \\u\\...",
|
||||
"--search-query", "Unstructured",
|
||||
"--num-posts", "10",
|
||||
"--output-dir", "reddit-ingest-output",
|
||||
"--num-processes", "2",
|
||||
"--verbose"
|
||||
"--partition-by-api",
|
||||
"--api-key", "<UNSTRUCTURED-API-KEY>",
|
||||
]
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.reddit import reddit
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
reddit(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="reddit-ingest-output",
|
||||
num_processes=2,
|
||||
partition_by_api=True,
|
||||
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
|
||||
),
|
||||
subreddit_name="machinelearning",
|
||||
client_id="<client id here>",
|
||||
client_secret="<client secret here>",
|
||||
user_agent=r"Unstructured Ingest Subreddit fetcher by \\u\...",
|
||||
search_query="Unstructured",
|
||||
num_posts=10,
|
||||
)
|
||||
|
||||
Additionally, you will need to pass the ``--partition-endpoint`` flag if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.
|
||||
|
||||
|
||||
@ -28,28 +28,20 @@ Run Locally
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.s3 import s3
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"s3",
|
||||
"--remote-url", "s3://utic-dev-tech-fixtures/small-pdf-set/",
|
||||
"--anonymous",
|
||||
"--output-dir", "s3-small-batch-output",
|
||||
"--num-processes", "2"
|
||||
]
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
s3(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="s3-small-batch-output",
|
||||
num_processes=2,
|
||||
),
|
||||
remote_url="s3://utic-dev-tech-fixtures/small-pdf-set/",
|
||||
anonymous=True,
|
||||
)
|
||||
|
||||
Run via the API
|
||||
---------------
|
||||
@ -75,30 +67,24 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"s3",
|
||||
"--remote-url", "s3://utic-dev-tech-fixtures/small-pdf-set/",
|
||||
"--anonymous",
|
||||
"--output-dir", "s3-small-batch-output",
|
||||
"--num-processes", "2",
|
||||
"--partition-by-api",
|
||||
"--api-key", "<UNSTRUCTURED-API-KEY>",
|
||||
]
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.s3 import s3
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
s3(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="s3-small-batch-output",
|
||||
num_processes=2,
|
||||
partition_by_api=True,
|
||||
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
|
||||
),
|
||||
remote_url="s3://utic-dev-tech-fixtures/small-pdf-set/",
|
||||
anonymous=True,
|
||||
)
|
||||
|
||||
Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.
|
||||
|
||||
|
||||
@ -32,32 +32,25 @@ Run Locally
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"salesforce",
|
||||
"--username" "$SALESFORCE_USERNAME"
|
||||
"--consumer-key" "$SALESFORCE_CONSUMER_KEY"
|
||||
"--private-key-path" "$SALESFORCE_PRIVATE_KEY_PATH"
|
||||
"--categories" "EmailMessage,Account,Lead,Case,Campaign"
|
||||
"--output-dir" "salesforce-output"
|
||||
"--num-processes", "2"
|
||||
"--recursive",
|
||||
"--verbose",
|
||||
]
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.salesforce import salesforce
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
salesforce(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="salesforce-output",
|
||||
num_processes=2,
|
||||
),
|
||||
username=os.getenv("SALESFORCE_USERNAME"),
|
||||
consumer_key=os.getenv("SALESFORCE_CONSUMER_KEY"),
|
||||
private_key_path=os.getenv("SALESFORCE_PRIVATE_KEY_PATH"),
|
||||
categories=["EmailMessage", "Account", "Lead", "Case", "Campaign"],
|
||||
recursive=True,
|
||||
)
|
||||
|
||||
Run via the API
|
||||
---------------
|
||||
@ -87,34 +80,27 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"salesforce",
|
||||
"--username" "$SALESFORCE_USERNAME"
|
||||
"--consumer-key" "$SALESFORCE_CONSUMER_KEY"
|
||||
"--private-key-path" "$SALESFORCE_PRIVATE_KEY_PATH"
|
||||
"--categories" "EmailMessage,Account,Lead,Case,Campaign"
|
||||
"--output-dir" "salesforce-output"
|
||||
"--num-processes", "2"
|
||||
"--recursive",
|
||||
"--verbose",
|
||||
"--partition-by-api",
|
||||
"--api-key", "<UNSTRUCTURED-API-KEY>",
|
||||
]
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.salesforce import salesforce
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
salesforce(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="salesforce-output",
|
||||
num_processes=2,
|
||||
partition_by_api=True,
|
||||
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
|
||||
),
|
||||
username=os.getenv("SALESFORCE_USERNAME"),
|
||||
consumer_key=os.getenv("SALESFORCE_CONSUMER_KEY"),
|
||||
private_key_path=os.getenv("SALESFORCE_PRIVATE_KEY_PATH"),
|
||||
categories=["EmailMessage", "Account", "Lead", "Case", "Campaign"],
|
||||
recursive=True,
|
||||
)
|
||||
|
||||
Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.
|
||||
|
||||
|
||||
@ -25,37 +25,32 @@ Run Locally
|
||||
--files-only "Flag to process only files within the site(s)" \
|
||||
--output-dir sharepoint-ingest-output \
|
||||
--num-processes 2 \
|
||||
--path "Shared Documents" \
|
||||
--verbose
|
||||
|
||||
.. tab:: Python
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.sharepoint import sharepoint
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"sharepoint",
|
||||
"--client-id", "<Microsoft Sharepoint app client-id>",
|
||||
"--client-cred", "<Microsoft Sharepoint app client-secret>",
|
||||
"--site", "<e.g https://contoso.sharepoint.com or https://contoso.admin.sharepoint.com to process all sites within tenant>",
|
||||
"--files-only", "Flag to process only files within the site(s)",
|
||||
"--output-dir", "sharepoint-ingest-output",
|
||||
"--num-processes", "2",
|
||||
"--verbose",
|
||||
]
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
sharepoint(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="sharepoint-ingest-output",
|
||||
num_processes=2,
|
||||
),
|
||||
client_id="<Microsoft Sharepoint app client-id>",
|
||||
client_cred="<Microsoft Sharepoint app client-secret>",
|
||||
site="<e.g https://contoso.sharepoint.com or https://contoso.admin.sharepoint.com to process all sites within tenant>",
|
||||
# Flag to process only files within the site(s)
|
||||
files_only=True,
|
||||
path="Shared Documents",
|
||||
recursive=False,
|
||||
)
|
||||
|
||||
Run via the API
|
||||
---------------
|
||||
@ -77,6 +72,7 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
|
||||
--output-dir sharepoint-ingest-output \
|
||||
--num-processes 2 \
|
||||
--verbose \
|
||||
--path "Shared Documents" \
|
||||
--partition-by-api \
|
||||
--api-key "<UNSTRUCTURED-API-KEY>"
|
||||
|
||||
@ -84,33 +80,29 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"sharepoint",
|
||||
"--client-id", "<Microsoft Sharepoint app client-id>",
|
||||
"--client-cred", "<Microsoft Sharepoint app client-secret>",
|
||||
"--site", "<e.g https://contoso.sharepoint.com or https://contoso.admin.sharepoint.com to process all sites within tenant>",
|
||||
"--files-only", "Flag to process only files within the site(s)",
|
||||
"--output-dir", "sharepoint-ingest-output",
|
||||
"--num-processes", "2",
|
||||
"--verbose",
|
||||
"--partition-by-api",
|
||||
"--api-key", "<UNSTRUCTURED-API-KEY>",
|
||||
]
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.sharepoint import sharepoint
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
sharepoint(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="sharepoint-ingest-output",
|
||||
num_processes=2,
|
||||
partition_by_api=True,
|
||||
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
|
||||
),
|
||||
client_id="<Microsoft Sharepoint app client-id>",
|
||||
client_cred="<Microsoft Sharepoint app client-secret>",
|
||||
site="<e.g https://contoso.sharepoint.com or https://contoso.admin.sharepoint.com to process all sites within tenant>",
|
||||
# Flag to process only files within the site(s)
|
||||
files_only=True,
|
||||
path="Shared Documents",
|
||||
recursive=False,
|
||||
)
|
||||
|
||||
Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.
|
||||
|
||||
|
||||
@ -30,30 +30,22 @@ Run Locally
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.slack import slack
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"slack",
|
||||
"--channels", "12345678",
|
||||
"--token", "12345678",
|
||||
"--download-dir", "slack-ingest-download",
|
||||
"--output-dir", "slack-ingest-output",
|
||||
"--start-date", "2023-04-01T01:00:00-08:00",
|
||||
"--end-date", "2023-04-02"
|
||||
]
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
slack(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="slack-ingest-output",
|
||||
num_processes=2,
|
||||
),
|
||||
channels=["12345678"],
|
||||
token="12345678",
|
||||
start_date="2023-04-01T01:00:00-08:00",
|
||||
end_date="2023-04-02",
|
||||
)
|
||||
|
||||
Run via the API
|
||||
---------------
|
||||
@ -81,32 +73,26 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"slack",
|
||||
"--channels", "12345678",
|
||||
"--token", "12345678",
|
||||
"--download-dir", "slack-ingest-download",
|
||||
"--output-dir", "slack-ingest-output",
|
||||
"--start-date", "2023-04-01T01:00:00-08:00",
|
||||
"--end-date", "2023-04-02",
|
||||
"--partition-by-api",
|
||||
"--api-key", "<UNSTRUCTURED-API-KEY>",
|
||||
]
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.slack import slack
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
slack(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="slack-ingest-output",
|
||||
num_processes=2,
|
||||
partition_by_api=True,
|
||||
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
|
||||
),
|
||||
channels=["12345678"],
|
||||
token="12345678",
|
||||
start_date="2023-04-01T01:00:00-08:00",
|
||||
end_date="2023-04-02",
|
||||
)
|
||||
|
||||
Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.
|
||||
|
||||
|
||||
@ -28,28 +28,21 @@ Run Locally
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
from unstructured.ingest.runner.wikipedia import wikipedia
|
||||
from unstructured.ingest.interfaces import ReadConfig, PartitionConfig
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"wikipedia",
|
||||
"--page-title", "Open Source Software",
|
||||
"--output-dir", "dropbox-output",
|
||||
"--num-processes", "2",
|
||||
"--verbose",
|
||||
]
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
wikipedia(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="wikipedia-ingest-output",
|
||||
num_processes=2
|
||||
),
|
||||
page_title="Open Source Software",
|
||||
auto_suggest=False,
|
||||
)
|
||||
|
||||
Run via the API
|
||||
---------------
|
||||
@ -75,30 +68,24 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
|
||||
|
||||
.. code:: python
|
||||
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
command = [
|
||||
"unstructured-ingest",
|
||||
"wikipedia",
|
||||
"--page-title", "Open Source Software",
|
||||
"--output-dir", "dropbox-output",
|
||||
"--num-processes", "2",
|
||||
"--verbose",
|
||||
"--partition-by-api",
|
||||
"--api-key", "<UNSTRUCTURED-API-KEY>",
|
||||
]
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
|
||||
from unstructured.ingest.runner.wikipedia import wikipedia
|
||||
|
||||
# Run the command
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
|
||||
# Print output
|
||||
if process.returncode == 0:
|
||||
print('Command executed successfully. Output:')
|
||||
print(output.decode())
|
||||
else:
|
||||
print('Command failed. Error:')
|
||||
print(error.decode())
|
||||
if __name__ == "__main__":
|
||||
wikipedia(
|
||||
verbose=True,
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
output_dir="wikipedia-ingest-output",
|
||||
num_processes=2,
|
||||
partition_by_api=True,
|
||||
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
|
||||
),
|
||||
page_title="Open Source Software",
|
||||
auto_suggest=False,
|
||||
)
|
||||
|
||||
Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.
|
||||
|
||||
|
||||
@ -25,4 +25,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--files-only "Flag to process only files within the site(s)" \
|
||||
--output-dir sharepoint-ingest-output \
|
||||
--num-processes 2 \
|
||||
--path "Shared Documents" \
|
||||
--verbose
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.10.19-dev4" # pragma: no cover
|
||||
__version__ = "0.10.19-dev5" # pragma: no cover
|
||||
|
||||
@ -10,11 +10,11 @@ from unstructured.ingest.runner.writers import writer_map
|
||||
|
||||
|
||||
def airtable(
|
||||
verbose: bool,
|
||||
read_config: ReadConfig,
|
||||
partition_config: PartitionConfig,
|
||||
personal_access_token: str,
|
||||
list_of_paths: t.Optional[str],
|
||||
verbose: bool = False,
|
||||
list_of_paths: t.Optional[str] = None,
|
||||
writer_type: t.Optional[str] = None,
|
||||
writer_kwargs: t.Optional[dict] = None,
|
||||
**kwargs,
|
||||
|
||||
@ -9,14 +9,14 @@ from unstructured.ingest.runner.writers import writer_map
|
||||
|
||||
|
||||
def azure(
|
||||
verbose: bool,
|
||||
read_config: ReadConfig,
|
||||
partition_config: PartitionConfig,
|
||||
account_name: t.Optional[str],
|
||||
account_key: t.Optional[str],
|
||||
connection_string: t.Optional[str],
|
||||
remote_url: str,
|
||||
recursive: bool,
|
||||
verbose: bool = False,
|
||||
recursive: bool = False,
|
||||
writer_type: t.Optional[str] = None,
|
||||
writer_kwargs: t.Optional[dict] = None,
|
||||
**kwargs,
|
||||
|
||||
@ -13,13 +13,13 @@ def biomed(
|
||||
verbose: bool,
|
||||
read_config: ReadConfig,
|
||||
partition_config: PartitionConfig,
|
||||
path: t.Optional[str],
|
||||
api_id: t.Optional[str],
|
||||
api_from: t.Optional[str],
|
||||
api_until: t.Optional[str],
|
||||
max_retries: int,
|
||||
max_request_time: int,
|
||||
decay: float,
|
||||
path: t.Optional[str] = None,
|
||||
api_id: t.Optional[str] = None,
|
||||
api_from: t.Optional[str] = None,
|
||||
api_until: t.Optional[str] = None,
|
||||
writer_type: t.Optional[str] = None,
|
||||
writer_kwargs: t.Optional[dict] = None,
|
||||
**kwargs,
|
||||
|
||||
@ -9,12 +9,12 @@ from unstructured.ingest.runner.writers import writer_map
|
||||
|
||||
|
||||
def box(
|
||||
verbose: bool,
|
||||
read_config: ReadConfig,
|
||||
partition_config: PartitionConfig,
|
||||
remote_url: str,
|
||||
recursive: bool,
|
||||
box_app_config: t.Optional[str],
|
||||
verbose: bool = False,
|
||||
recursive: bool = False,
|
||||
box_app_config: t.Optional[str] = None,
|
||||
writer_type: t.Optional[str] = None,
|
||||
writer_kwargs: t.Optional[dict] = None,
|
||||
**kwargs,
|
||||
|
||||
@ -10,7 +10,6 @@ from unstructured.ingest.runner.writers import writer_map
|
||||
|
||||
|
||||
def confluence(
|
||||
verbose: bool,
|
||||
read_config: ReadConfig,
|
||||
partition_config: PartitionConfig,
|
||||
url: str,
|
||||
@ -18,6 +17,7 @@ def confluence(
|
||||
api_token: str,
|
||||
max_num_of_spaces: int,
|
||||
max_num_of_docs_from_each_space: int,
|
||||
verbose: bool = False,
|
||||
spaces: t.Optional[t.List[str]] = None,
|
||||
writer_type: t.Optional[str] = None,
|
||||
writer_kwargs: t.Optional[dict] = None,
|
||||
|
||||
@ -11,12 +11,12 @@ from unstructured.ingest.runner.writers import writer_map
|
||||
|
||||
|
||||
def delta_table(
|
||||
verbose: bool,
|
||||
read_config: ReadConfig,
|
||||
partition_config: PartitionConfig,
|
||||
table_uri: t.Union[str, Path],
|
||||
version: t.Optional[int] = None,
|
||||
storage_options: t.Optional[str] = None,
|
||||
verbose: bool = False,
|
||||
without_files: bool = False,
|
||||
columns: t.Optional[t.List[str]] = None,
|
||||
writer_type: t.Optional[str] = None,
|
||||
|
||||
@ -10,12 +10,12 @@ from unstructured.ingest.runner.writers import writer_map
|
||||
|
||||
|
||||
def discord(
|
||||
verbose: bool,
|
||||
read_config: ReadConfig,
|
||||
partition_config: PartitionConfig,
|
||||
channels: t.List[str],
|
||||
token: str,
|
||||
period: t.Optional[int],
|
||||
verbose: bool = False,
|
||||
period: t.Optional[int] = None,
|
||||
writer_type: t.Optional[str] = None,
|
||||
writer_kwargs: t.Optional[dict] = None,
|
||||
**kwargs,
|
||||
|
||||
@ -9,12 +9,12 @@ from unstructured.ingest.runner.writers import writer_map
|
||||
|
||||
|
||||
def dropbox(
|
||||
verbose: bool,
|
||||
read_config: ReadConfig,
|
||||
partition_config: PartitionConfig,
|
||||
remote_url: str,
|
||||
recursive: bool,
|
||||
token: t.Optional[str],
|
||||
verbose: bool = False,
|
||||
recursive: bool = False,
|
||||
token: t.Optional[str] = None,
|
||||
writer_type: t.Optional[str] = None,
|
||||
writer_kwargs: t.Optional[dict] = None,
|
||||
**kwargs,
|
||||
|
||||
@ -10,12 +10,12 @@ from unstructured.ingest.runner.writers import writer_map
|
||||
|
||||
|
||||
def elasticsearch(
|
||||
verbose: bool,
|
||||
read_config: ReadConfig,
|
||||
partition_config: PartitionConfig,
|
||||
url: str,
|
||||
index_name: str,
|
||||
jq_query: t.Optional[str],
|
||||
verbose: bool = False,
|
||||
jq_query: t.Optional[str] = None,
|
||||
writer_type: t.Optional[str] = None,
|
||||
writer_kwargs: t.Optional[dict] = None,
|
||||
**kwargs,
|
||||
|
||||
@ -11,11 +11,11 @@ from unstructured.ingest.runner.writers import writer_map
|
||||
|
||||
|
||||
def fsspec(
|
||||
verbose: bool,
|
||||
read_config: ReadConfig,
|
||||
partition_config: PartitionConfig,
|
||||
remote_url: str,
|
||||
recursive: bool,
|
||||
verbose: bool = False,
|
||||
recursive: bool = False,
|
||||
writer_type: t.Optional[str] = None,
|
||||
writer_kwargs: t.Optional[dict] = None,
|
||||
**kwargs,
|
||||
|
||||
@ -9,12 +9,12 @@ from unstructured.ingest.runner.writers import writer_map
|
||||
|
||||
|
||||
def gcs(
|
||||
verbose: bool,
|
||||
read_config: ReadConfig,
|
||||
partition_config: PartitionConfig,
|
||||
remote_url: str,
|
||||
recursive: bool,
|
||||
token: t.Optional[str],
|
||||
verbose: bool = False,
|
||||
recursive: bool = False,
|
||||
token: t.Optional[str] = None,
|
||||
writer_type: t.Optional[str] = None,
|
||||
writer_kwargs: t.Optional[dict] = None,
|
||||
**kwargs,
|
||||
|
||||
@ -10,13 +10,13 @@ from unstructured.ingest.runner.writers import writer_map
|
||||
|
||||
|
||||
def github(
|
||||
verbose: bool,
|
||||
read_config: ReadConfig,
|
||||
partition_config: PartitionConfig,
|
||||
url: str,
|
||||
git_branch: str,
|
||||
git_access_token: t.Optional[str],
|
||||
git_file_glob: t.Optional[str],
|
||||
verbose: bool = False,
|
||||
git_access_token: t.Optional[str] = None,
|
||||
git_file_glob: t.Optional[str] = None,
|
||||
writer_type: t.Optional[str] = None,
|
||||
writer_kwargs: t.Optional[dict] = None,
|
||||
**kwargs,
|
||||
|
||||
@ -10,13 +10,13 @@ from unstructured.ingest.runner.writers import writer_map
|
||||
|
||||
|
||||
def gitlab(
|
||||
verbose: bool,
|
||||
read_config: ReadConfig,
|
||||
partition_config: PartitionConfig,
|
||||
url: str,
|
||||
git_branch: str,
|
||||
git_access_token: t.Optional[str],
|
||||
git_file_glob: t.Optional[str],
|
||||
verbose: bool = False,
|
||||
git_access_token: t.Optional[str] = None,
|
||||
git_file_glob: t.Optional[str] = None,
|
||||
writer_type: t.Optional[str] = None,
|
||||
writer_kwargs: t.Optional[dict] = None,
|
||||
**kwargs,
|
||||
|
||||
@ -10,13 +10,13 @@ from unstructured.ingest.runner.writers import writer_map
|
||||
|
||||
|
||||
def gdrive(
|
||||
verbose: bool,
|
||||
read_config: ReadConfig,
|
||||
partition_config: PartitionConfig,
|
||||
service_account_key: str,
|
||||
recursive: bool,
|
||||
drive_id: str,
|
||||
extension: t.Optional[str],
|
||||
verbose: bool = False,
|
||||
recursive: bool = False,
|
||||
extension: t.Optional[str] = None,
|
||||
writer_type: t.Optional[str] = None,
|
||||
writer_kwargs: t.Optional[dict] = None,
|
||||
**kwargs,
|
||||
|
||||
@ -10,15 +10,15 @@ from unstructured.ingest.runner.writers import writer_map
|
||||
|
||||
|
||||
def jira(
|
||||
verbose: bool,
|
||||
read_config: ReadConfig,
|
||||
partition_config: PartitionConfig,
|
||||
url: str,
|
||||
user_email: str,
|
||||
api_token: str,
|
||||
projects: t.Optional[t.List[str]],
|
||||
boards: t.Optional[t.List[str]],
|
||||
issues: t.Optional[t.List[str]],
|
||||
verbose: bool = False,
|
||||
projects: t.Optional[t.List[str]] = None,
|
||||
boards: t.Optional[t.List[str]] = None,
|
||||
issues: t.Optional[t.List[str]] = None,
|
||||
writer_type: t.Optional[str] = None,
|
||||
writer_kwargs: t.Optional[dict] = None,
|
||||
**kwargs,
|
||||
|
||||
@ -8,12 +8,12 @@ from unstructured.ingest.runner.writers import writer_map
|
||||
|
||||
|
||||
def local(
|
||||
verbose: bool,
|
||||
read_config: ReadConfig,
|
||||
partition_config: PartitionConfig,
|
||||
input_path: str,
|
||||
recursive: bool,
|
||||
file_glob: t.Optional[str],
|
||||
verbose: bool = False,
|
||||
recursive: bool = False,
|
||||
file_glob: t.Optional[str] = None,
|
||||
writer_type: t.Optional[str] = None,
|
||||
writer_kwargs: t.Optional[dict] = None,
|
||||
**kwargs,
|
||||
|
||||
@ -11,11 +11,11 @@ from unstructured.ingest.runner.writers import writer_map
|
||||
|
||||
|
||||
def notion(
|
||||
verbose: bool,
|
||||
read_config: ReadConfig,
|
||||
partition_config: PartitionConfig,
|
||||
api_key: str,
|
||||
recursive: bool,
|
||||
verbose: bool = False,
|
||||
recursive: bool = False,
|
||||
page_ids: t.Optional[t.List[str]] = None,
|
||||
database_ids: t.Optional[t.List[str]] = None,
|
||||
writer_type: t.Optional[str] = None,
|
||||
|
||||
@ -10,16 +10,16 @@ from unstructured.ingest.runner.writers import writer_map
|
||||
|
||||
|
||||
def onedrive(
|
||||
verbose: bool,
|
||||
read_config: ReadConfig,
|
||||
partition_config: PartitionConfig,
|
||||
tenant: str,
|
||||
user_pname: str,
|
||||
client_id: str,
|
||||
client_cred: str,
|
||||
authority_url: t.Optional[str],
|
||||
path: t.Optional[str],
|
||||
recursive: bool,
|
||||
verbose: bool = False,
|
||||
authority_url: t.Optional[str] = None,
|
||||
path: t.Optional[str] = None,
|
||||
recursive: bool = False,
|
||||
writer_type: t.Optional[str] = None,
|
||||
writer_kwargs: t.Optional[dict] = None,
|
||||
**kwargs,
|
||||
|
||||
@ -10,15 +10,15 @@ from unstructured.ingest.runner.writers import writer_map
|
||||
|
||||
|
||||
def outlook(
|
||||
verbose: bool,
|
||||
read_config: ReadConfig,
|
||||
partition_config: PartitionConfig,
|
||||
user_email: str,
|
||||
client_id: t.Optional[str],
|
||||
client_cred: t.Optional[str],
|
||||
tenant: t.Optional[str],
|
||||
authority_url: t.Optional[str],
|
||||
recursive: bool,
|
||||
verbose: bool = False,
|
||||
recursive: bool = False,
|
||||
client_id: t.Optional[str] = None,
|
||||
client_cred: t.Optional[str] = None,
|
||||
tenant: t.Optional[str] = None,
|
||||
authority_url: t.Optional[str] = None,
|
||||
outlook_folders: t.Optional[t.List[str]] = None,
|
||||
writer_type: t.Optional[str] = None,
|
||||
writer_kwargs: t.Optional[dict] = None,
|
||||
|
||||
@ -10,15 +10,15 @@ from unstructured.ingest.runner.writers import writer_map
|
||||
|
||||
|
||||
def reddit(
|
||||
verbose: bool,
|
||||
read_config: ReadConfig,
|
||||
partition_config: PartitionConfig,
|
||||
subreddit_name: str,
|
||||
client_id: t.Optional[str],
|
||||
client_secret: t.Optional[str],
|
||||
user_agent: str,
|
||||
search_query: t.Optional[str],
|
||||
num_posts: int,
|
||||
verbose: bool = False,
|
||||
client_id: t.Optional[str] = None,
|
||||
client_secret: t.Optional[str] = None,
|
||||
search_query: t.Optional[str] = None,
|
||||
writer_type: t.Optional[str] = None,
|
||||
writer_kwargs: t.Optional[dict] = None,
|
||||
**kwargs,
|
||||
|
||||
@ -9,12 +9,12 @@ from unstructured.ingest.runner.writers import writer_map
|
||||
|
||||
|
||||
def s3(
|
||||
verbose: bool,
|
||||
read_config: ReadConfig,
|
||||
partition_config: PartitionConfig,
|
||||
remote_url: str,
|
||||
recursive: bool,
|
||||
anonymous: bool,
|
||||
verbose: bool = False,
|
||||
recursive: bool = False,
|
||||
anonymous: bool = False,
|
||||
writer_type: t.Optional[str] = None,
|
||||
writer_kwargs: t.Optional[dict] = None,
|
||||
**kwargs,
|
||||
|
||||
@ -10,14 +10,14 @@ from unstructured.ingest.runner.writers import writer_map
|
||||
|
||||
|
||||
def salesforce(
|
||||
verbose: bool,
|
||||
read_config: ReadConfig,
|
||||
partition_config: PartitionConfig,
|
||||
recursive: bool,
|
||||
username: str,
|
||||
consumer_key: str,
|
||||
private_key_path: str,
|
||||
categories: t.List[str],
|
||||
verbose: bool = False,
|
||||
recursive: bool = False,
|
||||
writer_type: t.Optional[str] = None,
|
||||
writer_kwargs: t.Optional[dict] = None,
|
||||
**kwargs,
|
||||
|
||||
@ -14,9 +14,9 @@ class SharePoint(Runner):
|
||||
site: str,
|
||||
client_id: str,
|
||||
client_cred: str,
|
||||
files_only: bool,
|
||||
path: str,
|
||||
recursive: bool,
|
||||
files_only: bool = False,
|
||||
recursive: bool = False,
|
||||
**kwargs,
|
||||
):
|
||||
writer_kwargs = self.writer_kwargs if self.writer_kwargs else {}
|
||||
|
||||
@ -10,13 +10,13 @@ from unstructured.ingest.runner.writers import writer_map
|
||||
|
||||
|
||||
def slack(
|
||||
verbose: bool,
|
||||
read_config: ReadConfig,
|
||||
partition_config: PartitionConfig,
|
||||
channels: t.List[str],
|
||||
token: str,
|
||||
start_date: t.Optional[str],
|
||||
end_date: t.Optional[str],
|
||||
verbose: bool = False,
|
||||
start_date: t.Optional[str] = None,
|
||||
end_date: t.Optional[str] = None,
|
||||
writer_type: t.Optional[str] = None,
|
||||
writer_kwargs: t.Optional[dict] = None,
|
||||
**kwargs,
|
||||
|
||||
@ -10,11 +10,11 @@ from unstructured.ingest.runner.writers import writer_map
|
||||
|
||||
|
||||
def wikipedia(
|
||||
verbose: bool,
|
||||
read_config: ReadConfig,
|
||||
partition_config: PartitionConfig,
|
||||
page_title: str,
|
||||
auto_suggest: bool,
|
||||
verbose: bool = False,
|
||||
auto_suggest: bool = False,
|
||||
writer_type: t.Optional[str] = None,
|
||||
writer_kwargs: t.Optional[dict] = None,
|
||||
**kwargs,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user