update ingest python doc (#1446)

### Description
Updates the Python examples in the docs to show how to run the same code
that the CLI runs, but from Python. Rather than copying the command that
would be run in the terminal and invoking it through the subprocess
library, the examples now call the supported code exposed in the ingest
directory (`unstructured.ingest`).

For now only the wikipedia one has been updated to get some opinions on
this before updating all other connector docs.

Would close out
https://github.com/Unstructured-IO/unstructured/issues/1445
This commit is contained in:
Roman Isecke 2023-10-03 10:01:41 -04:00 committed by GitHub
parent 89bd2faaf7
commit 9d81971fcb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
52 changed files with 859 additions and 1195 deletions

View File

@ -1,8 +1,15 @@
## 0.10.19-dev4
## 0.10.19-dev5
### Enhancements
* **bump `unstructured-inference` to `0.6.6`** The updated version of `unstructured-inference` makes table extraction in `hi_res` mode configurable to fine tune table extraction performance; it also improves element detection by adding a deduplication post processing step in the `hi_res` partitioning of pdfs and images.
* **Update Python-based docs** Refactor docs to use the actual `unstructured` code rather than using the subprocess library to run the CLI command itself.
## 0.10.17-dev3
### Enhancements
* **Adds data source properties to SharePoint, Outlook, OneDrive, Reddit, and Slack connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc.
### Features

View File

@ -29,29 +29,21 @@ Run Locally
.. code:: python
import subprocess
import os
command = [
"unstructured-ingest",
"airtable",
"--metadata-exclude", "filename,file_directory,metadata.data_source.date_processed",
"--personal-access-token", "$AIRTABLE_PERSONAL_ACCESS_TOKEN",
"--output-dir", "airtable-ingest-output"
"--num-processes", "2",
"--reprocess",
]
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.airtable import airtable
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
airtable(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="airtable-ingest-output",
num_processes=2,
),
personal_access_token=os.getenv("AIRTABLE_PERSONAL_ACCESS_TOKEN"),
)
Run via the API
---------------
@ -78,31 +70,23 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
.. code:: python
import subprocess
import os
command = [
"unstructured-ingest",
"airtable",
"--metadata-exclude", "filename,file_directory,metadata.data_source.date_processed",
"--personal-access-token", "$AIRTABLE_PERSONAL_ACCESS_TOKEN",
"--output-dir", "airtable-ingest-output"
"--num-processes", "2",
"--reprocess",
"--partition-by-api",
"--api-key", "<UNSTRUCTURED-API-KEY>",
]
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.airtable import airtable
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
airtable(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="airtable-ingest-output",
num_processes=2,
partition_by_api=True,
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
),
personal_access_token=os.getenv("AIRTABLE_PERSONAL_ACCESS_TOKEN"),
)
Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.

View File

@ -28,28 +28,20 @@ Run Locally
.. code:: python
import subprocess
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.azure import azure
command = [
"unstructured-ingest",
"azure",
"--remote-url", "abfs://container1/",
"--account-name", "azureunstructured1"
"--output-dir", "/Output/Path/To/Files",
"--num-processes", "2",
]
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
azure(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="azure-ingest-output",
num_processes=2,
),
remote_url="abfs://container1/",
account_name="azureunstructured1",
)
Run via the API
---------------
@ -62,43 +54,24 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
.. code:: shell
unstructured-ingest \
azure \
--remote-url abfs://container1/ \
--account-name azureunstructured1 \
--output-dir azure-ingest-output \
--num-processes 2 \
--partition-by-api \
--api-key "<UNSTRUCTURED-API-KEY>"
import os
.. tab:: Python
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.azure import azure
.. code:: python
import subprocess
command = [
"unstructured-ingest",
"azure",
"--remote-url", "abfs://container1/",
"--account-name", "azureunstructured1"
"--output-dir", "/Output/Path/To/Files",
"--num-processes", "2",
"--partition-by-api",
"--api-key", "<UNSTRUCTURED-API-KEY>",
]
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
azure(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="azure-ingest-output",
num_processes=2,
partition_by_api=True,
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
),
remote_url="abfs://container1/",
account_name="azureunstructured1",
)
Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.

View File

@ -29,29 +29,21 @@ Run Locally
.. code:: python
import subprocess
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.biomed import biomed
command = [
"unstructured-ingest",
"biomed",
"--path", "oa_pdf/07/07/sbaa031.073.PMC7234218.pdf",
"--output-dir", "/Output/Path/To/Files",
"--num-processes", "2",
"--verbose",
"--preserve-downloads",
]
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
biomed(
verbose=True,
read_config=ReadConfig(
preserve_downloads=True,
),
partition_config=PartitionConfig(
output_dir="biomed-ingest-output-path",
num_processes=2,
),
path="oa_pdf/07/07/sbaa031.073.PMC7234218.pdf",
)
Run via the API
---------------
@ -78,31 +70,25 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
.. code:: python
import subprocess
import os
command = [
"unstructured-ingest",
"biomed",
"--path", "oa_pdf/07/07/sbaa031.073.PMC7234218.pdf",
"--output-dir", "/Output/Path/To/Files",
"--num-processes", "2",
"--verbose",
"--preserve-downloads",
"--partition-by-api",
"--api-key", "<UNSTRUCTURED-API-KEY>",
]
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.biomed import biomed
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
biomed(
verbose=True,
read_config=ReadConfig(
preserve_downloads=True,
),
partition_config=PartitionConfig(
output_dir="biomed-ingest-output-path",
num_processes=2,
partition_by_api=True,
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
),
path="oa_pdf/07/07/sbaa031.073.PMC7234218.pdf",
)
Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.

View File

@ -30,30 +30,23 @@ Run Locally
.. code:: python
import subprocess
import os
command = [
"unstructured-ingest",
"box",
"--box_app_config", "$BOX_APP_CONFIG_PATH"
"--remote-url", "box://utic-test-ingest-fixtures"
"--output-dir", "box-output"
"--num-processes", "2"
"--recursive",
"--verbose",
]
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.box import box
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
box(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="box-output",
num_processes=2,
),
box_app_config=os.getenv("BOX_APP_CONFIG_PATH"),
recursive=True,
remote_url="box://utic-test-ingest-fixtures",
)
Run via the API
---------------
@ -81,32 +74,25 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
.. code:: python
import subprocess
import os
command = [
"unstructured-ingest",
"box",
"--box_app_config", "$BOX_APP_CONFIG_PATH"
"--remote-url", "box://utic-test-ingest-fixtures"
"--output-dir", "box-output"
"--num-processes", "2"
"--recursive",
"--verbose",
"--partition-by-api",
"--api-key", "<UNSTRUCTURED-API-KEY>",
]
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.box import box
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
box(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="box-output",
num_processes=2,
partition_by_api=True,
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
),
box_app_config=os.getenv("BOX_APP_CONFIG_PATH"),
recursive=True,
remote_url="box://utic-test-ingest-fixtures",
)
Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.

View File

@ -30,30 +30,22 @@ Run Locally
.. code:: python
import subprocess
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.confluence import confluence
command = [
"unstructured-ingest",
"confluence",
"--metadata-exclude", "filename,file_directory,metadata.data_source.date_processed",
"--url", "https://unstructured-ingest-test.atlassian.net",
"--user-email", "12345678@unstructured.io",
"--api-token", "ABCDE1234ABDE1234ABCDE1234",
"--output-dir", "confluence-ingest-output",
"--num-processes", "2",
]
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
confluence(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="confluence-ingest-output",
num_processes=2,
metadata_exclude=["filename", "file_directory", "metadata.data_source.date_processed"],
),
url="https://unstructured-ingest-test.atlassian.net",
user_email="12345678@unstructured.io",
api_token="ABCDE1234ABDE1234ABCDE1234",
)
Run via the API
---------------
@ -81,32 +73,26 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
.. code:: python
import subprocess
import os
command = [
"unstructured-ingest",
"confluence",
"--metadata-exclude", "filename,file_directory,metadata.data_source.date_processed",
"--url", "https://unstructured-ingest-test.atlassian.net",
"--user-email", "12345678@unstructured.io",
"--api-token", "ABCDE1234ABDE1234ABCDE1234",
"--output-dir", "confluence-ingest-output",
"--num-processes", "2",
"--partition-by-api",
"--api-key", "<UNSTRUCTURED-API-KEY>",
]
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.confluence import confluence
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
confluence(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="confluence-ingest-output",
num_processes=2,
metadata_exclude=["filename", "file_directory", "metadata.data_source.date_processed"],
partition_by_api=True,
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
),
url="https://unstructured-ingest-test.atlassian.net",
user_email="12345678@unstructured.io",
api_token="ABCDE1234ABDE1234ABCDE1234",
)
Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.

View File

@ -29,30 +29,20 @@ Run Locally
.. code:: python
import subprocess
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.delta_table import delta_table
command = [
"unstructured-ingest",
"delta-table",
"--table-uri", "s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/",
"--download-dir", "delta-table-ingest-download",
"--output-dir", "delta-table-example",
"--preserve-downloads",
"--storage_options", "AWS_REGION=us-east-2,AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY",
"--verbose",
]
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
delta_table(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="delta-table-example",
num_processes=2,
),
table_uri="s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/",
storage_options="AWS_REGION=us-east-2,AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY"
)
Run via the API
@ -79,32 +69,24 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
.. code:: python
import subprocess
import os
command = [
"unstructured-ingest",
"delta-table",
"--table-uri", "s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/",
"--download-dir", "delta-table-ingest-download",
"--output-dir", "delta-table-example",
"--preserve-downloads",
"--storage_options", "AWS_REGION=us-east-2,AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY",
"--verbose",
"--partition-by-api",
"--api-key", "<UNSTRUCTURED-API-KEY>",
]
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.delta_table import delta_table
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
delta_table(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="delta-table-example",
num_processes=2,
partition_by_api=True,
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
),
table_uri="s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/",
storage_options="AWS_REGION=us-east-2,AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY"
)
Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.

View File

@ -30,30 +30,26 @@ Run Locally
.. code:: python
import subprocess
import os
command = [
"unstructured-ingest",
"discord",
"--channels", "12345678",
"--token", "$DISCORD_TOKEN",
"--download-dir", "discord-ingest-download",
"--output-dir", "discord-example",
"--preserve-downloads",
"--verbose",
]
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.discord import discord
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
discord(
verbose=True,
read_config=ReadConfig(
download_dir="discord-ingest-download",
preserve_downloads=True,
),
partition_config=PartitionConfig(
output_dir="discord-example",
num_processes=2,
),
channels=["12345678"],
token=os.getenv("DISCORD_TOKEN"),
period=None,
)
Run via the API
---------------
@ -81,32 +77,28 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
.. code:: python
import subprocess
import os
command = [
"unstructured-ingest",
"discord",
"--channels", "12345678",
"--token", "$DISCORD_TOKEN",
"--download-dir", "discord-ingest-download",
"--output-dir", "discord-example",
"--preserve-downloads",
"--verbose",
"--partition-by-api",
"--api-key", "<UNSTRUCTURED-API-KEY>",
]
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.discord import discord
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
discord(
verbose=True,
read_config=ReadConfig(
download_dir="discord-ingest-download",
preserve_downloads=True,
),
partition_config=PartitionConfig(
output_dir="discord-example",
num_processes=2,
partition_by_api=True,
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
),
channels=["12345678"],
token=os.getenv("DISCORD_TOKEN"),
period=None,
)
Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.

View File

@ -30,30 +30,23 @@ Run Locally
.. code:: python
import subprocess
import os
command = [
"unstructured-ingest",
"dropbox",
"--remote-url", "dropbox:// /",
"--output-dir", "dropbox-output",
"--token", "$DROPBOX_TOKEN",
"--num-processes", "2",
"--recursive",
"--verbose",
]
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.dropbox import dropbox
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
dropbox(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="dropbox-output",
num_processes=2,
),
remote_url="dropbox:// /",
token=os.getenv("DROPBOX_TOKEN"),
recursive=True,
)
Run via the API
---------------
@ -81,32 +74,25 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
.. code:: python
import subprocess
import os
command = [
"unstructured-ingest",
"dropbox",
"--remote-url", "dropbox:// /",
"--output-dir", "dropbox-output",
"--token", "$DROPBOX_TOKEN",
"--num-processes", "2",
"--recursive",
"--verbose",
"--partition-by-api",
"--api-key", "<UNSTRUCTURED-API-KEY>",
]
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.dropbox import dropbox
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
dropbox(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="dropbox-output",
num_processes=2,
partition_by_api=True,
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
),
remote_url="dropbox:// /",
token=os.getenv("DROPBOX_TOKEN"),
recursive=True,
)
Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.

View File

@ -30,30 +30,22 @@ Run Locally
.. code:: python
import subprocess
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.elasticsearch import elasticsearch
command = [
"unstructured-ingest",
"elasticsearch",
"--metadata-exclude", "filename,file_directory,metadata.data_source.date_processed",
"--url", "http://localhost:9200",
"--index-name", "movies",
"--jq-query", "{ethnicity, director, plot}",
"--output-dir", "elasticsearch-ingest-output",
"--num-processes", "2"
]
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
elasticsearch(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="elasticsearch-ingest-output",
num_processes=2,
metadata_exclude=["filename", "file_directory", "metadata.data_source.date_processed"],
),
url="http://localhost:9200",
index_name="movies",
jq_query="{ethnicity, director, plot}",
)
Run via the API
---------------
@ -81,32 +73,26 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
.. code:: python
import subprocess
import os
command = [
"unstructured-ingest",
"elasticsearch",
"--metadata-exclude", "filename,file_directory,metadata.data_source.date_processed",
"--url", "http://localhost:9200",
"--index-name", "movies",
"--jq-query", "{ethnicity, director, plot}",
"--output-dir", "elasticsearch-ingest-output",
"--num-processes", "2",
"--partition-by-api",
"--api-key", "<UNSTRUCTURED-API-KEY>",
]
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.elasticsearch import elasticsearch
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
elasticsearch(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="elasticsearch-ingest-output",
num_processes=2,
metadata_exclude=["filename", "file_directory", "metadata.data_source.date_processed"],
partition_by_api=True,
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
),
url="http://localhost:9200",
index_name="movies",
jq_query="{ethnicity, director, plot}",
)
Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.

View File

@ -29,29 +29,20 @@ Run Locally
.. code:: python
import subprocess
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.github import github
command = [
"unstructured-ingest",
"github",
"--url", "Unstructured-IO/unstructured",
"--git-branch", "main",
"--output-dir", "github-ingest-output",
"--num-processes", "2",
"--verbose",
]
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
github(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="github-ingest-output",
num_processes=2,
),
url="Unstructured-IO/unstructured",
git_branch="main",
)
Run via the API
---------------
@ -78,31 +69,24 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
.. code:: python
import subprocess
import os
command = [
"unstructured-ingest",
"github",
"--url", "Unstructured-IO/unstructured",
"--git-branch", "main",
"--output-dir", "github-ingest-output",
"--num-processes", "2",
"--verbose",
"--partition-by-api",
"--api-key", "<UNSTRUCTURED-API-KEY>",
]
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.github import github
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
github(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="github-ingest-output",
num_processes=2,
partition_by_api=True,
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
),
url="Unstructured-IO/unstructured",
git_branch="main",
)
Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.

View File

@ -29,29 +29,20 @@ Run Locally
.. code:: python
import subprocess
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.gitlab import gitlab
command = [
"unstructured-ingest",
"gitlab",
"--url", "Unstructured-IO/unstructured",
"--git-branch", "v0.0.7",
"--output-dir", "gitlab-ingest-output",
"--num-processes", "2",
"--verbose",
]
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
gitlab(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="gitlab-ingest-output",
num_processes=2,
),
url="https://gitlab.com/gitlab-com/content-sites/docsy-gitlab",
git_branch="v0.0.7",
)
Run via the API
---------------
@ -78,31 +69,24 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
.. code:: python
import subprocess
import os
command = [
"unstructured-ingest",
"gitlab",
"--url", "Unstructured-IO/unstructured",
"--git-branch", "v0.0.7",
"--output-dir", "gitlab-ingest-output",
"--num-processes", "2",
"--verbose",
"--partition-by-api",
"--api-key", "<UNSTRUCTURED-API-KEY>",
]
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.gitlab import gitlab
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
gitlab(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="gitlab-ingest-output",
num_processes=2,
partition_by_api=True,
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
),
url="https://gitlab.com/gitlab-com/content-sites/docsy-gitlab",
git_branch="v0.0.7",
)
Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.

View File

@ -29,29 +29,20 @@ Run Locally
.. code:: python
import subprocess
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.gcs import gcs
command = [
"unstructured-ingest",
"gcs",
"--remote-url", "gs://utic-test-ingest-fixtures-public/",
"--output-dir", "dropbox-output",
"--num-processes", "2",
"--recursive",
"--verbose",
]
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
gcs(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="gcs-output",
num_processes=2,
),
remote_url="gs://utic-test-ingest-fixtures-public/",
recursive=True,
)
Run via the API
---------------
@ -76,29 +67,24 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
.. code:: python
import subprocess
import os
command = [
"unstructured-ingest",
"gcs",
"--remote-url", "gs://utic-test-ingest-fixtures-public/",
"--output-dir", "dropbox-output",
"--num-processes", "2",
"--recursive",
"--verbose",
]
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.gcs import gcs
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
gcs(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="gcs-output",
num_processes=2,
partition_by_api=True,
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
),
remote_url="gs://utic-test-ingest-fixtures-public/",
recursive=True,
)
Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.

View File

@ -30,28 +30,21 @@ Run Locally
.. code:: python
import subprocess
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.google_drive import gdrive
command = [
"unstructured-ingest",
"google-drive",
"--drive-id", "<file or folder id>",
"--service-account-key",, "Path/To/Your/Service/Account/Key"
"--output-dir", "/Output/Path/To/Files",
"--num-processes", "2",
]
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
gdrive(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="google-drive-ingest-output",
num_processes=2,
),
drive_id="POPULATE WITH FILE OR FOLDER ID",
service_account_key="POPULATE WITH DRIVE SERVICE ACCOUNT KEY",
recursive=True,
)
Run via the API
---------------
@ -79,30 +72,25 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
.. code:: python
import subprocess
import os
command = [
"unstructured-ingest",
"google-drive",
"--drive-id", "<file or folder id>",
"--service-account-key",, "Path/To/Your/Service/Account/Key"
"--output-dir", "/Output/Path/To/Files",
"--num-processes", "2",
"--partition-by-api",
"--api-key", "<UNSTRUCTURED-API-KEY>",
]
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.google_drive import gdrive
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
gdrive(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="google-drive-ingest-output",
num_processes=2,
partition_by_api=True,
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
),
drive_id="POPULATE WITH FILE OR FOLDER ID",
service_account_key="POPULATE WITH DRIVE SERVICE ACCOUNT KEY",
recursive=True,
)
Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.

View File

@ -31,30 +31,22 @@ Run Locally
.. code:: python
import subprocess
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.jira import jira
command = [
"unstructured-ingest",
"jira",
"--metadata-exclude", "filename,file_directory,metadata.data_source.date_processed",
"--url", "https://unstructured-jira-connector-test.atlassian.net",
"--user-email", "12345678@unstructured.io",
"--api-token", "ABCDE1234ABDE1234ABCDE1234",
"--output-dir", "jira-ingest-output",
"--num-processes", "2",
]
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
jira(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="jira-ingest-output",
num_processes=2,
metadata_exclude=["filename", "file_directory", "metadata.data_source.date_processed"],
),
url="https://unstructured-jira-connector-test.atlassian.net",
user_email="12345678@unstructured.io",
api_token="ABCDE1234ABDE1234ABCDE1234",
)
Run via the API
---------------
@ -82,32 +74,26 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
.. code:: python
import subprocess
import os
command = [
"unstructured-ingest",
"jira",
"--metadata-exclude", "filename,file_directory,metadata.data_source.date_processed",
"--url", "https://unstructured-jira-connector-test.atlassian.net",
"--user-email", "12345678@unstructured.io",
"--api-token", "ABCDE1234ABDE1234ABCDE1234",
"--output-dir", "jira-ingest-output",
"--num-processes", "2",
"--partition-by-api",
"--api-key", "<UNSTRUCTURED-API-KEY>",
]
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.jira import jira
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
jira(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="jira-ingest-output",
num_processes=2,
metadata_exclude=["filename", "file_directory", "metadata.data_source.date_processed"],
partition_by_api=True,
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
),
url="https://unstructured-jira-connector-test.atlassian.net",
user_email="12345678@unstructured.io",
api_token="ABCDE1234ABDE1234ABCDE1234",
)
Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.

View File

@ -23,29 +23,20 @@ Run Locally
.. code:: python
import subprocess
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.local import local
command = [
"unstructured-ingest",
"local",
"--input-path", "example-docs",
"--output-dir", "dropbox-output",
"--num-processes", "2",
"--recursive",
"--verbose",
]
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
local(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="local-ingest-output",
num_processes=2,
),
input_path="example-docs",
recursive=True,
)
Run via the API
---------------
@ -72,31 +63,24 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
.. code:: python
import subprocess
import os
command = [
"unstructured-ingest",
"local",
"--input-path", "example-docs",
"--output-dir", "dropbox-output",
"--num-processes", "2",
"--recursive",
"--verbose",
"--partition-by-api",
"--api-key", "<UNSTRUCTURED-API-KEY>",
]
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.local import local
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
local(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="local-ingest-output",
num_processes=2,
partition_by_api=True,
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
),
input_path="example-docs",
recursive=True,
)
Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.

View File

@ -30,30 +30,22 @@ Run Locally
.. code:: python
import subprocess
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.notion import notion
command = [
"unstructured-ingest",
"notion",
"--api-key", "<Notion api key>",
"--output-dir", "notion-ingest-output",
"--page-ids", "<Comma delimited list of page ids to process>",
"--database-ids", ""<Comma delimited list of database ids to process>"",
"--num-processes", "2",
"--verbose",
]
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
notion(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="notion-ingest-output",
num_processes=2,
),
api_key="POPULATE API KEY",
page_ids=["LIST", "OF", "PAGE", "IDS"],
database_ids=["LIST", "OF", "DATABASE", "IDS"],
recursive=False,
)
Run via the API
---------------
@ -81,32 +73,26 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
.. code:: python
import subprocess
import os
command = [
"unstructured-ingest",
"notion",
"--api-key", "<Notion api key>",
"--output-dir", "notion-ingest-output",
"--page-ids", "<Comma delimited list of page ids to process>",
"--database-ids", ""<Comma delimited list of database ids to process>"",
"--num-processes", "2",
"--verbose",
"--partition-by-api",
"--api-key", "<UNSTRUCTURED-API-KEY>",
]
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.notion import notion
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
notion(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="notion-ingest-output",
num_processes=2,
partition_by_api=True,
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
),
api_key="POPULATE API KEY",
page_ids=["LIST", "OF", "PAGE", "IDS"],
database_ids=["LIST", "OF", "DATABASE", "IDS"],
recursive=False,
)
Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.

View File

@ -33,33 +33,25 @@ Run Locally
.. code:: python
import subprocess
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.onedrive import onedrive
command = [
"unstructured-ingest",
"onedrive",
"--client-id", "<Azure AD app client-id>",
"--client-cred", "<Azure AD app client-secret>",
"--authority-url", "<Authority URL, default is https://login.microsoftonline.com>",
"--tenant", "<Azure AD tenant_id, default is 'common'>",
"--user-pname", "<Azure AD principal name, in most cases is the email linked to the drive>",
"--path", "<Path to start parsing files from>",
"--output-dir", "onedrive-ingest-output",
"--num-processes", "2",
"--verbose"
]
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
onedrive(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="onedrive-ingest-output",
num_processes=2,
),
client_id="<Azure AD app client-id>",
client_cred="<Azure AD app client-secret>",
authority_url="<Authority URL, default is https://login.microsoftonline.com>",
tenant="<Azure AD tenant_id, default is 'common'>",
user_pname="<Azure AD principal name, in most cases is the email linked to the drive>",
path="<Path to start parsing files from>",
recursive=False,
)
Run via the API
---------------
@ -90,35 +82,29 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
.. code:: python
import subprocess
import os
command = [
"unstructured-ingest",
"onedrive",
"--client-id", "<Azure AD app client-id>",
"--client-cred", "<Azure AD app client-secret>",
"--authority-url", "<Authority URL, default is https://login.microsoftonline.com>",
"--tenant", "<Azure AD tenant_id, default is 'common'>",
"--user-pname", "<Azure AD principal name, in most cases is the email linked to the drive>",
"--path", "<Path to start parsing files from>",
"--output-dir", "onedrive-ingest-output",
"--num-processes", "2",
"--verbose",
"--partition-by-api",
"--api-key", "<UNSTRUCTURED-API-KEY>",
]
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.onedrive import onedrive
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
onedrive(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="onedrive-ingest-output",
num_processes=2,
partition_by_api=True,
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
),
client_id="<Azure AD app client-id>",
client_cred="<Azure AD app client-secret>",
authority_url="<Authority URL, default is https://login.microsoftonline.com>",
tenant="<Azure AD tenant_id, default is 'common'>",
user_pname="<Azure AD principal name, in most cases is the email linked to the drive>",
path="<Path to start parsing files from>",
recursive=False,
)
Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.

View File

@ -33,33 +33,26 @@ Run Locally
.. code:: python
import subprocess
import os
command = [
"unstructured-ingest",
"outlook",
"--client-id", "$MS_CLIENT_ID",
"--client-cred", "$MS_CLIENT_CRED",
"--tenant", "<Azure AD tenant_id, default is 'common'>",
"--user-email", "$MS_USER_EMAIL",
"--outlook-folders", "Inbox,Sent Items",
"--output-dir", "onedrive-ingest-output",
"--num-processes", "2",
"--recursive",
"--verbose",
]
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.outlook import outlook
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
outlook(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="outlook-output",
num_processes=2,
),
client_id=os.getenv("MS_CLIENT_ID"),
client_cred=os.getenv("MS_CLIENT_CRED"),
tenant=os.getenv("MS_TENANT_ID"),
user_email=os.getenv("MS_USER_EMAIL"),
outlook_folders=["Inbox", "Sent Items"],
recursive=True,
)
Run via the API
---------------
@ -86,31 +79,28 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
.. code:: python
import subprocess
import os
command = [
"unstructured-ingest",
"airtable",
"--metadata-exclude", "filename,file_directory,metadata.data_source.date_processed",
"--personal-access-token", "$AIRTABLE_PERSONAL_ACCESS_TOKEN",
"--output-dir", "airtable-ingest-output"
"--num-processes", "2",
"--reprocess",
"--partition-by-api",
"--api-key", "<UNSTRUCTURED-API-KEY>",
]
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.outlook import outlook
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
outlook(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="outlook-output",
num_processes=2,
partition_by_api=True,
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
),
client_id=os.getenv("MS_CLIENT_ID"),
client_cred=os.getenv("MS_CLIENT_CRED"),
tenant=os.getenv("MS_TENANT_ID"),
user_email=os.getenv("MS_USER_EMAIL"),
outlook_folders=["Inbox", "Sent Items"],
recursive=True,
)
Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.

View File

@ -33,33 +33,24 @@ Run Locally
.. code:: python
import subprocess
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.reddit import reddit
command = [
"unstructured-ingest",
"reddit",
"--subreddit-name", "machinelearning",
"--client-id", "<client id here>",
"--client-secret", "<client secret here>",
"--user-agent", "Unstructured Ingest Subreddit fetcher by \\u\\...",
"--search-query", "Unstructured",
"--num-posts", "10",
"--output-dir", "reddit-ingest-output",
"--num-processes", "2",
"--verbose"
]
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
reddit(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="reddit-ingest-output",
num_processes=2,
),
subreddit_name="machinelearning",
client_id="<client id here>",
client_secret="<client secret here>",
user_agent=r"Unstructured Ingest Subreddit fetcher by \u\...",
search_query="Unstructured",
num_posts=10,
)
Run via the API
---------------
@ -90,35 +81,28 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
.. code:: python
import subprocess
import os
command = [
"unstructured-ingest",
"reddit",
"--subreddit-name", "machinelearning",
"--client-id", "<client id here>",
"--client-secret", "<client secret here>",
"--user-agent", "Unstructured Ingest Subreddit fetcher by \\u\\...",
"--search-query", "Unstructured",
"--num-posts", "10",
"--output-dir", "reddit-ingest-output",
"--num-processes", "2",
"--verbose"
"--partition-by-api",
"--api-key", "<UNSTRUCTURED-API-KEY>",
]
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.reddit import reddit
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
reddit(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="reddit-ingest-output",
num_processes=2,
partition_by_api=True,
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
),
subreddit_name="machinelearning",
client_id="<client id here>",
client_secret="<client secret here>",
user_agent=r"Unstructured Ingest Subreddit fetcher by \u\...",
search_query="Unstructured",
num_posts=10,
)
Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.

View File

@ -28,28 +28,20 @@ Run Locally
.. code:: python
import subprocess
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.s3 import s3
command = [
"unstructured-ingest",
"s3",
"--remote-url", "s3://utic-dev-tech-fixtures/small-pdf-set/",
"--anonymous",
"--output-dir", "s3-small-batch-output",
"--num-processes", "2"
]
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
s3(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="s3-small-batch-output",
num_processes=2,
),
remote_url="s3://utic-dev-tech-fixtures/small-pdf-set/",
anonymous=True,
)
Run via the API
---------------
@ -75,30 +67,24 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
.. code:: python
import subprocess
import os
command = [
"unstructured-ingest",
"s3",
"--remote-url", "s3://utic-dev-tech-fixtures/small-pdf-set/",
"--anonymous",
"--output-dir", "s3-small-batch-output",
"--num-processes", "2",
"--partition-by-api",
"--api-key", "<UNSTRUCTURED-API-KEY>",
]
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.s3 import s3
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
s3(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="s3-small-batch-output",
num_processes=2,
partition_by_api=True,
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
),
remote_url="s3://utic-dev-tech-fixtures/small-pdf-set/",
anonymous=True,
)
Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.

View File

@ -32,32 +32,25 @@ Run Locally
.. code:: python
import subprocess
import os
command = [
"unstructured-ingest",
"salesforce",
"--username" "$SALESFORCE_USERNAME"
"--consumer-key" "$SALESFORCE_CONSUMER_KEY"
"--private-key-path" "$SALESFORCE_PRIVATE_KEY_PATH"
"--categories" "EmailMessage,Account,Lead,Case,Campaign"
"--output-dir" "salesforce-output"
"--num-processes", "2"
"--recursive",
"--verbose",
]
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.salesforce import salesforce
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
salesforce(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="salesforce-output",
num_processes=2,
),
username=os.getenv("SALESFORCE_USERNAME"),
consumer_key=os.getenv("SALESFORCE_CONSUMER_KEY"),
private_key_path=os.getenv("SALESFORCE_PRIVATE_KEY_PATH"),
categories=["EmailMessage", "Account", "Lead", "Case", "Campaign"],
recursive=True,
)
Run via the API
---------------
@ -87,34 +80,27 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
.. code:: python
import subprocess
import os
command = [
"unstructured-ingest",
"salesforce",
"--username" "$SALESFORCE_USERNAME"
"--consumer-key" "$SALESFORCE_CONSUMER_KEY"
"--private-key-path" "$SALESFORCE_PRIVATE_KEY_PATH"
"--categories" "EmailMessage,Account,Lead,Case,Campaign"
"--output-dir" "salesforce-output"
"--num-processes", "2"
"--recursive",
"--verbose",
"--partition-by-api",
"--api-key", "<UNSTRUCTURED-API-KEY>",
]
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.salesforce import salesforce
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
salesforce(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="salesforce-output",
num_processes=2,
partition_by_api=True,
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
),
username=os.getenv("SALESFORCE_USERNAME"),
consumer_key=os.getenv("SALESFORCE_CONSUMER_KEY"),
private_key_path=os.getenv("SALESFORCE_PRIVATE_KEY_PATH"),
categories=["EmailMessage", "Account", "Lead", "Case", "Campaign"],
recursive=True,
)
Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.

View File

@ -25,37 +25,32 @@ Run Locally
--files-only "Flag to process only files within the site(s)" \
--output-dir sharepoint-ingest-output \
--num-processes 2 \
--path "Shared Documents" \
--verbose
.. tab:: Python
.. code:: python
import subprocess
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.sharepoint import sharepoint
command = [
"unstructured-ingest",
"sharepoint",
"--client-id", "<Microsoft Sharepoint app client-id>",
"--client-cred", "<Microsoft Sharepoint app client-secret>",
"--site", "<e.g https://contoso.sharepoint.com or https://contoso.admin.sharepoint.com to process all sites within tenant>",
"--files-only", "Flag to process only files within the site(s)",
"--output-dir", "sharepoint-ingest-output",
"--num-processes", "2",
"--verbose",
]
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
sharepoint(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="sharepoint-ingest-output",
num_processes=2,
),
client_id="<Microsoft Sharepoint app client-id>",
client_cred="<Microsoft Sharepoint app client-secret>",
site="<e.g https://contoso.sharepoint.com, or https://contoso.admin.sharepoint.com to process all sites within tenant>",
# Flag to process only files within the site(s)
files_only=True,
path="Shared Documents",
recursive=False,
)
Run via the API
---------------
@ -77,6 +72,7 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
--output-dir sharepoint-ingest-output \
--num-processes 2 \
--verbose \
--path "Shared Documents" \
--partition-by-api \
--api-key "<UNSTRUCTURED-API-KEY>"
@ -84,33 +80,29 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
.. code:: python
import subprocess
import os
command = [
"unstructured-ingest",
"sharepoint",
"--client-id", "<Microsoft Sharepoint app client-id>",
"--client-cred", "<Microsoft Sharepoint app client-secret>",
"--site", "<e.g https://contoso.sharepoint.com or https://contoso.admin.sharepoint.com to process all sites within tenant>",
"--files-only", "Flag to process only files within the site(s)",
"--output-dir", "sharepoint-ingest-output",
"--num-processes", "2",
"--verbose",
"--partition-by-api",
"--api-key", "<UNSTRUCTURED-API-KEY>",
]
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.sharepoint import sharepoint
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
sharepoint(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="sharepoint-ingest-output",
num_processes=2,
partition_by_api=True,
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
),
client_id="<Microsoft Sharepoint app client-id>",
client_cred="<Microsoft Sharepoint app client-secret>",
site="<e.g https://contoso.sharepoint.com, or https://contoso.admin.sharepoint.com to process all sites within tenant>",
# Flag to process only files within the site(s)
files_only=True,
path="Shared Documents",
recursive=False,
)
Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.

View File

@ -30,30 +30,22 @@ Run Locally
.. code:: python
import subprocess
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.slack import slack
command = [
"unstructured-ingest",
"slack",
"--channels", "12345678",
"--token", "12345678",
"--download-dir", "slack-ingest-download",
"--output-dir", "slack-ingest-output",
"--start-date", "2023-04-01T01:00:00-08:00",
"--end-date", "2023-04-02"
]
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
slack(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="slack-ingest-output",
num_processes=2,
),
channels=["12345678"],
token="12345678",
start_date="2023-04-01T01:00:00-08:00",
end_date="2023-04-02",
)
Run via the API
---------------
@ -81,32 +73,26 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
.. code:: python
import subprocess
import os
command = [
"unstructured-ingest",
"slack",
"--channels", "12345678",
"--token", "12345678",
"--download-dir", "slack-ingest-download",
"--output-dir", "slack-ingest-output",
"--start-date", "2023-04-01T01:00:00-08:00",
"--end-date", "2023-04-02",
"--partition-by-api",
"--api-key", "<UNSTRUCTURED-API-KEY>",
]
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.slack import slack
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
slack(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="slack-ingest-output",
num_processes=2,
partition_by_api=True,
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
),
channels=["12345678"],
token="12345678",
start_date="2023-04-01T01:00:00-08:00",
end_date="2023-04-02",
)
Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.

View File

@ -28,28 +28,21 @@ Run Locally
.. code:: python
import subprocess
from unstructured.ingest.runner.wikipedia import wikipedia
from unstructured.ingest.interfaces import ReadConfig, PartitionConfig
command = [
"unstructured-ingest",
"wikipedia",
"--page-title", "Open Source Software",
"--output-dir", "dropbox-output",
"--num-processes", "2",
"--verbose",
]
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
wikipedia(
verbose=True,
read_config=ReadConfig(),
partition_config=PartitionConfig(
output_dir="wikipedia-ingest-output",
num_processes=2
),
page_title="Open Source Software",
auto_suggest=False,
)
Run via the API
---------------
@ -75,30 +68,24 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
.. code:: python
import subprocess
import os
command = [
"unstructured-ingest",
"wikipedia",
"--page-title", "Open Source Software",
"--output-dir", "dropbox-output",
"--num-processes", "2",
"--verbose",
"--partition-by-api",
"--api-key", "<UNSTRUCTURED-API-KEY>",
]
from unstructured.ingest.interfaces import PartitionConfig, ReadConfig
from unstructured.ingest.runner.wikipedia import wikipedia
# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE)
output, error = process.communicate()
# Print output
if process.returncode == 0:
print('Command executed successfully. Output:')
print(output.decode())
else:
print('Command failed. Error:')
print(error.decode())
if __name__ == "__main__":
    # Build the two config objects first; partitioning is delegated to the
    # unstructured API, with the key taken from the environment.
    reader_settings = ReadConfig()
    partition_settings = PartitionConfig(
        output_dir="wikipedia-ingest-output",
        num_processes=2,
        partition_by_api=True,
        api_key=os.getenv("UNSTRUCTURED_API_KEY"),
    )
    wikipedia(
        verbose=True,
        read_config=reader_settings,
        partition_config=partition_settings,
        page_title="Open Source Software",
        auto_suggest=False,
    )
Additionally, you will need to pass the ``--partition-endpoint`` flag if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.

View File

@ -25,4 +25,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--files-only "Flag to process only files within the site(s)" \
--output-dir sharepoint-ingest-output \
--num-processes 2 \
--path "Shared Documents" \
--verbose

View File

@ -1 +1 @@
__version__ = "0.10.19-dev4" # pragma: no cover
__version__ = "0.10.19-dev5" # pragma: no cover

View File

@ -10,11 +10,11 @@ from unstructured.ingest.runner.writers import writer_map
def airtable(
verbose: bool,
read_config: ReadConfig,
partition_config: PartitionConfig,
personal_access_token: str,
list_of_paths: t.Optional[str],
verbose: bool = False,
list_of_paths: t.Optional[str] = None,
writer_type: t.Optional[str] = None,
writer_kwargs: t.Optional[dict] = None,
**kwargs,

View File

@ -9,14 +9,14 @@ from unstructured.ingest.runner.writers import writer_map
def azure(
verbose: bool,
read_config: ReadConfig,
partition_config: PartitionConfig,
account_name: t.Optional[str],
account_key: t.Optional[str],
connection_string: t.Optional[str],
remote_url: str,
recursive: bool,
verbose: bool = False,
recursive: bool = False,
writer_type: t.Optional[str] = None,
writer_kwargs: t.Optional[dict] = None,
**kwargs,

View File

@ -13,13 +13,13 @@ def biomed(
verbose: bool,
read_config: ReadConfig,
partition_config: PartitionConfig,
path: t.Optional[str],
api_id: t.Optional[str],
api_from: t.Optional[str],
api_until: t.Optional[str],
max_retries: int,
max_request_time: int,
decay: float,
path: t.Optional[str] = None,
api_id: t.Optional[str] = None,
api_from: t.Optional[str] = None,
api_until: t.Optional[str] = None,
writer_type: t.Optional[str] = None,
writer_kwargs: t.Optional[dict] = None,
**kwargs,

View File

@ -9,12 +9,12 @@ from unstructured.ingest.runner.writers import writer_map
def box(
verbose: bool,
read_config: ReadConfig,
partition_config: PartitionConfig,
remote_url: str,
recursive: bool,
box_app_config: t.Optional[str],
verbose: bool = False,
recursive: bool = False,
box_app_config: t.Optional[str] = None,
writer_type: t.Optional[str] = None,
writer_kwargs: t.Optional[dict] = None,
**kwargs,

View File

@ -10,7 +10,6 @@ from unstructured.ingest.runner.writers import writer_map
def confluence(
verbose: bool,
read_config: ReadConfig,
partition_config: PartitionConfig,
url: str,
@ -18,6 +17,7 @@ def confluence(
api_token: str,
max_num_of_spaces: int,
max_num_of_docs_from_each_space: int,
verbose: bool = False,
spaces: t.Optional[t.List[str]] = None,
writer_type: t.Optional[str] = None,
writer_kwargs: t.Optional[dict] = None,

View File

@ -11,12 +11,12 @@ from unstructured.ingest.runner.writers import writer_map
def delta_table(
verbose: bool,
read_config: ReadConfig,
partition_config: PartitionConfig,
table_uri: t.Union[str, Path],
version: t.Optional[int] = None,
storage_options: t.Optional[str] = None,
verbose: bool = False,
without_files: bool = False,
columns: t.Optional[t.List[str]] = None,
writer_type: t.Optional[str] = None,

View File

@ -10,12 +10,12 @@ from unstructured.ingest.runner.writers import writer_map
def discord(
verbose: bool,
read_config: ReadConfig,
partition_config: PartitionConfig,
channels: t.List[str],
token: str,
period: t.Optional[int],
verbose: bool = False,
period: t.Optional[int] = None,
writer_type: t.Optional[str] = None,
writer_kwargs: t.Optional[dict] = None,
**kwargs,

View File

@ -9,12 +9,12 @@ from unstructured.ingest.runner.writers import writer_map
def dropbox(
verbose: bool,
read_config: ReadConfig,
partition_config: PartitionConfig,
remote_url: str,
recursive: bool,
token: t.Optional[str],
verbose: bool = False,
recursive: bool = False,
token: t.Optional[str] = None,
writer_type: t.Optional[str] = None,
writer_kwargs: t.Optional[dict] = None,
**kwargs,

View File

@ -10,12 +10,12 @@ from unstructured.ingest.runner.writers import writer_map
def elasticsearch(
verbose: bool,
read_config: ReadConfig,
partition_config: PartitionConfig,
url: str,
index_name: str,
jq_query: t.Optional[str],
verbose: bool = False,
jq_query: t.Optional[str] = None,
writer_type: t.Optional[str] = None,
writer_kwargs: t.Optional[dict] = None,
**kwargs,

View File

@ -11,11 +11,11 @@ from unstructured.ingest.runner.writers import writer_map
def fsspec(
verbose: bool,
read_config: ReadConfig,
partition_config: PartitionConfig,
remote_url: str,
recursive: bool,
verbose: bool = False,
recursive: bool = False,
writer_type: t.Optional[str] = None,
writer_kwargs: t.Optional[dict] = None,
**kwargs,

View File

@ -9,12 +9,12 @@ from unstructured.ingest.runner.writers import writer_map
def gcs(
verbose: bool,
read_config: ReadConfig,
partition_config: PartitionConfig,
remote_url: str,
recursive: bool,
token: t.Optional[str],
verbose: bool = False,
recursive: bool = False,
token: t.Optional[str] = None,
writer_type: t.Optional[str] = None,
writer_kwargs: t.Optional[dict] = None,
**kwargs,

View File

@ -10,13 +10,13 @@ from unstructured.ingest.runner.writers import writer_map
def github(
verbose: bool,
read_config: ReadConfig,
partition_config: PartitionConfig,
url: str,
git_branch: str,
git_access_token: t.Optional[str],
git_file_glob: t.Optional[str],
verbose: bool = False,
git_access_token: t.Optional[str] = None,
git_file_glob: t.Optional[str] = None,
writer_type: t.Optional[str] = None,
writer_kwargs: t.Optional[dict] = None,
**kwargs,

View File

@ -10,13 +10,13 @@ from unstructured.ingest.runner.writers import writer_map
def gitlab(
verbose: bool,
read_config: ReadConfig,
partition_config: PartitionConfig,
url: str,
git_branch: str,
git_access_token: t.Optional[str],
git_file_glob: t.Optional[str],
verbose: bool = False,
git_access_token: t.Optional[str] = None,
git_file_glob: t.Optional[str] = None,
writer_type: t.Optional[str] = None,
writer_kwargs: t.Optional[dict] = None,
**kwargs,

View File

@ -10,13 +10,13 @@ from unstructured.ingest.runner.writers import writer_map
def gdrive(
verbose: bool,
read_config: ReadConfig,
partition_config: PartitionConfig,
service_account_key: str,
recursive: bool,
drive_id: str,
extension: t.Optional[str],
verbose: bool = False,
recursive: bool = False,
extension: t.Optional[str] = None,
writer_type: t.Optional[str] = None,
writer_kwargs: t.Optional[dict] = None,
**kwargs,

View File

@ -10,15 +10,15 @@ from unstructured.ingest.runner.writers import writer_map
def jira(
verbose: bool,
read_config: ReadConfig,
partition_config: PartitionConfig,
url: str,
user_email: str,
api_token: str,
projects: t.Optional[t.List[str]],
boards: t.Optional[t.List[str]],
issues: t.Optional[t.List[str]],
verbose: bool = False,
projects: t.Optional[t.List[str]] = None,
boards: t.Optional[t.List[str]] = None,
issues: t.Optional[t.List[str]] = None,
writer_type: t.Optional[str] = None,
writer_kwargs: t.Optional[dict] = None,
**kwargs,

View File

@ -8,12 +8,12 @@ from unstructured.ingest.runner.writers import writer_map
def local(
verbose: bool,
read_config: ReadConfig,
partition_config: PartitionConfig,
input_path: str,
recursive: bool,
file_glob: t.Optional[str],
verbose: bool = False,
recursive: bool = False,
file_glob: t.Optional[str] = None,
writer_type: t.Optional[str] = None,
writer_kwargs: t.Optional[dict] = None,
**kwargs,

View File

@ -11,11 +11,11 @@ from unstructured.ingest.runner.writers import writer_map
def notion(
verbose: bool,
read_config: ReadConfig,
partition_config: PartitionConfig,
api_key: str,
recursive: bool,
verbose: bool = False,
recursive: bool = False,
page_ids: t.Optional[t.List[str]] = None,
database_ids: t.Optional[t.List[str]] = None,
writer_type: t.Optional[str] = None,

View File

@ -10,16 +10,16 @@ from unstructured.ingest.runner.writers import writer_map
def onedrive(
verbose: bool,
read_config: ReadConfig,
partition_config: PartitionConfig,
tenant: str,
user_pname: str,
client_id: str,
client_cred: str,
authority_url: t.Optional[str],
path: t.Optional[str],
recursive: bool,
verbose: bool = False,
authority_url: t.Optional[str] = None,
path: t.Optional[str] = None,
recursive: bool = False,
writer_type: t.Optional[str] = None,
writer_kwargs: t.Optional[dict] = None,
**kwargs,

View File

@ -10,15 +10,15 @@ from unstructured.ingest.runner.writers import writer_map
def outlook(
verbose: bool,
read_config: ReadConfig,
partition_config: PartitionConfig,
user_email: str,
client_id: t.Optional[str],
client_cred: t.Optional[str],
tenant: t.Optional[str],
authority_url: t.Optional[str],
recursive: bool,
verbose: bool = False,
recursive: bool = False,
client_id: t.Optional[str] = None,
client_cred: t.Optional[str] = None,
tenant: t.Optional[str] = None,
authority_url: t.Optional[str] = None,
outlook_folders: t.Optional[t.List[str]] = None,
writer_type: t.Optional[str] = None,
writer_kwargs: t.Optional[dict] = None,

View File

@ -10,15 +10,15 @@ from unstructured.ingest.runner.writers import writer_map
def reddit(
verbose: bool,
read_config: ReadConfig,
partition_config: PartitionConfig,
subreddit_name: str,
client_id: t.Optional[str],
client_secret: t.Optional[str],
user_agent: str,
search_query: t.Optional[str],
num_posts: int,
verbose: bool = False,
client_id: t.Optional[str] = None,
client_secret: t.Optional[str] = None,
search_query: t.Optional[str] = None,
writer_type: t.Optional[str] = None,
writer_kwargs: t.Optional[dict] = None,
**kwargs,

View File

@ -9,12 +9,12 @@ from unstructured.ingest.runner.writers import writer_map
def s3(
verbose: bool,
read_config: ReadConfig,
partition_config: PartitionConfig,
remote_url: str,
recursive: bool,
anonymous: bool,
verbose: bool = False,
recursive: bool = False,
anonymous: bool = False,
writer_type: t.Optional[str] = None,
writer_kwargs: t.Optional[dict] = None,
**kwargs,

View File

@ -10,14 +10,14 @@ from unstructured.ingest.runner.writers import writer_map
def salesforce(
verbose: bool,
read_config: ReadConfig,
partition_config: PartitionConfig,
recursive: bool,
username: str,
consumer_key: str,
private_key_path: str,
categories: t.List[str],
verbose: bool = False,
recursive: bool = False,
writer_type: t.Optional[str] = None,
writer_kwargs: t.Optional[dict] = None,
**kwargs,

View File

@ -14,9 +14,9 @@ class SharePoint(Runner):
site: str,
client_id: str,
client_cred: str,
files_only: bool,
path: str,
recursive: bool,
files_only: bool = False,
recursive: bool = False,
**kwargs,
):
writer_kwargs = self.writer_kwargs if self.writer_kwargs else {}

View File

@ -10,13 +10,13 @@ from unstructured.ingest.runner.writers import writer_map
def slack(
verbose: bool,
read_config: ReadConfig,
partition_config: PartitionConfig,
channels: t.List[str],
token: str,
start_date: t.Optional[str],
end_date: t.Optional[str],
verbose: bool = False,
start_date: t.Optional[str] = None,
end_date: t.Optional[str] = None,
writer_type: t.Optional[str] = None,
writer_kwargs: t.Optional[dict] = None,
**kwargs,

View File

@ -10,11 +10,11 @@ from unstructured.ingest.runner.writers import writer_map
def wikipedia(
verbose: bool,
read_config: ReadConfig,
partition_config: PartitionConfig,
page_title: str,
auto_suggest: bool,
verbose: bool = False,
auto_suggest: bool = False,
writer_type: t.Optional[str] = None,
writer_kwargs: t.Optional[dict] = None,
**kwargs,