mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00

* add the first version of airtable connector * change imports as inline to fail gracefully in case of lacking dependency * parse tables as csv rather than plain text * add relevant logic to be able to use --airtable-list-of-paths * add script for creation of reseources for testing, add test script (large) for testing with a large number of tables to validate scroll functionality, update test script (diff) based on the new settings * fix ingest test names * add scripts for the large table test * remove large table test from diff test * make base and table ids explicit * add and remove comments * use -ne instead of != * update code based on the recent ingest refactor, update changelog and version * shellcheck fix * update comments * update check-num-rows-and-columns-output error message Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> * update help comments * update help comments * update help comments * update workflows to set auth tokens and to run make install * add comments on create_scale_test_components * separate component ids from the test script, add comments to document test component creation * add LARGE_BASE test, implement LARGE_BASE component creation, replace component id * shellcheck fixes * shellcheck fixes * update docs * update comment * bump version * add wrongly deleted file * sort columns before saving to process * Update ingest test fixtures (#1098) Co-authored-by: ahmetmeleq <ahmetmeleq@users.noreply.github.com> --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: ahmetmeleq <ahmetmeleq@users.noreply.github.com>
83 lines
3.3 KiB
Python
83 lines
3.3 KiB
Python
import os
|
|
|
|
# import pyairtable as pyair
|
|
from pyairtable import Api
|
|
|
|
from unstructured.ingest.logger import logger
|
|
|
|
# Number of records this script creates in the large table; create_n_records also
# skips creation once the table already holds at least this many records.
SCALE_TEST_NUMBER_OF_RECORDS = 20_000

# Airtable access token with read AND write permissions for the respective workspace.
token = os.environ["AIRTABLE_ACCESS_TOKEN_WRITE"]

# The IDs read below are defined in component_ids.sh.
# If new ones need to be created, follow the guidance here and in component_ids.sh.

# Base that is intended to contain exactly one large table, which this script fills.
# If the ID is not in the environment, create a new base via the Airtable UI and
# take the base ID from the URL structure
# (https://support.airtable.com/docs/finding-airtable-ids).
large_table_base_id = os.environ["LARGE_TABLE_BASE_ID"]

# The single table inside the base "large_table_base"; it is meant to be large and
# is filled by this script.
# If the ID is not in the environment, create a new table via the Airtable UI and
# take the table ID from the URL structure
# (https://support.airtable.com/docs/finding-airtable-ids).
large_table_table_id = os.environ["LARGE_TABLE_TABLE_ID"]
|
|
|
|
# ID of a base that is intended to contain lots of tables.
|
|
# large_base_base_id = os.environ["LARGE_BASE_BASE_ID"]
|
|
# Creating bases and tables is not yet supported in pyairtable. Try Airtable Web API instead:
# https://airtable.com/developers/web/api/create-base
# https://airtable.com/developers/web/api/create-table
|
|
|
|
|
|
def create_n_bases(api, number_of_bases):
    """Placeholder for creating ``number_of_bases`` Airtable bases.

    This function always raises: per its error message, pyairtable does not support
    creating bases, and the Airtable Web API should be used instead.

    Args:
        api: Unused. Presumably a ``pyairtable.Api`` client -- confirm when implementing.
        number_of_bases: Unused. The number of bases that would be created.

    Raises:
        NotImplementedError: Always.
    """
    # NOTE: a guard was sketched for the future implementation: when the Airtable org
    # already has more than 99 bases, log a warning and return without creating
    # anything, to avoid duplication and bloating.
    #
    # The message uses implicit string concatenation rather than backslash
    # continuations inside the literal, which would embed the source indentation
    # in the text.
    raise NotImplementedError(
        "Creating bases is not yet supported in pyairtable. "
        "Try Airtable Web API instead: "
        "https://airtable.com/developers/web/api/create-base",
    )
|
|
|
|
|
|
def create_n_tables(base, number_of_tables):
    """Placeholder for creating ``number_of_tables`` tables inside an Airtable base.

    This function always raises: per its error message, pyairtable does not support
    creating tables, and the Airtable Web API should be used instead.

    Args:
        base: Unused. Presumably the target base (or its ID) -- confirm when implementing.
        number_of_tables: Unused. The number of tables that would be created.

    Raises:
        NotImplementedError: Always.
    """
    # NOTE: a guard was sketched for the future implementation: when the base already
    # has more than 99 tables, log a warning and return without creating anything,
    # to avoid duplication and bloating.
    #
    # The message uses implicit string concatenation rather than backslash
    # continuations inside the literal, which would embed the source indentation
    # in the text.
    raise NotImplementedError(
        "Creating tables is not yet supported in pyairtable. "
        "Try Airtable Web API instead: "
        "https://airtable.com/developers/web/api/create-table",
    )
|
|
|
|
|
|
def create_n_records(table, number_of_records):
    """Create ``number_of_records`` placeholder records in ``table``.

    The table's existing records are fetched and counted first. If the table already
    holds at least ``SCALE_TEST_NUMBER_OF_RECORDS`` records, a warning is logged and
    nothing is created, so repeated runs do not keep inflating the table.

    Args:
        table: Object exposing ``all()`` and ``batch_create(records)``; presumably a
            pyairtable ``Table`` (that is what the script entry point passes).
        number_of_records: How many records to create. Record ``i`` gets the single
            field ``{"Name": f"My Name is {i}"}``.

    Returns:
        None.
    """
    # Implicit string concatenation keeps the messages on one clean line; a backslash
    # continuation inside the literal would glue the sentences together and/or embed
    # the source indentation in the logged text.
    logger.warning(
        "Fetching table records to count, before creation of new records. "
        "This should take around 1 second per 415 records.",
    )
    existing_record_count = len(table.all())
    if existing_record_count >= SCALE_TEST_NUMBER_OF_RECORDS:
        logger.warning(
            "Table already has a high number of records. "
            "Aborting creation of new records to avoid duplication and bloating.",
        )
        return

    records = [{"Name": f"My Name is {i}"} for i in range(number_of_records)]
    table.batch_create(records)
|
|
|
|
|
|
if __name__ == "__main__":
    # Build a client from the write-capable token and bind it to the one table that
    # the scale test fills.
    large_table = Api(token).table(large_table_base_id, large_table_table_id)

    logger.info("Creating records, this should take about 1 second per 40 records.")
    create_n_records(large_table, SCALE_TEST_NUMBER_OF_RECORDS)
|