2022-06-29 14:35:19 -04:00
|
|
|
"""
|
|
|
|
setup.py
|
|
|
|
|
|
|
|
unstructured - pre-processing tools for unstructured data
|
|
|
|
|
|
|
|
Copyright 2022 Unstructured Technologies, Inc.
|
|
|
|
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
you may not use this file except in compliance with the License.
|
|
|
|
You may obtain a copy of the License at
|
|
|
|
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
See the License for the specific language governing permissions and
|
|
|
|
limitations under the License.
|
|
|
|
"""
|
2023-05-24 17:29:35 -05:00
|
|
|
from typing import List, Optional, Union
|
|
|
|
|
2023-02-27 17:30:54 +01:00
|
|
|
from setuptools import find_packages, setup
|
2022-06-29 14:35:19 -04:00
|
|
|
|
|
|
|
from unstructured.__version__ import __version__
|
|
|
|
|
2023-05-24 17:29:35 -05:00
|
|
|
|
|
|
|
def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List[str]:
    """Collect requirement specifiers from one or more requirements files.

    Args:
        file_list: A single path, or a list of paths, to requirements files.
            When ``None``, ``requirements/base.in`` is read.

    Returns:
        The requirement lines of all files, in the order the files were given,
        with surrounding whitespace (including the trailing newline) removed.
        Blank lines, comment lines (starting with ``#``) and pip option lines
        (starting with ``-``, such as ``-c constraints.in``) are omitted.

    Raises:
        OSError: If one of the files cannot be opened.
    """
    if file_list is None:
        file_list = ["requirements/base.in"]
    if isinstance(file_list, str):
        file_list = [file_list]

    requirements: List[str] = []
    for file in file_list:
        with open(file, encoding="utf-8") as f:
            for line in f:
                # Strip before filtering so that indented comments/options are
                # recognised and no entry carries a trailing newline.
                req = line.strip()
                if req and not req.startswith(("#", "-")):
                    requirements.append(req)
    return requirements
|
|
|
|
|
|
|
|
|
2022-06-29 14:35:19 -04:00
|
|
|
# Read the long description through a context manager so the README file
# handle is closed deterministically instead of being left to the garbage
# collector.
with open("README.md", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

# Package metadata and dependency declaration. Core dependencies come from
# requirements/base.in (the load_requirements default); each optional extra
# maps to its own requirements file.
setup(
    name="unstructured",
    description="A library that prepares raw documents for downstream ML tasks.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    keywords="NLP PDF HTML CV XML parsing preprocessing",
    url="https://github.com/Unstructured-IO/unstructured",
    python_requires=">=3.7.0",
    classifiers=[
        "Development Status :: 4 - Beta",
        "Intended Audience :: Developers",
        "Intended Audience :: Education",
        "Intended Audience :: Science/Research",
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
    author="Unstructured Technologies",
    author_email="devops@unstructuredai.io",
    license="Apache-2.0",
    packages=find_packages(),
    version=__version__,
    # Installs the `unstructured-ingest` command-line entry point.
    entry_points={
        "console_scripts": ["unstructured-ingest=unstructured.ingest.main:main"],
    },
    install_requires=load_requirements(),
    extras_require={
        "huggingface": load_requirements("requirements/huggingface.in"),
        "local-inference": load_requirements("requirements/local-inference.in"),
        "s3": load_requirements("requirements/ingest-s3.in"),
        "azure": load_requirements("requirements/ingest-azure.in"),
        "discord": load_requirements("requirements/ingest-discord.in"),
        "github": load_requirements("requirements/ingest-github.in"),
        "gitlab": load_requirements("requirements/ingest-gitlab.in"),
        "reddit": load_requirements("requirements/ingest-reddit.in"),
        "slack": load_requirements("requirements/ingest-slack.in"),
        "wikipedia": load_requirements("requirements/ingest-wikipedia.in"),
        "google-drive": load_requirements("requirements/ingest-google-drive.in"),
        "gcs": load_requirements("requirements/ingest-gcs.in"),
        "elasticsearch": load_requirements("requirements/ingest-elasticsearch.in"),
        "dropbox": load_requirements("requirements/ingest-dropbox.in"),
    },
    package_dir={"unstructured": "unstructured"},
    # Ship the text resources under unstructured/nlp with the package.
    package_data={"unstructured": ["nlp/*.txt"]},
)
|