mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-11-04 03:53:45 +00:00 
			
		
		
		
	feat: Create spacy notebook example (#593)
* add new notebook for spacy
This commit is contained in:
		
							parent
							
								
									7eac1f8ca7
								
							
						
					
					
						commit
						34d563c1fc
					
				
							
								
								
									
										
											BIN
										
									
								
								example-docs/fake-memo.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								example-docs/fake-memo.pdf
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										17
									
								
								examples/spacy/README.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										17
									
								
								examples/spacy/README.md
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,17 @@
 | 
			
		||||
# Loading `unstructured` outputs into Spacy
 | 
			
		||||
 | 
			
		||||
The following example shows how to load `unstructured` outputs into Spacy.
 | 
			
		||||
This allows you to perform NLP to find important data from outputs the `unstructured`
 | 
			
		||||
library has extracted.
 | 
			
		||||
Follow the instructions [here](https://spacy.io/usage)
 | 
			
		||||
to install Spacy on your system.
 | 
			
		||||
 | 
			
		||||
Once you have installed MySQL, you can connect to MySQL with the command `mysql -u root`.
 | 
			
		||||
You can create a non-root user and an `unstructured_example` database using the following
 | 
			
		||||
commands:
 | 
			
		||||
 | 
			
		||||
## Running the example
 | 
			
		||||
 | 
			
		||||
1. Run `pip install -r requirements.txt` to install the Python dependencies.
 | 
			
		||||
1. Run `jupyter-notebook to start.
 | 
			
		||||
1. Run the `load-into-spacy.ipynb` notebook.
 | 
			
		||||
							
								
								
									
										249
									
								
								examples/spacy/load-into-spacy.ipynb
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										249
									
								
								examples/spacy/load-into-spacy.ipynb
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,249 @@
 | 
			
		||||
{
 | 
			
		||||
 "cells": [
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "markdown",
 | 
			
		||||
   "id": "2fac3543",
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "source": [
 | 
			
		||||
    "# Loading Data into Spacy"
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "markdown",
 | 
			
		||||
   "id": "30bc0a1b",
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "source": [
 | 
			
		||||
    "The goal of this notebook is to show you how to start a spacy project with Unstructured's Elements. This allows you to create your NLP projects.\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "Make sure you have Spacy installed on your local computer before running this notebook. If not, you can find the instructions for installation [here](https://spacy.io/usage)."
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "markdown",
 | 
			
		||||
   "id": "ac83c096",
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "source": [
 | 
			
		||||
    "# Preprocess Documents with Unstructured"
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "markdown",
 | 
			
		||||
   "id": "a29ef57d",
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "source": [
 | 
			
		||||
    "First, we'll pre-process a few documents using the the `unstructured` libraries. The example documents are available under the `example-docs` directory in the `unstructured` repo. At the end of this section, we'll wind up with a list of `Element` objects that we can pass into an `unstructured` staging brick."
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": 3,
 | 
			
		||||
   "id": "adb6b8f7",
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [],
 | 
			
		||||
   "source": [
 | 
			
		||||
    "import os\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "from unstructured.partition.auto import partition"
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": 8,
 | 
			
		||||
   "id": "8464299b",
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [],
 | 
			
		||||
   "source": [
 | 
			
		||||
    "# NOTE: Update this directory if you are running the notebook\n",
 | 
			
		||||
    "# from somewhere other than the examples/spacy folder in the\n",
 | 
			
		||||
    "# unstructured repo\n",
 | 
			
		||||
    "EXAMPLE_DOCS_FOLDER = \"../../example-docs/\""
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": 9,
 | 
			
		||||
   "id": "2fd24424",
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [],
 | 
			
		||||
   "source": [
 | 
			
		||||
    "document_to_process = \"fake-memo.pdf\"\n",
 | 
			
		||||
    "filename = os.path.join(EXAMPLE_DOCS_FOLDER, document_to_process)\n",
 | 
			
		||||
    "elements = partition(filename=filename, strategy=\"fast\")"
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": 10,
 | 
			
		||||
   "id": "0aa45e81",
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [
 | 
			
		||||
    {
 | 
			
		||||
     "data": {
 | 
			
		||||
      "text/plain": [
 | 
			
		||||
       "'May 5, 2023'"
 | 
			
		||||
      ]
 | 
			
		||||
     },
 | 
			
		||||
     "execution_count": 10,
 | 
			
		||||
     "metadata": {},
 | 
			
		||||
     "output_type": "execute_result"
 | 
			
		||||
    }
 | 
			
		||||
   ],
 | 
			
		||||
   "source": [
 | 
			
		||||
    "elements[0].text"
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": 11,
 | 
			
		||||
   "id": "2429f8a5",
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [
 | 
			
		||||
    {
 | 
			
		||||
     "data": {
 | 
			
		||||
      "text/plain": [
 | 
			
		||||
       "{'filename': 'fake-memo.pdf',\n",
 | 
			
		||||
       " 'file_directory': '../../example-docs',\n",
 | 
			
		||||
       " 'filetype': 'application/pdf',\n",
 | 
			
		||||
       " 'page_number': 1}"
 | 
			
		||||
      ]
 | 
			
		||||
     },
 | 
			
		||||
     "execution_count": 11,
 | 
			
		||||
     "metadata": {},
 | 
			
		||||
     "output_type": "execute_result"
 | 
			
		||||
    }
 | 
			
		||||
   ],
 | 
			
		||||
   "source": [
 | 
			
		||||
    "elements[0].metadata.to_dict()"
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "markdown",
 | 
			
		||||
   "id": "1fd556ff",
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "source": [
 | 
			
		||||
    "# Extract Numbers Using Spacy\n"
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "markdown",
 | 
			
		||||
   "id": "bdf2cefe",
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "source": [
 | 
			
		||||
    "Now let's import `spacy` and create a function to extract noun phrases with numbers. First we'll use a simple example then we'll use the text extracted by `unstructured`.\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "The function first creates a spacy object with the text, then iterates through the spacy object to find the noun phrases with numbers. It then formats the phrases and appends to a list."
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": 1,
 | 
			
		||||
   "id": "bfd20f75",
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [
 | 
			
		||||
    {
 | 
			
		||||
     "name": "stdout",
 | 
			
		||||
     "output_type": "stream",
 | 
			
		||||
     "text": [
 | 
			
		||||
      "Number: 10, Noun: apples, Context: 10 apples\n",
 | 
			
		||||
      "Number: 5, Noun: oranges, Context: 5 oranges\n"
 | 
			
		||||
     ]
 | 
			
		||||
    }
 | 
			
		||||
   ],
 | 
			
		||||
   "source": [
 | 
			
		||||
    "import spacy\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "nlp = spacy.load(\"en_core_web_sm\")\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "def extract_numbers_with_context(text):\n",
 | 
			
		||||
    "    doc = nlp(text)\n",
 | 
			
		||||
    "    numbers = []\n",
 | 
			
		||||
    "    \n",
 | 
			
		||||
    "    for token in doc:\n",
 | 
			
		||||
    "        if token.like_num and token.dep_ == 'nummod' and token.head.pos_ == 'NOUN':\n",
 | 
			
		||||
    "            number = token.text\n",
 | 
			
		||||
    "            noun = token.head.text\n",
 | 
			
		||||
    "            context = ' '.join([number, noun])\n",
 | 
			
		||||
    "            numbers.append((number, noun, context))\n",
 | 
			
		||||
    "    \n",
 | 
			
		||||
    "    return numbers\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "# Example usage\n",
 | 
			
		||||
    "text = \"I bought 10 apples and 5 oranges yesterday.\"\n",
 | 
			
		||||
    "numbers_with_context = extract_numbers_with_context(text)\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "for number, noun, context in numbers_with_context:\n",
 | 
			
		||||
    "    print(f\"Number: {number}, Noun: {noun}, Context: {context}\")"
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "markdown",
 | 
			
		||||
   "id": "7eae9735",
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "source": [
 | 
			
		||||
    "### Using the Data Extracted with Unstructured's Library"
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": 28,
 | 
			
		||||
   "id": "7c738f91",
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [],
 | 
			
		||||
   "source": [
 | 
			
		||||
    "numbers_with_context = extract_numbers_with_context(elements[2].text)"
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": 29,
 | 
			
		||||
   "id": "3459555b",
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [
 | 
			
		||||
    {
 | 
			
		||||
     "name": "stdout",
 | 
			
		||||
     "output_type": "stream",
 | 
			
		||||
     "text": [
 | 
			
		||||
      "Number: 20,000, Noun: bottles, Context: 20,000 bottles\n",
 | 
			
		||||
      "Number: 10,000, Noun: blankets, Context: 10,000 blankets\n",
 | 
			
		||||
      "Number: 200, Noun: laptops, Context: 200 laptops\n",
 | 
			
		||||
      "Number: 3, Noun: trucks, Context: 3 trucks\n",
 | 
			
		||||
      "Number: 15, Noun: hours, Context: 15 hours\n"
 | 
			
		||||
     ]
 | 
			
		||||
    }
 | 
			
		||||
   ],
 | 
			
		||||
   "source": [
 | 
			
		||||
    "for number, noun, context in numbers_with_context:\n",
 | 
			
		||||
    "    print(f\"Number: {number}, Noun: {noun}, Context: {context}\")"
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": null,
 | 
			
		||||
   "id": "dadd055a",
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [],
 | 
			
		||||
   "source": []
 | 
			
		||||
  }
 | 
			
		||||
 ],
 | 
			
		||||
 "metadata": {
 | 
			
		||||
  "kernelspec": {
 | 
			
		||||
   "display_name": "Python 3 (ipykernel)",
 | 
			
		||||
   "language": "python",
 | 
			
		||||
   "name": "python3"
 | 
			
		||||
  },
 | 
			
		||||
  "language_info": {
 | 
			
		||||
   "codemirror_mode": {
 | 
			
		||||
    "name": "ipython",
 | 
			
		||||
    "version": 3
 | 
			
		||||
   },
 | 
			
		||||
   "file_extension": ".py",
 | 
			
		||||
   "mimetype": "text/x-python",
 | 
			
		||||
   "name": "python",
 | 
			
		||||
   "nbconvert_exporter": "python",
 | 
			
		||||
   "pygments_lexer": "ipython3",
 | 
			
		||||
   "version": "3.8.15"
 | 
			
		||||
  }
 | 
			
		||||
 },
 | 
			
		||||
 "nbformat": 4,
 | 
			
		||||
 "nbformat_minor": 5
 | 
			
		||||
}
 | 
			
		||||
							
								
								
									
										2
									
								
								examples/spacy/requirements.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								examples/spacy/requirements.txt
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,2 @@
 | 
			
		||||
unstructured[local-inference]
 | 
			
		||||
spacy
 | 
			
		||||
@ -16,8 +16,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
 | 
			
		||||
 | 
			
		||||
set +e
 | 
			
		||||
 | 
			
		||||
if [ "$(find 'api-ingest-output' -type f -printf '.' | wc -c)" != 4 ]; then
 | 
			
		||||
if [ "$(find 'api-ingest-output' -type f -printf '.' | wc -c)" != 5 ]; then
 | 
			
		||||
   echo
 | 
			
		||||
   echo "4 files should have been created."
 | 
			
		||||
   echo "5 files should have been created."
 | 
			
		||||
   exit 1
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user