unstructured/examples/arxiv-topic-modelling/topic_model.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Using [Unstructured.io](https://www.unstructured.io/) to process arXiv Papers and Perform Topic Modelling! "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Import General Use Packages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import arxiv  # Interact with arXiv api to scrape papers\n",
    "from sentence_transformers import (\n",
    "    SentenceTransformer,\n",
    ")  # Use Hugging Face Embedding for Topic Modelling\n",
    "from bertopic import BERTopic  # Package for Topic Modelling\n",
    "from tqdm import tqdm  # Progress Bar When Iterating\n",
    "import glob  # Identify Files in Directory\n",
    "import os  # Delete Files in Directory\n",
    "import pandas as pd  # Dataframe Manipulation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Import [Unstructured](https://unstructured-io.github.io/unstructured/installing.html) Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from unstructured.partition.auto import partition  # Base Function to Partition PDF\n",
    "from unstructured.staging.base import (\n",
    "    convert_to_dict,\n",
    ")  # Convert List Unstructured Elements Into List of Dicts for Easy Parsing\n",
    "from unstructured.cleaners.core import (\n",
    "    clean,\n",
    "    remove_punctuation,\n",
    "    clean_non_ascii_chars,\n",
    ")  # Cleaning Functions\n",
    "import re  # Create Custom Cleaning Function\n",
    "import nltk  # Toolkit for more advanced pre-processing\n",
    "from nltk.corpus import stopwords  # list of stopwords to remove\n",
    "from typing import List  # Type Hinting"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Setup NLTK"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package stopwords to\n",
      "[nltk_data]     /Users/pravinsanthanam/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nltk.download(\"stopwords\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create Function to Extract PDFs About Machine Learning from arXiv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_arxiv_paper_texts(query: str, max_results: int = 100) -> List[str]:\n",
    "    \"\"\"Function to Use arXiv API to Fetch Papers Related to Query, Download and Pre-Process\n",
    "\n",
    "    Args:\n",
    "        query (str): query for arXiv API\n",
    "        max_results (int, optional): Number of Papers to get back. Defaults to 100.\n",
    "\n",
    "    Returns:\n",
    "        paper_texts (list[str]): Return list of narrative texts for each paper\n",
    "    \"\"\"\n",
    "    # Get List of Arxiv Papers Matching Our Query\n",
    "    arxiv_papers = list(\n",
    "        arxiv.Search(\n",
    "            query=query,\n",
    "            max_results=max_results,\n",
    "            sort_by=arxiv.SortCriterion.Relevance,\n",
    "            sort_order=arxiv.SortOrder.Descending,\n",
    "        ).results()\n",
    "    )\n",
    "\n",
    "    # Loop Through PDFs, Download and Pre-Process and Then Delete\n",
    "    paper_texts = []\n",
    "    for paper in tqdm(arxiv_papers):\n",
    "        paper.download_pdf()\n",
    "        pdf_file = glob.glob(\"*.pdf\")[0]\n",
    "        elements = partition(pdf_file)  # Partition PDF Using Unstructured\n",
    "        isd = convert_to_dict(elements)  # Convert List of Elements to List of Dictionaries\n",
    "        narrative_texts = [\n",
    "            element[\"text\"] for element in isd if element[\"type\"] == \"NarrativeText\"\n",
    "        ]  # Only Keep Narrative Text and Combine Into One String\n",
    "        os.remove(pdf_file)  # Delete PDF\n",
    "        paper_texts += narrative_texts\n",
    "    return paper_texts"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Run Scrape + PreProcess Function to Get List of Paper Text To Feed Through Topic Modelling Algorithm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 10/10 [04:59<00:00, 29.92s/it]\n"
     ]
    }
   ],
   "source": [
    "paper_texts = get_arxiv_paper_texts(query=\"natural language processing\", max_results=10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Run Narrative Texts Through Custom Cleaner Function Using Unstructured"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of Narrative Texts to Run Through Topic Modelling: 1711\n"
     ]
    }
   ],
   "source": [
    "# Stopwords to Remove\n",
    "stop_words = set(stopwords.words(\"english\"))\n",
    "\n",
    "\n",
    "# Function to Apply Whatever Cleaning Functionality to Each Narrative Text Element\n",
    "def custom_clean_function(narrative_text: str) -> str:\n",
    "    \"\"\"Apply Mix of Unstructured Cleaning Functions With Some Custom Functionality to Pre-Process Narrative Text\n",
    "\n",
    "    Args:\n",
    "        narrative_text (str): Narrative Text or Any Other Sentence\n",
    "\n",
    "    Returns:\n",
    "        cleaned_text (str): Text after going through all the cleaning procedures\n",
    "    \"\"\"\n",
    "    remove_numbers = lambda text: re.sub(\n",
    "        r\"\\d+\", \"\", text\n",
    "    )  # lambda function to remove all punctuation\n",
    "    cleaned_text = remove_numbers(narrative_text)  # Apply Custom Lambda\n",
    "    cleaned_text = clean(\n",
    "        cleaned_text,\n",
    "        extra_whitespace=True,\n",
    "        dashes=True,\n",
    "        bullets=True,\n",
    "        trailing_punctuation=True,\n",
    "        lowercase=True,\n",
    "    )  # Apply Basic Clean Function With All the Options\n",
    "    cleaned_text = remove_punctuation(cleaned_text)  # Remove all punctuation\n",
    "    cleaned_text = \" \".join(\n",
    "        [word for word in cleaned_text.split() if word not in stop_words]\n",
    "    )  # remove stop words\n",
    "    return cleaned_text\n",
    "\n",
    "\n",
    "# Apply Function to Paper Texts\n",
    "cleaned_paper_texts = [custom_clean_function(text) for text in paper_texts]\n",
    "\n",
    "# Count Narratve Texts\n",
    "print(\n",
    "    \"Number of Narrative Texts to Run Through Topic Modelling: {}\".format(len(cleaned_paper_texts))\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Setup [BerTopic](https://maartengr.github.io/BERTopic/index.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Choose Which Hugging Face Model You Want to Use\n",
    "sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n",
    "\n",
    "# Initialize Model\n",
    "topic_model = BERTopic(embedding_model=sentence_model, top_n_words=10, nr_topics=10, verbose=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Run Document Text Through Topic Model To Get Major Topics Discussed in Narrative Texts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a6ebe3cb185049bd8d37742f2451cbe0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Batches:   0%|          | 0/54 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2023-04-14 14:27:29,129 - BERTopic - Transformed documents to Embeddings\n",
      "2023-04-14 14:27:33,621 - BERTopic - Reduced dimensionality\n",
      "2023-04-14 14:27:33,647 - BERTopic - Clustered reduced embeddings\n",
      "2023-04-14 14:27:34,255 - BERTopic - Reduced number of topics from 32 to 10\n"
     ]
    }
   ],
   "source": [
    "# Fit Topic Model and Transform List of Paper Narrative Texts Into Topic and Probabilities\n",
    "topic_model.fit(cleaned_paper_texts)\n",
    "\n",
    "# Store Document-Topic Info\n",
    "doc_topic_info = topic_model.get_document_info(cleaned_paper_texts)\n",
    "\n",
    "# Store Topic Info\n",
    "topic_info = pd.DataFrame(topic_model.get_topics())\n",
    "topic_info = topic_info.applymap(lambda x: x[0])\n",
    "topic_info.columns = [\"topic_{}\".format(col + 1) for col in topic_info.columns]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Checkout Keywords for Each Topic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>topic_0</th>\n",
       "      <th>topic_1</th>\n",
       "      <th>topic_2</th>\n",
       "      <th>topic_3</th>\n",
       "      <th>topic_4</th>\n",
       "      <th>topic_5</th>\n",
       "      <th>topic_6</th>\n",
       "      <th>topic_7</th>\n",
       "      <th>topic_8</th>\n",
       "      <th>topic_9</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>neural</td>\n",
       "      <td>language</td>\n",
       "      <td>state</td>\n",
       "      <td>function</td>\n",
       "      <td>cost</td>\n",
       "      <td>publication</td>\n",
       "      <td>graph</td>\n",
       "      <td>llama</td>\n",
       "      <td>tangkhul</td>\n",
       "      <td>want</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>network</td>\n",
       "      <td>natural</td>\n",
       "      <td>rnn</td>\n",
       "      <td>distribution</td>\n",
       "      <td>function</td>\n",
       "      <td>april</td>\n",
       "      <td>computation</td>\n",
       "      <td>like</td>\n",
       "      <td>compound</td>\n",
       "      <td>edu</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>function</td>\n",
       "      <td>model</td>\n",
       "      <td>memory</td>\n",
       "      <td>output</td>\n",
       "      <td>sgd</td>\n",
       "      <td>syst</td>\n",
       "      <td>node</td>\n",
       "      <td>south</td>\n",
       "      <td>root</td>\n",
       "      <td>dsontagcoursesinferenceslidespseudolikelihoodn...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>networks</td>\n",
       "      <td>word</td>\n",
       "      <td>vector</td>\n",
       "      <td>class</td>\n",
       "      <td>training</td>\n",
       "      <td>technol</td>\n",
       "      <td>nodes</td>\n",
       "      <td>animal</td>\n",
       "      <td>morphological</td>\n",
       "      <td>regardlessly</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>one</td>\n",
       "      <td>planning</td>\n",
       "      <td>input</td>\n",
       "      <td>tanh</td>\n",
       "      <td>expected</td>\n",
       "      <td>date</td>\n",
       "      <td>backward</td>\n",
       "      <td>america</td>\n",
       "      <td>verbs</td>\n",
       "      <td>satisfied</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>input</td>\n",
       "      <td>words</td>\n",
       "      <td>network</td>\n",
       "      <td>data</td>\n",
       "      <td>optimization</td>\n",
       "      <td>vol</td>\n",
       "      <td>function</td>\n",
       "      <td>translation</td>\n",
       "      <td>noun</td>\n",
       "      <td>november</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>vector</td>\n",
       "      <td>based</td>\n",
       "      <td>recurrent</td>\n",
       "      <td>yˆ</td>\n",
       "      <td>algorithm</td>\n",
       "      <td>intell</td>\n",
       "      <td>backpropagation</td>\n",
       "      <td>french</td>\n",
       "      <td>roots</td>\n",
       "      <td>tune</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>language</td>\n",
       "      <td>processing</td>\n",
       "      <td>sequence</td>\n",
       "      <td>loss</td>\n",
       "      <td>set</td>\n",
       "      <td>acm</td>\n",
       "      <td>algorithm</td>\n",
       "      <td>cute</td>\n",
       "      <td>adjectives</td>\n",
       "      <td>return</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>model</td>\n",
       "      <td>models</td>\n",
       "      <td>neural</td>\n",
       "      <td>activation</td>\n",
       "      <td>validation</td>\n",
       "      <td>article</td>\n",
       "      <td>parameters</td>\n",
       "      <td>google</td>\n",
       "      <td>formation</td>\n",
       "      <td>fully</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>training</td>\n",
       "      <td>data</td>\n",
       "      <td>lstm</td>\n",
       "      <td>softmax</td>\n",
       "      <td>rate</td>\n",
       "      <td>trans</td>\n",
       "      <td>output</td>\n",
       "      <td>domesticated</td>\n",
       "      <td>language</td>\n",
       "      <td>results</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    topic_0     topic_1    topic_2       topic_3       topic_4      topic_5  \\\n",
       "0    neural    language      state      function          cost  publication   \n",
       "1   network     natural        rnn  distribution      function        april   \n",
       "2  function       model     memory        output           sgd         syst   \n",
       "3  networks        word     vector         class      training      technol   \n",
       "4       one    planning      input          tanh      expected         date   \n",
       "5     input       words    network          data  optimization          vol   \n",
       "6    vector       based  recurrent            yˆ     algorithm       intell   \n",
       "7  language  processing   sequence          loss           set          acm   \n",
       "8     model      models     neural    activation    validation      article   \n",
       "9  training        data       lstm       softmax          rate        trans   \n",
       "\n",
       "           topic_6       topic_7        topic_8  \\\n",
       "0            graph         llama       tangkhul   \n",
       "1      computation          like       compound   \n",
       "2             node         south           root   \n",
       "3            nodes        animal  morphological   \n",
       "4         backward       america          verbs   \n",
       "5         function   translation           noun   \n",
       "6  backpropagation        french          roots   \n",
       "7        algorithm          cute     adjectives   \n",
       "8       parameters        google      formation   \n",
       "9           output  domesticated       language   \n",
       "\n",
       "                                             topic_9  \n",
       "0                                               want  \n",
       "1                                                edu  \n",
       "2  dsontagcoursesinferenceslidespseudolikelihoodn...  \n",
       "3                                       regardlessly  \n",
       "4                                          satisfied  \n",
       "5                                           november  \n",
       "6                                               tune  \n",
       "7                                             return  \n",
       "8                                              fully  \n",
       "9                                            results  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(topic_info)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Visualize Topics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.plotly.v1+json": {
       "config": {
        "plotlyServerURL": "https://plot.ly"
       },
       "data": [
        {
         "customdata": [
          [
           0,
           "language | natural | model | word | planning",
           723
          ],
          [
           1,
           "state | rnn | memory | vector | input",
           198
          ],
          [
           2,
           "function | distribution | output | class | tanh",
           122
          ],
          [
           3,
           "cost | function | sgd | training | expected",
           61
          ],
          [
           4,
           "publication | april | syst | technol | date",
           57
          ],
          [
           5,
           "graph | computation | node | nodes | backward",
           46
          ],
          [
           6,
           "llama | like | south | animal | america",
           29
          ],
          [
           7,
           "tangkhul | compound | root | morphological | verbs",
           17
          ],
          [
           8,
           "want | edu | dsontagcoursesinferenceslidespseudolikelihoodnotespdf | regardlessly | satisfied",
           13
          ]
         ],
         "hovertemplate": "<b>Topic %{customdata[0]}</b><br>%{customdata[1]}<br>Size: %{customdata[2]}",
         "legendgroup": "",
         "marker": {
          "color": "#B0BEC5",
          "line": {
           "color": "DarkSlateGrey",
           "width": 2
          },
          "size": [
           723,
           198,
           122,
           61,
           57,
           46,
           29,
           17,
           13
          ],
          "sizemode": "area",
          "sizeref": 0.451875,
          "symbol": "circle"
         },
         "mode": "markers",
         "name": "",
         "orientation": "v",
         "showlegend": false,
         "type": "scatter",
         "x": [
          14.759990692138672,
          14.329012870788574,
          10.99558162689209,
          9.891719818115234,
          11.191701889038086,
          9.449606895446777,
          11.662773132324219,
          14.039092063903809,
          12.023329734802246
         ],
         "xaxis": "x",
         "y": [
          1.6729466915130615,
          2.2927768230438232,
          5.36309289932251,
          5.59792423248291,
          4.721500873565674,
          5.3096089363098145,
          5.3371052742004395,
          1.8039934635162354,
          4.149565696716309
         ],
         "yaxis": "y"
        }
       ],
       "layout": {
        "annotations": [
         {
          "showarrow": false,
          "text": "D1",
          "x": 8.03216586112976,
          "y": 3.929808777570724,
          "yshift": 10
         },
         {
          "showarrow": false,
          "text": "D2",
          "x": 12.503077578544616,
          "xshift": 10,
          "y": 6.437612867355346
         }
        ],
        "height": 650,
        "hoverlabel": {
         "bgcolor": "white",
         "font": {
          "family": "Rockwell",
          "size": 16
         }
        },
        "legend": {
         "itemsizing": "constant",
         "tracegroupgap": 0
        },
        "margin": {
         "t": 60
        },
        "shapes": [
         {
          "line": {
           "color": "#CFD8DC",
           "width": 2
          },
          "type": "line",
          "x0": 12.503077578544616,
          "x1": 12.503077578544616,
          "y0": 1.4220046877861023,
          "y1": 6.437612867355346
         },
         {
          "line": {
           "color": "#9E9E9E",
           "width": 2
          },
          "type": "line",
          "x0": 8.03216586112976,
          "x1": 16.973989295959473,
          "y0": 3.929808777570724,
          "y1": 3.929808777570724
         }
        ],
        "sliders": [
         {
          "active": 0,
          "pad": {
           "t": 50
          },
          "steps": [
           {
            "args": [
             {
              "marker.color": [
               [
                "red",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5"
               ]
              ]
             }
            ],
            "label": "Topic 0",
            "method": "update"
           },
           {
            "args": [
             {
              "marker.color": [
               [
                "#B0BEC5",
                "red",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5"
               ]
              ]
             }
            ],
            "label": "Topic 1",
            "method": "update"
           },
           {
            "args": [
             {
              "marker.color": [
               [
                "#B0BEC5",
                "#B0BEC5",
                "red",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5"
               ]
              ]
             }
            ],
            "label": "Topic 2",
            "method": "update"
           },
           {
            "args": [
             {
              "marker.color": [
               [
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "red",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5"
               ]
              ]
             }
            ],
            "label": "Topic 3",
            "method": "update"
           },
           {
            "args": [
             {
              "marker.color": [
               [
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "red",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5"
               ]
              ]
             }
            ],
            "label": "Topic 4",
            "method": "update"
           },
           {
            "args": [
             {
              "marker.color": [
               [
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "red",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5"
               ]
              ]
             }
            ],
            "label": "Topic 5",
            "method": "update"
           },
           {
            "args": [
             {
              "marker.color": [
               [
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "red",
                "#B0BEC5",
                "#B0BEC5"
               ]
              ]
             }
            ],
            "label": "Topic 6",
            "method": "update"
           },
           {
            "args": [
             {
              "marker.color": [
               [
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "red",
                "#B0BEC5"
               ]
              ]
             }
            ],
            "label": "Topic 7",
            "method": "update"
           },
           {
            "args": [
             {
              "marker.color": [
               [
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "red"
               ]
              ]
             }
            ],
            "label": "Topic 8",
            "method": "update"
           }
          ]
         }
        ],
        "template": {
         "data": {
          "bar": [
           {
            "error_x": {
             "color": "rgb(36,36,36)"
            },
            "error_y": {
             "color": "rgb(36,36,36)"
            },
            "marker": {
             "line": {
              "color": "white",
              "width": 0.5
             },
             "pattern": {
              "fillmode": "overlay",
              "size": 10,
              "solidity": 0.2
             }
            },
            "type": "bar"
           }
          ],
          "barpolar": [
           {
            "marker": {
             "line": {
              "color": "white",
              "width": 0.5
             },
             "pattern": {
              "fillmode": "overlay",
              "size": 10,
              "solidity": 0.2
             }
            },
            "type": "barpolar"
           }
          ],
          "carpet": [
           {
            "aaxis": {
             "endlinecolor": "rgb(36,36,36)",
             "gridcolor": "white",
             "linecolor": "white",
             "minorgridcolor": "white",
             "startlinecolor": "rgb(36,36,36)"
            },
            "baxis": {
             "endlinecolor": "rgb(36,36,36)",
             "gridcolor": "white",
             "linecolor": "white",
             "minorgridcolor": "white",
             "startlinecolor": "rgb(36,36,36)"
            },
            "type": "carpet"
           }
          ],
          "choropleth": [
           {
            "colorbar": {
             "outlinewidth": 1,
             "tickcolor": "rgb(36,36,36)",
             "ticks": "outside"
            },
            "type": "choropleth"
           }
          ],
          "contour": [
           {
            "colorbar": {
             "outlinewidth": 1,
             "tickcolor": "rgb(36,36,36)",
             "ticks": "outside"
            },
            "colorscale": [
             [
              0,
              "#440154"
             ],
             [
              0.1111111111111111,
              "#482878"
             ],
             [
              0.2222222222222222,
              "#3e4989"
             ],
             [
              0.3333333333333333,
              "#31688e"
             ],
             [
              0.4444444444444444,
              "#26828e"
             ],
             [
              0.5555555555555556,
              "#1f9e89"
             ],
             [
              0.6666666666666666,
              "#35b779"
             ],
             [
              0.7777777777777778,
              "#6ece58"
             ],
             [
              0.8888888888888888,
              "#b5de2b"
             ],
             [
              1,
              "#fde725"
             ]
            ],
            "type": "contour"
           }
          ],
          "contourcarpet": [
           {
            "colorbar": {
             "outlinewidth": 1,
             "tickcolor": "rgb(36,36,36)",
             "ticks": "outside"
            },
            "type": "contourcarpet"
           }
          ],
          "heatmap": [
           {
            "colorbar": {
             "outlinewidth": 1,
             "tickcolor": "rgb(36,36,36)",
             "ticks": "outside"
            },
            "colorscale": [
             [
              0,
              "#440154"
             ],
             [
              0.1111111111111111,
              "#482878"
             ],
             [
              0.2222222222222222,
              "#3e4989"
             ],
             [
              0.3333333333333333,
              "#31688e"
             ],
             [
              0.4444444444444444,
              "#26828e"
             ],
             [
              0.5555555555555556,
              "#1f9e89"
             ],
             [
              0.6666666666666666,
              "#35b779"
             ],
             [
              0.7777777777777778,
              "#6ece58"
             ],
             [
              0.8888888888888888,
              "#b5de2b"
             ],
             [
              1,
              "#fde725"
             ]
            ],
            "type": "heatmap"
           }
          ],
          "heatmapgl": [
           {
            "colorbar": {
             "outlinewidth": 1,
             "tickcolor": "rgb(36,36,36)",
             "ticks": "outside"
            },
            "colorscale": [
             [
              0,
              "#440154"
             ],
             [
              0.1111111111111111,
              "#482878"
             ],
             [
              0.2222222222222222,
              "#3e4989"
             ],
             [
              0.3333333333333333,
              "#31688e"
             ],
             [
              0.4444444444444444,
              "#26828e"
             ],
             [
              0.5555555555555556,
              "#1f9e89"
             ],
             [
              0.6666666666666666,
              "#35b779"
             ],
             [
              0.7777777777777778,
              "#6ece58"
             ],
             [
              0.8888888888888888,
              "#b5de2b"
             ],
             [
              1,
              "#fde725"
             ]
            ],
            "type": "heatmapgl"
           }
          ],
          "histogram": [
           {
            "marker": {
             "line": {
              "color": "white",
              "width": 0.6
             }
            },
            "type": "histogram"
           }
          ],
          "histogram2d": [
           {
            "colorbar": {
             "outlinewidth": 1,
             "tickcolor": "rgb(36,36,36)",
             "ticks": "outside"
            },
            "colorscale": [
             [
              0,
              "#440154"
             ],
             [
              0.1111111111111111,
              "#482878"
             ],
             [
              0.2222222222222222,
              "#3e4989"
             ],
             [
              0.3333333333333333,
              "#31688e"
             ],
             [
              0.4444444444444444,
              "#26828e"
             ],
             [
              0.5555555555555556,
              "#1f9e89"
             ],
             [
              0.6666666666666666,
              "#35b779"
             ],
             [
              0.7777777777777778,
              "#6ece58"
             ],
             [
              0.8888888888888888,
              "#b5de2b"
             ],
             [
              1,
              "#fde725"
             ]
            ],
            "type": "histogram2d"
           }
          ],
          "histogram2dcontour": [
           {
            "colorbar": {
             "outlinewidth": 1,
             "tickcolor": "rgb(36,36,36)",
             "ticks": "outside"
            },
            "colorscale": [
             [
              0,
              "#440154"
             ],
             [
              0.1111111111111111,
              "#482878"
             ],
             [
              0.2222222222222222,
              "#3e4989"
             ],
             [
              0.3333333333333333,
              "#31688e"
             ],
             [
              0.4444444444444444,
              "#26828e"
             ],
             [
              0.5555555555555556,
              "#1f9e89"
             ],
             [
              0.6666666666666666,
              "#35b779"
             ],
             [
              0.7777777777777778,
              "#6ece58"
             ],
             [
              0.8888888888888888,
              "#b5de2b"
             ],
             [
              1,
              "#fde725"
             ]
            ],
            "type": "histogram2dcontour"
           }
          ],
          "mesh3d": [
           {
            "colorbar": {
             "outlinewidth": 1,
             "tickcolor": "rgb(36,36,36)",
             "ticks": "outside"
            },
            "type": "mesh3d"
           }
          ],
          "parcoords": [
           {
            "line": {
             "colorbar": {
              "outlinewidth": 1,
              "tickcolor": "rgb(36,36,36)",
              "ticks": "outside"
             }
            },
            "type": "parcoords"
           }
          ],
          "pie": [
           {
            "automargin": true,
            "type": "pie"
           }
          ],
          "scatter": [
           {
            "fillpattern": {
             "fillmode": "overlay",
             "size": 10,
             "solidity": 0.2
            },
            "type": "scatter"
           }
          ],
          "scatter3d": [
           {
            "line": {
             "colorbar": {
              "outlinewidth": 1,
              "tickcolor": "rgb(36,36,36)",
              "ticks": "outside"
             }
            },
            "marker": {
             "colorbar": {
              "outlinewidth": 1,
              "tickcolor": "rgb(36,36,36)",
              "ticks": "outside"
             }
            },
            "type": "scatter3d"
           }
          ],
          "scattercarpet": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 1,
              "tickcolor": "rgb(36,36,36)",
              "ticks": "outside"
             }
            },
            "type": "scattercarpet"
           }
          ],
          "scattergeo": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 1,
              "tickcolor": "rgb(36,36,36)",
              "ticks": "outside"
             }
            },
            "type": "scattergeo"
           }
          ],
          "scattergl": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 1,
              "tickcolor": "rgb(36,36,36)",
              "ticks": "outside"
             }
            },
            "type": "scattergl"
           }
          ],
          "scattermapbox": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 1,
              "tickcolor": "rgb(36,36,36)",
              "ticks": "outside"
             }
            },
            "type": "scattermapbox"
           }
          ],
          "scatterpolar": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 1,
              "tickcolor": "rgb(36,36,36)",
              "ticks": "outside"
             }
            },
            "type": "scatterpolar"
           }
          ],
          "scatterpolargl": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 1,
              "tickcolor": "rgb(36,36,36)",
              "ticks": "outside"
             }
            },
            "type": "scatterpolargl"
           }
          ],
          "scatterternary": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 1,
              "tickcolor": "rgb(36,36,36)",
              "ticks": "outside"
             }
            },
            "type": "scatterternary"
           }
          ],
          "surface": [
           {
            "colorbar": {
             "outlinewidth": 1,
             "tickcolor": "rgb(36,36,36)",
             "ticks": "outside"
            },
            "colorscale": [
             [
              0,
              "#440154"
             ],
             [
              0.1111111111111111,
              "#482878"
             ],
             [
              0.2222222222222222,
              "#3e4989"
             ],
             [
              0.3333333333333333,
              "#31688e"
             ],
             [
              0.4444444444444444,
              "#26828e"
             ],
             [
              0.5555555555555556,
              "#1f9e89"
             ],
             [
              0.6666666666666666,
              "#35b779"
             ],
             [
              0.7777777777777778,
              "#6ece58"
             ],
             [
              0.8888888888888888,
              "#b5de2b"
             ],
             [
              1,
              "#fde725"
             ]
            ],
            "type": "surface"
           }
          ],
          "table": [
           {
            "cells": {
             "fill": {
              "color": "rgb(237,237,237)"
             },
             "line": {
              "color": "white"
             }
            },
            "header": {
             "fill": {
              "color": "rgb(217,217,217)"
             },
             "line": {
              "color": "white"
             }
            },
            "type": "table"
           }
          ]
         },
         "layout": {
          "annotationdefaults": {
           "arrowhead": 0,
           "arrowwidth": 1
          },
          "autotypenumbers": "strict",
          "coloraxis": {
           "colorbar": {
            "outlinewidth": 1,
            "tickcolor": "rgb(36,36,36)",
            "ticks": "outside"
           }
          },
          "colorscale": {
           "diverging": [
            [
             0,
             "rgb(103,0,31)"
            ],
            [
             0.1,
             "rgb(178,24,43)"
            ],
            [
             0.2,
             "rgb(214,96,77)"
            ],
            [
             0.3,
             "rgb(244,165,130)"
            ],
            [
             0.4,
             "rgb(253,219,199)"
            ],
            [
             0.5,
             "rgb(247,247,247)"
            ],
            [
             0.6,
             "rgb(209,229,240)"
            ],
            [
             0.7,
             "rgb(146,197,222)"
            ],
            [
             0.8,
             "rgb(67,147,195)"
            ],
            [
             0.9,
             "rgb(33,102,172)"
            ],
            [
             1,
             "rgb(5,48,97)"
            ]
           ],
           "sequential": [
            [
             0,
             "#440154"
            ],
            [
             0.1111111111111111,
             "#482878"
            ],
            [
             0.2222222222222222,
             "#3e4989"
            ],
            [
             0.3333333333333333,
             "#31688e"
            ],
            [
             0.4444444444444444,
             "#26828e"
            ],
            [
             0.5555555555555556,
             "#1f9e89"
            ],
            [
             0.6666666666666666,
             "#35b779"
            ],
            [
             0.7777777777777778,
             "#6ece58"
            ],
            [
             0.8888888888888888,
             "#b5de2b"
            ],
            [
             1,
             "#fde725"
            ]
           ],
           "sequentialminus": [
            [
             0,
             "#440154"
            ],
            [
             0.1111111111111111,
             "#482878"
            ],
            [
             0.2222222222222222,
             "#3e4989"
            ],
            [
             0.3333333333333333,
             "#31688e"
            ],
            [
             0.4444444444444444,
             "#26828e"
            ],
            [
             0.5555555555555556,
             "#1f9e89"
            ],
            [
             0.6666666666666666,
             "#35b779"
            ],
            [
             0.7777777777777778,
             "#6ece58"
            ],
            [
             0.8888888888888888,
             "#b5de2b"
            ],
            [
             1,
             "#fde725"
            ]
           ]
          },
          "colorway": [
           "#1F77B4",
           "#FF7F0E",
           "#2CA02C",
           "#D62728",
           "#9467BD",
           "#8C564B",
           "#E377C2",
           "#7F7F7F",
           "#BCBD22",
           "#17BECF"
          ],
          "font": {
           "color": "rgb(36,36,36)"
          },
          "geo": {
           "bgcolor": "white",
           "lakecolor": "white",
           "landcolor": "white",
           "showlakes": true,
           "showland": true,
           "subunitcolor": "white"
          },
          "hoverlabel": {
           "align": "left"
          },
          "hovermode": "closest",
          "mapbox": {
           "style": "light"
          },
          "paper_bgcolor": "white",
          "plot_bgcolor": "white",
          "polar": {
           "angularaxis": {
            "gridcolor": "rgb(232,232,232)",
            "linecolor": "rgb(36,36,36)",
            "showgrid": false,
            "showline": true,
            "ticks": "outside"
           },
           "bgcolor": "white",
           "radialaxis": {
            "gridcolor": "rgb(232,232,232)",
            "linecolor": "rgb(36,36,36)",
            "showgrid": false,
            "showline": true,
            "ticks": "outside"
           }
          },
          "scene": {
           "xaxis": {
            "backgroundcolor": "white",
            "gridcolor": "rgb(232,232,232)",
            "gridwidth": 2,
            "linecolor": "rgb(36,36,36)",
            "showbackground": true,
            "showgrid": false,
            "showline": true,
            "ticks": "outside",
            "zeroline": false,
            "zerolinecolor": "rgb(36,36,36)"
           },
           "yaxis": {
            "backgroundcolor": "white",
            "gridcolor": "rgb(232,232,232)",
            "gridwidth": 2,
            "linecolor": "rgb(36,36,36)",
            "showbackground": true,
            "showgrid": false,
            "showline": true,
            "ticks": "outside",
            "zeroline": false,
            "zerolinecolor": "rgb(36,36,36)"
           },
           "zaxis": {
            "backgroundcolor": "white",
            "gridcolor": "rgb(232,232,232)",
            "gridwidth": 2,
            "linecolor": "rgb(36,36,36)",
            "showbackground": true,
            "showgrid": false,
            "showline": true,
            "ticks": "outside",
            "zeroline": false,
            "zerolinecolor": "rgb(36,36,36)"
           }
          },
          "shapedefaults": {
           "fillcolor": "black",
           "line": {
            "width": 0
           },
           "opacity": 0.3
          },
          "ternary": {
           "aaxis": {
            "gridcolor": "rgb(232,232,232)",
            "linecolor": "rgb(36,36,36)",
            "showgrid": false,
            "showline": true,
            "ticks": "outside"
           },
           "baxis": {
            "gridcolor": "rgb(232,232,232)",
            "linecolor": "rgb(36,36,36)",
            "showgrid": false,
            "showline": true,
            "ticks": "outside"
           },
           "bgcolor": "white",
           "caxis": {
            "gridcolor": "rgb(232,232,232)",
            "linecolor": "rgb(36,36,36)",
            "showgrid": false,
            "showline": true,
            "ticks": "outside"
           }
          },
          "title": {
           "x": 0.05
          },
          "xaxis": {
           "automargin": true,
           "gridcolor": "rgb(232,232,232)",
           "linecolor": "rgb(36,36,36)",
           "showgrid": false,
           "showline": true,
           "ticks": "outside",
           "title": {
            "standoff": 15
           },
           "zeroline": false,
           "zerolinecolor": "rgb(36,36,36)"
          },
          "yaxis": {
           "automargin": true,
           "gridcolor": "rgb(232,232,232)",
           "linecolor": "rgb(36,36,36)",
           "showgrid": false,
           "showline": true,
           "ticks": "outside",
           "title": {
            "standoff": 15
           },
           "zeroline": false,
           "zerolinecolor": "rgb(36,36,36)"
          }
         }
        },
        "title": {
         "font": {
          "color": "Black",
          "size": 22
         },
         "text": "<b>Intertopic Distance Map</b>",
         "x": 0.5,
         "xanchor": "center",
         "y": 0.95,
         "yanchor": "top"
        },
        "width": 650,
        "xaxis": {
         "anchor": "y",
         "domain": [
          0,
          1
         ],
         "range": [
          8.03216586112976,
          16.973989295959473
         ],
         "title": {
          "text": ""
         },
         "visible": false
        },
        "yaxis": {
         "anchor": "x",
         "domain": [
          0,
          1
         ],
         "range": [
          1.4220046877861023,
          6.437612867355346
         ],
         "title": {
          "text": ""
         },
         "visible": false
        }
       }
      }
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "topic_model.visualize_topics()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
-												Went through this demo notebook with Matt. Decision was made to add it to our collection of examples for use later. (#484)


											
										
										
											2023-04-17 11:53:25 -04:00
+								{
 								 "cells": [
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "# Using [Unstructured.io](https://www.unstructured.io/) to process arXiv Papers and Perform Topic Modelling! "
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "### Import General Use Packages"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 1,
 								   "metadata": {},
 								   "outputs": [],
 								   "source": [
-												Chore (refactor): support table extraction with pre-computed ocr data (#1801)

### Summary

Table OCR refactor, move the OCR part for table model in inference repo
to unst repo.
* Before this PR, table model extracts OCR tokens with texts and
bounding box and fills the tokens to the table structure in inference
repo. This means we need to do an additional OCR for tables.
* After this PR, we use the OCR data from entire page OCR and pass the
OCR tokens to inference repo, which means we only do one OCR for the
entire document.

**Tech details:**
* Combined env `ENTIRE_PAGE_OCR` and `TABLE_OCR` to `OCR_AGENT`, this
means we use the same OCR agent for entire page and tables since we only
do one OCR.
* Bump inference repo to `0.7.9`, which allow table model in inference
to use pre-computed OCR data from unst repo. Please check in
[PR](https://github.com/Unstructured-IO/unstructured-inference/pull/256).
* All notebooks lint are made by `make tidy`
* This PR also fixes
[issue](https://github.com/Unstructured-IO/unstructured/issues/1564),
I've added test for the issue in
`test_pdf.py::test_partition_pdf_hi_table_extraction_with_languages`
* Add same scaling logic to image [similar to previous Table
OCR](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L109C1-L113),
but now scaling is applied to entire image

### Test
* Not much to test manually, except that table extraction still works
* But due to the change in scaling and the use of pre-computed OCR data
from the entire page, there are some slight (better) changes in table
output; here is a comparison of test outputs I found from the same test
`test_partition_image_with_table_extraction`:

screen shot for table in `layout-parser-paper-with-table.jpg`:
<img width="343" alt="expected"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/278d7665-d212-433d-9a05-872c4502725c">
before refactor:
<img width="709" alt="before"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/347fbc3b-f52b-45b5-97e9-6f633eaa0d5e">
after refactor:
<img width="705" alt="after"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/b3cbd809-cf67-4e75-945a-5cbd06b33b2d">

### TODO
(added as a ticket) Still have some cleanup to do in the inference repo
since the unst repo now has duplicate logic, but we can keep it as a
fallback plan. If we want to remove anything OCR-related in inference,
here are the items that are deprecated and can be removed:
*
[`get_tokens`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L77)
(already noted in code)
* parameter `extract_tables` in inference
*
[`interpret_table_block`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/inference/layoutelement.py#L88)
*
[`load_agent`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L197)
* env `TABLE_OCR` 

### Note
if we want to fallback for an additional table OCR (may need this for
using paddle for table), we need to:
* pass `infer_table_structure` to inference with `extract_tables`
parameter
* stop passing `infer_table_structure` to `ocr.py`

---------

Co-authored-by: Yao You <yao@unstructured.io>
											
										
										
											2023-10-20 20:24:23 -04:00
+								    "import arxiv  # Interact with arXiv api to scrape papers\n",
 								    "from sentence_transformers import (\n",
 								    "    SentenceTransformer,\n",
 								    ")  # Use Hugging Face Embedding for Topic Modelling\n",
 								    "from bertopic import BERTopic  # Package for Topic Modelling\n",
 								    "from tqdm import tqdm  # Progress Bar When Iterating\n",
 								    "import glob  # Identify Files in Directory\n",
 								    "import os  # Delete Files in Directory\n",
 								    "import pandas as pd  # Dataframe Manipulation"
-												Went through this demo notebook with Matt. Decision was made to add it to our collection of examples for use later. (#484)


											
										
										
											2023-04-17 11:53:25 -04:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
-												docs: no more bricks (#1967)

### Summary

We no longer use the "bricks" terminology for partitioning functions, etc.
in the library. This PR updates various references to bricks within the
repo and the docs. This is just an initial pass to swap the terminology
out; it'll likely be helpful to reorganize the docs a bit as well.

---------

Co-authored-by: qued <64741807+qued@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
											
										
										
											2023-11-02 10:43:26 -04:00
+								    "### Import [Unstructured](https://unstructured-io.github.io/unstructured/installing.html) Functions"
-												Went through this demo notebook with Matt. Decision was made to add it to our collection of examples for use later. (#484)


											
										
										
											2023-04-17 11:53:25 -04:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 2,
 								   "metadata": {},
 								   "outputs": [],
 								   "source": [
-												Chore (refactor): support table extraction with pre-computed ocr data (#1801)

### Summary

Table OCR refactor, move the OCR part for table model in inference repo
to unst repo.
* Before this PR, table model extracts OCR tokens with texts and
bounding box and fills the tokens to the table structure in inference
repo. This means we need to do an additional OCR for tables.
* After this PR, we use the OCR data from entire page OCR and pass the
OCR tokens to inference repo, which means we only do one OCR for the
entire document.

**Tech details:**
* Combined env `ENTIRE_PAGE_OCR` and `TABLE_OCR` to `OCR_AGENT`, this
means we use the same OCR agent for entire page and tables since we only
do one OCR.
* Bump inference repo to `0.7.9`, which allow table model in inference
to use pre-computed OCR data from unst repo. Please check in
[PR](https://github.com/Unstructured-IO/unstructured-inference/pull/256).
* All notebooks lint are made by `make tidy`
* This PR also fixes
[issue](https://github.com/Unstructured-IO/unstructured/issues/1564),
I've added test for the issue in
`test_pdf.py::test_partition_pdf_hi_table_extraction_with_languages`
* Add same scaling logic to image [similar to previous Table
OCR](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L109C1-L113),
but now scaling is applied to entire image

### Test
* Not much to test manually, except that table extraction still works
* But due to the change in scaling and the use of pre-computed OCR data
from the entire page, there are some slight (better) changes in table
output; here is a comparison of test outputs I found from the same test
`test_partition_image_with_table_extraction`:

screen shot for table in `layout-parser-paper-with-table.jpg`:
<img width="343" alt="expected"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/278d7665-d212-433d-9a05-872c4502725c">
before refactor:
<img width="709" alt="before"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/347fbc3b-f52b-45b5-97e9-6f633eaa0d5e">
after refactor:
<img width="705" alt="after"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/b3cbd809-cf67-4e75-945a-5cbd06b33b2d">

### TODO
(added as a ticket) Still have some cleanup to do in the inference repo
since the unst repo now has duplicate logic, but we can keep it as a
fallback plan. If we want to remove anything OCR-related in inference,
here are the items that are deprecated and can be removed:
*
[`get_tokens`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L77)
(already noted in code)
* parameter `extract_tables` in inference
*
[`interpret_table_block`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/inference/layoutelement.py#L88)
*
[`load_agent`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L197)
* env `TABLE_OCR` 

### Note
if we want to fallback for an additional table OCR (may need this for
using paddle for table), we need to:
* pass `infer_table_structure` to inference with `extract_tables`
parameter
* stop passing `infer_table_structure` to `ocr.py`

---------

Co-authored-by: Yao You <yao@unstructured.io>
											
										
										
											2023-10-20 20:24:23 -04:00
+								    "from unstructured.partition.auto import partition  # Base Function to Partition PDF\n",
 								    "from unstructured.staging.base import (\n",
 								    "    convert_to_dict,\n",
 								    ")  # Convert List Unstructured Elements Into List of Dicts for Easy Parsing\n",
 								    "from unstructured.cleaners.core import (\n",
 								    "    clean,\n",
 								    "    remove_punctuation,\n",
 								    "    clean_non_ascii_chars,\n",
-												docs: no more bricks (#1967)

### Summary

We no longer use the "bricks" terminology for partitioning functions, etc.
in the library. This PR updates various references to bricks within the
repo and the docs. This is just an initial pass to swap the terminology
out; it'll likely be helpful to reorganize the docs a bit as well.

---------

Co-authored-by: qued <64741807+qued@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
											
										
										
											2023-11-02 10:43:26 -04:00
+								    ")  # Cleaning Functions\n",
 								    "import re  # Create Custom Cleaning Function\n",
-												Chore (refactor): support table extraction with pre-computed ocr data (#1801)

### Summary

Table OCR refactor, move the OCR part for table model in inference repo
to unst repo.
* Before this PR, table model extracts OCR tokens with texts and
bounding box and fills the tokens to the table structure in inference
repo. This means we need to do an additional OCR for tables.
* After this PR, we use the OCR data from entire page OCR and pass the
OCR tokens to inference repo, which means we only do one OCR for the
entire document.

**Tech details:**
* Combined env `ENTIRE_PAGE_OCR` and `TABLE_OCR` to `OCR_AGENT`, this
means we use the same OCR agent for entire page and tables since we only
do one OCR.
* Bump inference repo to `0.7.9`, which allow table model in inference
to use pre-computed OCR data from unst repo. Please check in
[PR](https://github.com/Unstructured-IO/unstructured-inference/pull/256).
* All notebooks lint are made by `make tidy`
* This PR also fixes
[issue](https://github.com/Unstructured-IO/unstructured/issues/1564),
I've added test for the issue in
`test_pdf.py::test_partition_pdf_hi_table_extraction_with_languages`
* Add same scaling logic to image [similar to previous Table
OCR](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L109C1-L113),
but now scaling is applied to entire image

### Test
* Not much to test manually, except that table extraction still works
* But due to the change in scaling and the use of pre-computed OCR data
from the entire page, there are some slight (better) changes in table
output; here is a comparison of test outputs I found from the same test
`test_partition_image_with_table_extraction`:

screen shot for table in `layout-parser-paper-with-table.jpg`:
<img width="343" alt="expected"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/278d7665-d212-433d-9a05-872c4502725c">
before refactor:
<img width="709" alt="before"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/347fbc3b-f52b-45b5-97e9-6f633eaa0d5e">
after refactor:
<img width="705" alt="after"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/b3cbd809-cf67-4e75-945a-5cbd06b33b2d">

### TODO
(added as a ticket) Still have some cleanup to do in the inference repo
since the unst repo now has duplicate logic, but we can keep it as a
fallback plan. If we want to remove anything OCR-related in inference,
here are the items that are deprecated and can be removed:
*
[`get_tokens`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L77)
(already noted in code)
* parameter `extract_tables` in inference
*
[`interpret_table_block`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/inference/layoutelement.py#L88)
*
[`load_agent`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L197)
* env `TABLE_OCR` 

### Note
if we want to fallback for an additional table OCR (may need this for
using paddle for table), we need to:
* pass `infer_table_structure` to inference with `extract_tables`
parameter
* stop passing `infer_table_structure` to `ocr.py`

---------

Co-authored-by: Yao You <yao@unstructured.io>
											
										
										
											2023-10-20 20:24:23 -04:00
+								    "import nltk  # Toolkit for more advanced pre-processing\n",
 								    "from nltk.corpus import stopwords  # list of stopwords to remove\n",
 								    "from typing import List  # Type Hinting"
-												Went through this demo notebook with Matt. Decision was made to add it to our collection of examples for use later. (#484)


											
										
										
											2023-04-17 11:53:25 -04:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "### Setup NLTK"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 3,
 								   "metadata": {},
 								   "outputs": [
 								    {
 								     "name": "stderr",
 								     "output_type": "stream",
 								     "text": [
 								      "[nltk_data] Downloading package stopwords to\n",
 								      "[nltk_data]     /Users/pravinsanthanam/nltk_data...\n",
 								      "[nltk_data]   Package stopwords is already up-to-date!\n"
 								     ]
 								    },
 								    {
 								     "data": {
 								      "text/plain": [
 								       "True"
 								      ]
 								     },
 								     "execution_count": 3,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
 								   "source": [
-												Chore (refactor): support table extraction with pre-computed ocr data (#1801)

### Summary

Table OCR refactor, move the OCR part for table model in inference repo
to unst repo.
* Before this PR, table model extracts OCR tokens with texts and
bounding box and fills the tokens to the table structure in inference
repo. This means we need to do an additional OCR for tables.
* After this PR, we use the OCR data from entire page OCR and pass the
OCR tokens to inference repo, which means we only do one OCR for the
entire document.

**Tech details:**
* Combined env `ENTIRE_PAGE_OCR` and `TABLE_OCR` to `OCR_AGENT`, this
means we use the same OCR agent for entire page and tables since we only
do one OCR.
* Bump inference repo to `0.7.9`, which allow table model in inference
to use pre-computed OCR data from unst repo. Please check in
[PR](https://github.com/Unstructured-IO/unstructured-inference/pull/256).
* All notebooks lint are made by `make tidy`
* This PR also fixes
[issue](https://github.com/Unstructured-IO/unstructured/issues/1564),
I've added test for the issue in
`test_pdf.py::test_partition_pdf_hi_table_extraction_with_languages`
* Add same scaling logic to image [similar to previous Table
OCR](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L109C1-L113),
but now scaling is applied to entire image

### Test
* Not much to test manually, except that table extraction still works
* But due to the change in scaling and the use of pre-computed OCR data
from the entire page, there are some slight (better) changes in table
output; here is a comparison of test outputs I found from the same test
`test_partition_image_with_table_extraction`:

screen shot for table in `layout-parser-paper-with-table.jpg`:
<img width="343" alt="expected"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/278d7665-d212-433d-9a05-872c4502725c">
before refactor:
<img width="709" alt="before"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/347fbc3b-f52b-45b5-97e9-6f633eaa0d5e">
after refactor:
<img width="705" alt="after"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/b3cbd809-cf67-4e75-945a-5cbd06b33b2d">

### TODO
(added as a ticket) Still have some cleanup to do in the inference repo
since the unst repo now has duplicate logic, but we can keep it as a
fallback plan. If we want to remove anything OCR-related in inference,
here are the items that are deprecated and can be removed:
*
[`get_tokens`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L77)
(already noted in code)
* parameter `extract_tables` in inference
*
[`interpret_table_block`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/inference/layoutelement.py#L88)
*
[`load_agent`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L197)
* env `TABLE_OCR` 

### Note
if we want to fallback for an additional table OCR (may need this for
using paddle for table), we need to:
* pass `infer_table_structure` to inference with `extract_tables`
parameter
* stop passing `infer_table_structure` to `ocr.py`

---------

Co-authored-by: Yao You <yao@unstructured.io>
											
										
										
											2023-10-20 20:24:23 -04:00
+								    "nltk.download(\"stopwords\")"
-												Went through this demo notebook with Matt. Decision was made to add it to our collection of examples for use later. (#484)


											
										
										
											2023-04-17 11:53:25 -04:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "### Create Function to Extract PDFs About Machine Learning from arXiv"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 6,
 								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "def get_arxiv_paper_texts(query: str, max_results: int = 100) -> List[str]:\n",
 								    "    \"\"\"Function to Use arXiv API to Fetch Papers Related to Query, Download and Pre-Process\n",
 								    "\n",
 								    "    Args:\n",
 								    "        query (str): query for arXiv API\n",
 								    "        max_results (int, optional): Number of Papers to get back. Defaults to 100.\n",
 								    "\n",
 								    "    Returns:\n",
 								    "        paper_texts (list[str]): Return list of narrative texts for each paper\n",
 								    "    \"\"\"\n",
-												Chore (refactor): support table extraction with pre-computed ocr data (#1801)

### Summary

Table OCR refactor, move the OCR part for table model in inference repo
to unst repo.
* Before this PR, table model extracts OCR tokens with texts and
bounding box and fills the tokens to the table structure in inference
repo. This means we need to do an additional OCR for tables.
* After this PR, we use the OCR data from entire page OCR and pass the
OCR tokens to inference repo, which means we only do one OCR for the
entire document.

**Tech details:**
* Combined env `ENTIRE_PAGE_OCR` and `TABLE_OCR` to `OCR_AGENT`, this
means we use the same OCR agent for entire page and tables since we only
do one OCR.
* Bump inference repo to `0.7.9`, which allow table model in inference
to use pre-computed OCR data from unst repo. Please check in
[PR](https://github.com/Unstructured-IO/unstructured-inference/pull/256).
* All notebooks lint are made by `make tidy`
* This PR also fixes
[issue](https://github.com/Unstructured-IO/unstructured/issues/1564),
I've added test for the issue in
`test_pdf.py::test_partition_pdf_hi_table_extraction_with_languages`
* Add same scaling logic to image [similar to previous Table
OCR](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L109C1-L113),
but now scaling is applied to entire image

### Test
* Not much to test manually, except that table extraction still works
* But due to the change in scaling and the use of pre-computed OCR data
from the entire page, there are some slight (better) changes in the table
output; here is a comparison of test outputs I found from the same test
`test_partition_image_with_table_extraction`:

screen shot for table in `layout-parser-paper-with-table.jpg`:
<img width="343" alt="expected"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/278d7665-d212-433d-9a05-872c4502725c">
before refactor:
<img width="709" alt="before"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/347fbc3b-f52b-45b5-97e9-6f633eaa0d5e">
after refactor:
<img width="705" alt="after"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/b3cbd809-cf67-4e75-945a-5cbd06b33b2d">

### TODO
(added as a ticket) There is still some cleanup to do in the inference
repo, since the unst repo now has duplicate logic, but we can keep it as
a fallback plan. If we want to remove anything OCR-related in inference,
here are the items that are deprecated and can be removed:
*
[`get_tokens`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L77)
(already noted in code)
* parameter `extract_tables` in inference
*
[`interpret_table_block`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/inference/layoutelement.py#L88)
*
[`load_agent`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L197)
* env `TABLE_OCR` 

### Note
if we want to fallback for an additional table OCR (may need this for
using paddle for table), we need to:
* pass `infer_table_structure` to inference with `extract_tables`
parameter
* stop passing `infer_table_structure` to `ocr.py`

---------

Co-authored-by: Yao You <yao@unstructured.io>
											
										
										
											2023-10-20 20:24:23 -04:00
+								    "    # Get List of Arxiv Papers Matching Our Query\n",
-												Went through this demo notebook with Matt. Decision was made to add it to our collection of examples for use later. (#484)


											
										
										
											2023-04-17 11:53:25 -04:00
+								    "    arxiv_papers = list(\n",
 								    "        arxiv.Search(\n",
-												Chore (refactor): support table extraction with pre-computed ocr data (#1801)

### Summary

Table OCR refactor, move the OCR part for table model in inference repo
to unst repo.
* Before this PR, table model extracts OCR tokens with texts and
bounding box and fills the tokens to the table structure in inference
repo. This means we need to do an additional OCR for tables.
* After this PR, we use the OCR data from entire page OCR and pass the
OCR tokens to inference repo, which means we only do one OCR for the
entire document.

**Tech details:**
* Combined env `ENTIRE_PAGE_OCR` and `TABLE_OCR` to `OCR_AGENT`, this
means we use the same OCR agent for entire page and tables since we only
do one OCR.
* Bump inference repo to `0.7.9`, which allow table model in inference
to use pre-computed OCR data from unst repo. Please check in
[PR](https://github.com/Unstructured-IO/unstructured-inference/pull/256).
* All notebooks lint are made by `make tidy`
* This PR also fixes
[issue](https://github.com/Unstructured-IO/unstructured/issues/1564),
I've added test for the issue in
`test_pdf.py::test_partition_pdf_hi_table_extraction_with_languages`
* Add same scaling logic to image [similar to previous Table
OCR](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L109C1-L113),
but now scaling is applied to entire image

### Test
* Not much to test manually, except that table extraction still works
* But due to the change in scaling and the use of pre-computed OCR data
from the entire page, there are some slight (better) changes in the table
output; here is a comparison of test outputs I found from the same test
`test_partition_image_with_table_extraction`:

screen shot for table in `layout-parser-paper-with-table.jpg`:
<img width="343" alt="expected"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/278d7665-d212-433d-9a05-872c4502725c">
before refactor:
<img width="709" alt="before"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/347fbc3b-f52b-45b5-97e9-6f633eaa0d5e">
after refactor:
<img width="705" alt="after"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/b3cbd809-cf67-4e75-945a-5cbd06b33b2d">

### TODO
(added as a ticket) There is still some cleanup to do in the inference
repo, since the unst repo now has duplicate logic, but we can keep it as
a fallback plan. If we want to remove anything OCR-related in inference,
here are the items that are deprecated and can be removed:
*
[`get_tokens`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L77)
(already noted in code)
* parameter `extract_tables` in inference
*
[`interpret_table_block`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/inference/layoutelement.py#L88)
*
[`load_agent`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L197)
* env `TABLE_OCR` 

### Note
if we want to fallback for an additional table OCR (may need this for
using paddle for table), we need to:
* pass `infer_table_structure` to inference with `extract_tables`
parameter
* stop passing `infer_table_structure` to `ocr.py`

---------

Co-authored-by: Yao You <yao@unstructured.io>
											
										
										
											2023-10-20 20:24:23 -04:00
+								    "            query=query,\n",
 								    "            max_results=max_results,\n",
 								    "            sort_by=arxiv.SortCriterion.Relevance,\n",
 								    "            sort_order=arxiv.SortOrder.Descending,\n",
 								    "        ).results()\n",
-												Went through this demo notebook with Matt. Decision was made to add it to our collection of examples for use later. (#484)


											
										
										
											2023-04-17 11:53:25 -04:00
+								    "    )\n",
 								    "\n",
-												Chore (refactor): support table extraction with pre-computed ocr data (#1801)

### Summary

Table OCR refactor, move the OCR part for table model in inference repo
to unst repo.
* Before this PR, table model extracts OCR tokens with texts and
bounding box and fills the tokens to the table structure in inference
repo. This means we need to do an additional OCR for tables.
* After this PR, we use the OCR data from entire page OCR and pass the
OCR tokens to inference repo, which means we only do one OCR for the
entire document.

**Tech details:**
* Combined env `ENTIRE_PAGE_OCR` and `TABLE_OCR` to `OCR_AGENT`, this
means we use the same OCR agent for entire page and tables since we only
do one OCR.
* Bump inference repo to `0.7.9`, which allow table model in inference
to use pre-computed OCR data from unst repo. Please check in
[PR](https://github.com/Unstructured-IO/unstructured-inference/pull/256).
* All notebooks lint are made by `make tidy`
* This PR also fixes
[issue](https://github.com/Unstructured-IO/unstructured/issues/1564),
I've added test for the issue in
`test_pdf.py::test_partition_pdf_hi_table_extraction_with_languages`
* Add same scaling logic to image [similar to previous Table
OCR](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L109C1-L113),
but now scaling is applied to entire image

### Test
* Not much to test manually, except that table extraction still works
* But due to the change in scaling and the use of pre-computed OCR data
from the entire page, there are some slight (better) changes in the table
output; here is a comparison of test outputs I found from the same test
`test_partition_image_with_table_extraction`:

screen shot for table in `layout-parser-paper-with-table.jpg`:
<img width="343" alt="expected"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/278d7665-d212-433d-9a05-872c4502725c">
before refactor:
<img width="709" alt="before"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/347fbc3b-f52b-45b5-97e9-6f633eaa0d5e">
after refactor:
<img width="705" alt="after"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/b3cbd809-cf67-4e75-945a-5cbd06b33b2d">

### TODO
(added as a ticket) There is still some cleanup to do in the inference
repo, since the unst repo now has duplicate logic, but we can keep it as
a fallback plan. If we want to remove anything OCR-related in inference,
here are the items that are deprecated and can be removed:
*
[`get_tokens`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L77)
(already noted in code)
* parameter `extract_tables` in inference
*
[`interpret_table_block`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/inference/layoutelement.py#L88)
*
[`load_agent`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L197)
* env `TABLE_OCR` 

### Note
if we want to fallback for an additional table OCR (may need this for
using paddle for table), we need to:
* pass `infer_table_structure` to inference with `extract_tables`
parameter
* stop passing `infer_table_structure` to `ocr.py`

---------

Co-authored-by: Yao You <yao@unstructured.io>
											
										
										
											2023-10-20 20:24:23 -04:00
+								    "    # Loop Through PDFs, Download and Pre-Process and Then Delete\n",
-												Went through this demo notebook with Matt. Decision was made to add it to our collection of examples for use later. (#484)


											
										
										
											2023-04-17 11:53:25 -04:00
+								    "    paper_texts = []\n",
 								    "    for paper in tqdm(arxiv_papers):\n",
 								    "        paper.download_pdf()\n",
-												Chore (refactor): support table extraction with pre-computed ocr data (#1801)

### Summary

Table OCR refactor, move the OCR part for table model in inference repo
to unst repo.
* Before this PR, table model extracts OCR tokens with texts and
bounding box and fills the tokens to the table structure in inference
repo. This means we need to do an additional OCR for tables.
* After this PR, we use the OCR data from entire page OCR and pass the
OCR tokens to inference repo, which means we only do one OCR for the
entire document.

**Tech details:**
* Combined env `ENTIRE_PAGE_OCR` and `TABLE_OCR` to `OCR_AGENT`, this
means we use the same OCR agent for entire page and tables since we only
do one OCR.
* Bump inference repo to `0.7.9`, which allow table model in inference
to use pre-computed OCR data from unst repo. Please check in
[PR](https://github.com/Unstructured-IO/unstructured-inference/pull/256).
* All notebooks lint are made by `make tidy`
* This PR also fixes
[issue](https://github.com/Unstructured-IO/unstructured/issues/1564),
I've added test for the issue in
`test_pdf.py::test_partition_pdf_hi_table_extraction_with_languages`
* Add same scaling logic to image [similar to previous Table
OCR](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L109C1-L113),
but now scaling is applied to entire image

### Test
* Not much to test manually, except that table extraction still works
* But due to the change in scaling and the use of pre-computed OCR data
from the entire page, there are some slight (better) changes in the table
output; here is a comparison of test outputs I found from the same test
`test_partition_image_with_table_extraction`:

screen shot for table in `layout-parser-paper-with-table.jpg`:
<img width="343" alt="expected"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/278d7665-d212-433d-9a05-872c4502725c">
before refactor:
<img width="709" alt="before"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/347fbc3b-f52b-45b5-97e9-6f633eaa0d5e">
after refactor:
<img width="705" alt="after"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/b3cbd809-cf67-4e75-945a-5cbd06b33b2d">

### TODO
(added as a ticket) There is still some cleanup to do in the inference
repo, since the unst repo now has duplicate logic, but we can keep it as
a fallback plan. If we want to remove anything OCR-related in inference,
here are the items that are deprecated and can be removed:
*
[`get_tokens`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L77)
(already noted in code)
* parameter `extract_tables` in inference
*
[`interpret_table_block`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/inference/layoutelement.py#L88)
*
[`load_agent`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L197)
* env `TABLE_OCR` 

### Note
if we want to fallback for an additional table OCR (may need this for
using paddle for table), we need to:
* pass `infer_table_structure` to inference with `extract_tables`
parameter
* stop passing `infer_table_structure` to `ocr.py`

---------

Co-authored-by: Yao You <yao@unstructured.io>
											
										
										
											2023-10-20 20:24:23 -04:00
+								    "        pdf_file = glob.glob(\"*.pdf\")[0]\n",
 								    "        elements = partition(pdf_file)  # Partition PDF Using Unstructured\n",
 								    "        isd = convert_to_dict(elements)  # Convert List of Elements to List of Dictionaries\n",
 								    "        narrative_texts = [\n",
 								    "            element[\"text\"] for element in isd if element[\"type\"] == \"NarrativeText\"\n",
 								    "        ]  # Only Keep Narrative Text and Combine Into One String\n",
 								    "        os.remove(pdf_file)  # Delete PDF\n",
-												Went through this demo notebook with Matt. Decision was made to add it to our collection of examples for use later. (#484)


											
										
										
											2023-04-17 11:53:25 -04:00
+								    "        paper_texts += narrative_texts\n",
-												Chore (refactor): support table extraction with pre-computed ocr data (#1801)

### Summary

Table OCR refactor, move the OCR part for table model in inference repo
to unst repo.
* Before this PR, table model extracts OCR tokens with texts and
bounding box and fills the tokens to the table structure in inference
repo. This means we need to do an additional OCR for tables.
* After this PR, we use the OCR data from entire page OCR and pass the
OCR tokens to inference repo, which means we only do one OCR for the
entire document.

**Tech details:**
* Combined env `ENTIRE_PAGE_OCR` and `TABLE_OCR` to `OCR_AGENT`, this
means we use the same OCR agent for entire page and tables since we only
do one OCR.
* Bump inference repo to `0.7.9`, which allow table model in inference
to use pre-computed OCR data from unst repo. Please check in
[PR](https://github.com/Unstructured-IO/unstructured-inference/pull/256).
* All notebooks lint are made by `make tidy`
* This PR also fixes
[issue](https://github.com/Unstructured-IO/unstructured/issues/1564),
I've added test for the issue in
`test_pdf.py::test_partition_pdf_hi_table_extraction_with_languages`
* Add same scaling logic to image [similar to previous Table
OCR](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L109C1-L113),
but now scaling is applied to entire image

### Test
* Not much to test manually, except that table extraction still works
* But due to the change in scaling and the use of pre-computed OCR data
from the entire page, there are some slight (better) changes in the table
output; here is a comparison of test outputs I found from the same test
`test_partition_image_with_table_extraction`:

screen shot for table in `layout-parser-paper-with-table.jpg`:
<img width="343" alt="expected"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/278d7665-d212-433d-9a05-872c4502725c">
before refactor:
<img width="709" alt="before"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/347fbc3b-f52b-45b5-97e9-6f633eaa0d5e">
after refactor:
<img width="705" alt="after"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/b3cbd809-cf67-4e75-945a-5cbd06b33b2d">

### TODO
(added as a ticket) There is still some cleanup to do in the inference
repo, since the unst repo now has duplicate logic, but we can keep it as
a fallback plan. If we want to remove anything OCR-related in inference,
here are the items that are deprecated and can be removed:
*
[`get_tokens`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L77)
(already noted in code)
* parameter `extract_tables` in inference
*
[`interpret_table_block`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/inference/layoutelement.py#L88)
*
[`load_agent`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L197)
* env `TABLE_OCR` 

### Note
if we want to fallback for an additional table OCR (may need this for
using paddle for table), we need to:
* pass `infer_table_structure` to inference with `extract_tables`
parameter
* stop passing `infer_table_structure` to `ocr.py`

---------

Co-authored-by: Yao You <yao@unstructured.io>
											
										
										
											2023-10-20 20:24:23 -04:00
+								    "    return paper_texts"
-												Went through this demo notebook with Matt. Decision was made to add it to our collection of examples for use later. (#484)


											
										
										
											2023-04-17 11:53:25 -04:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "### Run Scrape + PreProcess Function to Get List of Paper Text To Feed Through Topic Modelling Algorithm"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 7,
 								   "metadata": {},
 								   "outputs": [
 								    {
 								     "name": "stderr",
 								     "output_type": "stream",
 								     "text": [
 								      "100%|██████████| 10/10 [04:59<00:00, 29.92s/it]\n"
 								     ]
 								    }
 								   ],
 								   "source": [
-												Chore (refactor): support table extraction with pre-computed ocr data (#1801)

### Summary

Table OCR refactor, move the OCR part for table model in inference repo
to unst repo.
* Before this PR, table model extracts OCR tokens with texts and
bounding box and fills the tokens to the table structure in inference
repo. This means we need to do an additional OCR for tables.
* After this PR, we use the OCR data from entire page OCR and pass the
OCR tokens to inference repo, which means we only do one OCR for the
entire document.

**Tech details:**
* Combined env `ENTIRE_PAGE_OCR` and `TABLE_OCR` to `OCR_AGENT`, this
means we use the same OCR agent for entire page and tables since we only
do one OCR.
* Bump inference repo to `0.7.9`, which allow table model in inference
to use pre-computed OCR data from unst repo. Please check in
[PR](https://github.com/Unstructured-IO/unstructured-inference/pull/256).
* All notebooks lint are made by `make tidy`
* This PR also fixes
[issue](https://github.com/Unstructured-IO/unstructured/issues/1564),
I've added test for the issue in
`test_pdf.py::test_partition_pdf_hi_table_extraction_with_languages`
* Add same scaling logic to image [similar to previous Table
OCR](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L109C1-L113),
but now scaling is applied to entire image

### Test
* Not much to test manually, except that table extraction still works
* But due to the change in scaling and the use of pre-computed OCR data
from the entire page, there are some slight (better) changes in the table
output; here is a comparison of test outputs I found from the same test
`test_partition_image_with_table_extraction`:

screen shot for table in `layout-parser-paper-with-table.jpg`:
<img width="343" alt="expected"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/278d7665-d212-433d-9a05-872c4502725c">
before refactor:
<img width="709" alt="before"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/347fbc3b-f52b-45b5-97e9-6f633eaa0d5e">
after refactor:
<img width="705" alt="after"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/b3cbd809-cf67-4e75-945a-5cbd06b33b2d">

### TODO
(added as a ticket) There is still some cleanup to do in the inference
repo, since the unst repo now has duplicate logic, but we can keep it as
a fallback plan. If we want to remove anything OCR-related in inference,
here are the items that are deprecated and can be removed:
*
[`get_tokens`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L77)
(already noted in code)
* parameter `extract_tables` in inference
*
[`interpret_table_block`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/inference/layoutelement.py#L88)
*
[`load_agent`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L197)
* env `TABLE_OCR` 

### Note
if we want to fallback for an additional table OCR (may need this for
using paddle for table), we need to:
* pass `infer_table_structure` to inference with `extract_tables`
parameter
* stop passing `infer_table_structure` to `ocr.py`

---------

Co-authored-by: Yao You <yao@unstructured.io>
											
										
										
											2023-10-20 20:24:23 -04:00
+								    "paper_texts = get_arxiv_paper_texts(query=\"natural language processing\", max_results=10)"
-												Went through this demo notebook with Matt. Decision was made to add it to our collection of examples for use later. (#484)


											
										
										
											2023-04-17 11:53:25 -04:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
-												docs: no more bricks (#1967)

### Summary

We no longer use the "bricks" terminology for partitioning functions, etc.
in the library. This PR updates various references to bricks within the
repo and the docs. This is just an initial pass to swap the terminology
out; it'll likely be helpful to reorganize the docs a bit as well.

---------

Co-authored-by: qued <64741807+qued@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
											
										
										
											2023-11-02 10:43:26 -04:00
+								    "### Run Narrative Texts Through Custom Cleaner Function Using Unstructured"
-												Went through this demo notebook with Matt. Decision was made to add it to our collection of examples for use later. (#484)


											
										
										
											2023-04-17 11:53:25 -04:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 8,
 								   "metadata": {},
 								   "outputs": [
 								    {
 								     "name": "stdout",
 								     "output_type": "stream",
 								     "text": [
 								      "Number of Narrative Texts to Run Through Topic Modelling: 1711\n"
 								     ]
 								    }
 								   ],
 								   "source": [
-												Chore (refactor): support table extraction with pre-computed ocr data (#1801)

### Summary

Table OCR refactor, move the OCR part for table model in inference repo
to unst repo.
* Before this PR, table model extracts OCR tokens with texts and
bounding box and fills the tokens to the table structure in inference
repo. This means we need to do an additional OCR for tables.
* After this PR, we use the OCR data from entire page OCR and pass the
OCR tokens to inference repo, which means we only do one OCR for the
entire document.

**Tech details:**
* Combined env `ENTIRE_PAGE_OCR` and `TABLE_OCR` to `OCR_AGENT`, this
means we use the same OCR agent for entire page and tables since we only
do one OCR.
* Bump inference repo to `0.7.9`, which allow table model in inference
to use pre-computed OCR data from unst repo. Please check in
[PR](https://github.com/Unstructured-IO/unstructured-inference/pull/256).
* All notebooks lint are made by `make tidy`
* This PR also fixes
[issue](https://github.com/Unstructured-IO/unstructured/issues/1564),
I've added test for the issue in
`test_pdf.py::test_partition_pdf_hi_table_extraction_with_languages`
* Add same scaling logic to image [similar to previous Table
OCR](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L109C1-L113),
but now scaling is applied to entire image

### Test
* Not much to test manually, except that table extraction still works
* But due to the change in scaling and the use of pre-computed OCR data
from the entire page, there are some slight (better) changes in the table
output; here is a comparison of test outputs I found from the same test
`test_partition_image_with_table_extraction`:

screen shot for table in `layout-parser-paper-with-table.jpg`:
<img width="343" alt="expected"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/278d7665-d212-433d-9a05-872c4502725c">
before refactor:
<img width="709" alt="before"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/347fbc3b-f52b-45b5-97e9-6f633eaa0d5e">
after refactor:
<img width="705" alt="after"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/b3cbd809-cf67-4e75-945a-5cbd06b33b2d">

### TODO
(added as a ticket) There is still some cleanup to do in the inference
repo, since the unst repo now has duplicate logic, but we can keep it as
a fallback plan. If we want to remove anything OCR-related in inference,
here are the items that are deprecated and can be removed:
*
[`get_tokens`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L77)
(already noted in code)
* parameter `extract_tables` in inference
*
[`interpret_table_block`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/inference/layoutelement.py#L88)
*
[`load_agent`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L197)
* env `TABLE_OCR` 

### Note
if we want to fallback for an additional table OCR (may need this for
using paddle for table), we need to:
* pass `infer_table_structure` to inference with `extract_tables`
parameter
* stop passing `infer_table_structure` to `ocr.py`

---------

Co-authored-by: Yao You <yao@unstructured.io>
											
										
										
											2023-10-20 20:24:23 -04:00
+								    "# Stopwords to Remove\n",
 								    "stop_words = set(stopwords.words(\"english\"))\n",
-												Went through this demo notebook with Matt. Decision was made to add it to our collection of examples for use later. (#484)


											
										
										
											2023-04-17 11:53:25 -04:00
+								    "\n",
-												Chore (refactor): support table extraction with pre-computed ocr data (#1801)

### Summary

Table OCR refactor, move the OCR part for table model in inference repo
to unst repo.
* Before this PR, table model extracts OCR tokens with texts and
bounding box and fills the tokens to the table structure in inference
repo. This means we need to do an additional OCR for tables.
* After this PR, we use the OCR data from entire page OCR and pass the
OCR tokens to inference repo, which means we only do one OCR for the
entire document.

**Tech details:**
* Combined env `ENTIRE_PAGE_OCR` and `TABLE_OCR` to `OCR_AGENT`, this
means we use the same OCR agent for entire page and tables since we only
do one OCR.
* Bump inference repo to `0.7.9`, which allow table model in inference
to use pre-computed OCR data from unst repo. Please check in
[PR](https://github.com/Unstructured-IO/unstructured-inference/pull/256).
* All notebooks lint are made by `make tidy`
* This PR also fixes
[issue](https://github.com/Unstructured-IO/unstructured/issues/1564),
I've added test for the issue in
`test_pdf.py::test_partition_pdf_hi_table_extraction_with_languages`
* Add same scaling logic to image [similar to previous Table
OCR](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L109C1-L113),
but now scaling is applied to entire image

### Test
* Not much to test manually, except that table extraction still works
* But due to the change in scaling and the use of pre-computed OCR data
from the entire page, there are some slight (better) changes in the table
output; here is a comparison of test outputs I found from the same test
`test_partition_image_with_table_extraction`:

screen shot for table in `layout-parser-paper-with-table.jpg`:
<img width="343" alt="expected"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/278d7665-d212-433d-9a05-872c4502725c">
before refactor:
<img width="709" alt="before"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/347fbc3b-f52b-45b5-97e9-6f633eaa0d5e">
after refactor:
<img width="705" alt="after"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/b3cbd809-cf67-4e75-945a-5cbd06b33b2d">

### TODO
(added as a ticket) There is still some cleanup to do in the inference
repo, since the unst repo now has duplicate logic, but we can keep it as
a fallback plan. If we want to remove anything OCR-related in inference,
here are the items that are deprecated and can be removed:
*
[`get_tokens`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L77)
(already noted in code)
* parameter `extract_tables` in inference
*
[`interpret_table_block`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/inference/layoutelement.py#L88)
*
[`load_agent`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L197)
* env `TABLE_OCR` 

### Note
if we want to fallback for an additional table OCR (may need this for
using paddle for table), we need to:
* pass `infer_table_structure` to inference with `extract_tables`
parameter
* stop passing `infer_table_structure` to `ocr.py`

---------

Co-authored-by: Yao You <yao@unstructured.io>
											
										
										
											2023-10-20 20:24:23 -04:00
+								    "\n",
-												docs: no more bricks (#1967)

### Summary

We no longer use the "bricks" terminology for partitioning functions, etc.
in the library. This PR updates various references to bricks within the
repo and the docs. This is just an initial pass to swap the terminology
out, it'll likely be helpful to reorganize the docs a bit as well.

---------

Co-authored-by: qued <64741807+qued@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
											
										
										
											2023-11-02 10:43:26 -04:00
+								    "# Function to Apply Whatever Cleaning Functionality to Each Narrative Text Element\n",
 								    "def custom_clean_function(narrative_text: str) -> str:\n",
 								    "    \"\"\"Apply Mix of Unstructured Cleaning Functions With Some Custom Functionality to Pre-Process Narrative Text\n",
-												Went through this demo notebook with Matt. Decision was made to add it to our collection of examples for use later. (#484)


											
										
										
											2023-04-17 11:53:25 -04:00
+								    "\n",
 								    "    Args:\n",
 								    "        narrative_text (str): Narrative Text or Any Other Sentence\n",
 								    "\n",
 								    "    Returns:\n",
 								    "        cleaned_text (str): Text after going through all the cleaning procedures\n",
 								    "    \"\"\"\n",
-												Chore (refactor): support table extraction with pre-computed ocr data (#1801)

### Summary

Table OCR refactor, move the OCR part for table model in inference repo
to unst repo.
* Before this PR, table model extracts OCR tokens with texts and
bounding box and fills the tokens to the table structure in inference
repo. This means we need to do an additional OCR for tables.
* After this PR, we use the OCR data from entire page OCR and pass the
OCR tokens to inference repo, which means we only do one OCR for the
entire document.

**Tech details:**
* Combined env `ENTIRE_PAGE_OCR` and `TABLE_OCR` to `OCR_AGENT`, this
means we use the same OCR agent for entire page and tables since we only
do one OCR.
* Bump inference repo to `0.7.9`, which allow table model in inference
to use pre-computed OCR data from unst repo. Please check in
[PR](https://github.com/Unstructured-IO/unstructured-inference/pull/256).
* All notebooks lint are made by `make tidy`
* This PR also fixes
[issue](https://github.com/Unstructured-IO/unstructured/issues/1564),
I've added test for the issue in
`test_pdf.py::test_partition_pdf_hi_table_extraction_with_languages`
* Add same scaling logic to image [similar to previous Table
OCR](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L109C1-L113),
but now scaling is applied to entire image

### Test
* Not much to test manually, except that table extraction still works
* But due to the change in scaling and the use of pre-computed OCR data from the
entire page, there are some slight (better) changes in the table output. Here is a
comparison of the test outputs I found from the same test
`test_partition_image_with_table_extraction`:

screen shot for table in `layout-parser-paper-with-table.jpg`:
<img width="343" alt="expected"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/278d7665-d212-433d-9a05-872c4502725c">
before refactor:
<img width="709" alt="before"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/347fbc3b-f52b-45b5-97e9-6f633eaa0d5e">
after refactor:
<img width="705" alt="after"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/b3cbd809-cf67-4e75-945a-5cbd06b33b2d">

### TODO
(added as a ticket) There is still some cleanup to do in the inference repo,
since the unst repo now has duplicate logic, but we can keep it as a
fallback plan. If we want to remove anything OCR-related in inference, here
are the items that are deprecated and can be removed:
*
[`get_tokens`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L77)
(already noted in code)
* parameter `extract_tables` in inference
*
[`interpret_table_block`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/inference/layoutelement.py#L88)
*
[`load_agent`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L197)
* env `TABLE_OCR` 

### Note
if we want to fallback for an additional table OCR (may need this for
using paddle for table), we need to:
* pass `infer_table_structure` to inference with `extract_tables`
parameter
* stop passing `infer_table_structure` to `ocr.py`

---------

Co-authored-by: Yao You <yao@unstructured.io>
											
										
										
											2023-10-20 20:24:23 -04:00
+								    "    remove_numbers = lambda text: re.sub(\n",
 								    "        r\"\\d+\", \"\", text\n",
 								    "    )  # lambda function to remove all digits\n",
 								    "    cleaned_text = remove_numbers(narrative_text)  # Apply Custom Lambda\n",
 								    "    cleaned_text = clean(\n",
 								    "        cleaned_text,\n",
 								    "        extra_whitespace=True,\n",
 								    "        dashes=True,\n",
 								    "        bullets=True,\n",
 								    "        trailing_punctuation=True,\n",
 								    "        lowercase=True,\n",
-												docs: no more bricks (#1967)

### Summary

We no longer use the "bricks" terminology for partitioning functions, etc.
in the library. This PR updates various references to bricks within the
repo and the docs. This is just an initial pass to swap the terminology
out, it'll likely be helpful to reorganize the docs a bit as well.

---------

Co-authored-by: qued <64741807+qued@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
											
										
										
											2023-11-02 10:43:26 -04:00
+								    "    )  # Apply Basic Clean Function With All the Options\n",
-												Chore (refactor): support table extraction with pre-computed ocr data (#1801)

### Summary

Table OCR refactor, move the OCR part for table model in inference repo
to unst repo.
* Before this PR, table model extracts OCR tokens with texts and
bounding box and fills the tokens to the table structure in inference
repo. This means we need to do an additional OCR for tables.
* After this PR, we use the OCR data from entire page OCR and pass the
OCR tokens to inference repo, which means we only do one OCR for the
entire document.

**Tech details:**
* Combined env `ENTIRE_PAGE_OCR` and `TABLE_OCR` to `OCR_AGENT`, this
means we use the same OCR agent for entire page and tables since we only
do one OCR.
* Bump inference repo to `0.7.9`, which allow table model in inference
to use pre-computed OCR data from unst repo. Please check in
[PR](https://github.com/Unstructured-IO/unstructured-inference/pull/256).
* All notebooks lint are made by `make tidy`
* This PR also fixes
[issue](https://github.com/Unstructured-IO/unstructured/issues/1564),
I've added test for the issue in
`test_pdf.py::test_partition_pdf_hi_table_extraction_with_languages`
* Add same scaling logic to image [similar to previous Table
OCR](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L109C1-L113),
but now scaling is applied to entire image

### Test
* Not much to test manually, except that table extraction still works
* But due to the change in scaling and the use of pre-computed OCR data from the
entire page, there are some slight (better) changes in the table output. Here is a
comparison of the test outputs I found from the same test
`test_partition_image_with_table_extraction`:

screen shot for table in `layout-parser-paper-with-table.jpg`:
<img width="343" alt="expected"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/278d7665-d212-433d-9a05-872c4502725c">
before refactor:
<img width="709" alt="before"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/347fbc3b-f52b-45b5-97e9-6f633eaa0d5e">
after refactor:
<img width="705" alt="after"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/b3cbd809-cf67-4e75-945a-5cbd06b33b2d">

### TODO
(added as a ticket) There is still some cleanup to do in the inference repo,
since the unst repo now has duplicate logic, but we can keep it as a
fallback plan. If we want to remove anything OCR-related in inference, here
are the items that are deprecated and can be removed:
*
[`get_tokens`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L77)
(already noted in code)
* parameter `extract_tables` in inference
*
[`interpret_table_block`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/inference/layoutelement.py#L88)
*
[`load_agent`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L197)
* env `TABLE_OCR` 

### Note
if we want to fallback for an additional table OCR (may need this for
using paddle for table), we need to:
* pass `infer_table_structure` to inference with `extract_tables`
parameter
* stop passing `infer_table_structure` to `ocr.py`

---------

Co-authored-by: Yao You <yao@unstructured.io>
											
										
										
											2023-10-20 20:24:23 -04:00
+								    "    cleaned_text = remove_punctuation(cleaned_text)  # Remove all punctuation\n",
 								    "    cleaned_text = \" \".join(\n",
 								    "        [word for word in cleaned_text.split() if word not in stop_words]\n",
 								    "    )  # remove stop words\n",
-												Went through this demo notebook with Matt. Decision was made to add it to our collection of examples for use later. (#484)


											
										
										
											2023-04-17 11:53:25 -04:00
+								    "    return cleaned_text\n",
 								    "\n",
-												Chore (refactor): support table extraction with pre-computed ocr data (#1801)

### Summary

Table OCR refactor, move the OCR part for table model in inference repo
to unst repo.
* Before this PR, table model extracts OCR tokens with texts and
bounding box and fills the tokens to the table structure in inference
repo. This means we need to do an additional OCR for tables.
* After this PR, we use the OCR data from entire page OCR and pass the
OCR tokens to inference repo, which means we only do one OCR for the
entire document.

**Tech details:**
* Combined env `ENTIRE_PAGE_OCR` and `TABLE_OCR` to `OCR_AGENT`, this
means we use the same OCR agent for entire page and tables since we only
do one OCR.
* Bump inference repo to `0.7.9`, which allow table model in inference
to use pre-computed OCR data from unst repo. Please check in
[PR](https://github.com/Unstructured-IO/unstructured-inference/pull/256).
* All notebooks lint are made by `make tidy`
* This PR also fixes
[issue](https://github.com/Unstructured-IO/unstructured/issues/1564),
I've added test for the issue in
`test_pdf.py::test_partition_pdf_hi_table_extraction_with_languages`
* Add same scaling logic to image [similar to previous Table
OCR](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L109C1-L113),
but now scaling is applied to entire image

### Test
* Not much to test manually, except that table extraction still works
* But due to the change in scaling and the use of pre-computed OCR data from the
entire page, there are some slight (better) changes in the table output. Here is a
comparison of the test outputs I found from the same test
`test_partition_image_with_table_extraction`:

screen shot for table in `layout-parser-paper-with-table.jpg`:
<img width="343" alt="expected"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/278d7665-d212-433d-9a05-872c4502725c">
before refactor:
<img width="709" alt="before"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/347fbc3b-f52b-45b5-97e9-6f633eaa0d5e">
after refactor:
<img width="705" alt="after"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/b3cbd809-cf67-4e75-945a-5cbd06b33b2d">

### TODO
(added as a ticket) There is still some cleanup to do in the inference repo,
since the unst repo now has duplicate logic, but we can keep it as a
fallback plan. If we want to remove anything OCR-related in inference, here
are the items that are deprecated and can be removed:
*
[`get_tokens`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L77)
(already noted in code)
* parameter `extract_tables` in inference
*
[`interpret_table_block`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/inference/layoutelement.py#L88)
*
[`load_agent`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L197)
* env `TABLE_OCR` 

### Note
if we want to fallback for an additional table OCR (may need this for
using paddle for table), we need to:
* pass `infer_table_structure` to inference with `extract_tables`
parameter
* stop passing `infer_table_structure` to `ocr.py`

---------

Co-authored-by: Yao You <yao@unstructured.io>
											
										
										
											2023-10-20 20:24:23 -04:00
+								    "\n",
 								    "# Apply Function to Paper Texts\n",
-												docs: no more bricks (#1967)

### Summary

We no longer use the "bricks" terminology for partitioning functions, etc.
in the library. This PR updates various references to bricks within the
repo and the docs. This is just an initial pass to swap the terminology
out, it'll likely be helpful to reorganize the docs a bit as well.

---------

Co-authored-by: qued <64741807+qued@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
											
										
										
											2023-11-02 10:43:26 -04:00
+								    "cleaned_paper_texts = [custom_clean_function(text) for text in paper_texts]\n",
-												Went through this demo notebook with Matt. Decision was made to add it to our collection of examples for use later. (#484)


											
										
										
											2023-04-17 11:53:25 -04:00
+								    "\n",
-												Chore (refactor): support table extraction with pre-computed ocr data (#1801)

### Summary

Table OCR refactor, move the OCR part for table model in inference repo
to unst repo.
* Before this PR, table model extracts OCR tokens with texts and
bounding box and fills the tokens to the table structure in inference
repo. This means we need to do an additional OCR for tables.
* After this PR, we use the OCR data from entire page OCR and pass the
OCR tokens to inference repo, which means we only do one OCR for the
entire document.

**Tech details:**
* Combined env `ENTIRE_PAGE_OCR` and `TABLE_OCR` to `OCR_AGENT`, this
means we use the same OCR agent for entire page and tables since we only
do one OCR.
* Bump inference repo to `0.7.9`, which allow table model in inference
to use pre-computed OCR data from unst repo. Please check in
[PR](https://github.com/Unstructured-IO/unstructured-inference/pull/256).
* All notebooks lint are made by `make tidy`
* This PR also fixes
[issue](https://github.com/Unstructured-IO/unstructured/issues/1564),
I've added test for the issue in
`test_pdf.py::test_partition_pdf_hi_table_extraction_with_languages`
* Add same scaling logic to image [similar to previous Table
OCR](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L109C1-L113),
but now scaling is applied to entire image

### Test
* Not much to test manually, except that table extraction still works
* But due to the change in scaling and the use of pre-computed OCR data from the
entire page, there are some slight (better) changes in the table output. Here is a
comparison of the test outputs I found from the same test
`test_partition_image_with_table_extraction`:

screen shot for table in `layout-parser-paper-with-table.jpg`:
<img width="343" alt="expected"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/278d7665-d212-433d-9a05-872c4502725c">
before refactor:
<img width="709" alt="before"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/347fbc3b-f52b-45b5-97e9-6f633eaa0d5e">
after refactor:
<img width="705" alt="after"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/b3cbd809-cf67-4e75-945a-5cbd06b33b2d">

### TODO
(added as a ticket) There is still some cleanup to do in the inference repo,
since the unst repo now has duplicate logic, but we can keep it as a
fallback plan. If we want to remove anything OCR-related in inference, here
are the items that are deprecated and can be removed:
*
[`get_tokens`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L77)
(already noted in code)
* parameter `extract_tables` in inference
*
[`interpret_table_block`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/inference/layoutelement.py#L88)
*
[`load_agent`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L197)
* env `TABLE_OCR` 

### Note
if we want to fallback for an additional table OCR (may need this for
using paddle for table), we need to:
* pass `infer_table_structure` to inference with `extract_tables`
parameter
* stop passing `infer_table_structure` to `ocr.py`

---------

Co-authored-by: Yao You <yao@unstructured.io>
											
										
										
											2023-10-20 20:24:23 -04:00
+								    "# Count Narrative Texts\n",
 								    "print(\n",
 								    "    \"Number of Narrative Texts to Run Through Topic Modelling: {}\".format(len(cleaned_paper_texts))\n",
 								    ")"
-												Went through this demo notebook with Matt. Decision was made to add it to our collection of examples for use later. (#484)


											
										
										
											2023-04-17 11:53:25 -04:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "### Setup [BerTopic](https://maartengr.github.io/BERTopic/index.html)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 9,
 								   "metadata": {},
 								   "outputs": [],
 								   "source": [
-												Chore (refactor): support table extraction with pre-computed ocr data (#1801)

### Summary

Table OCR refactor, move the OCR part for table model in inference repo
to unst repo.
* Before this PR, table model extracts OCR tokens with texts and
bounding box and fills the tokens to the table structure in inference
repo. This means we need to do an additional OCR for tables.
* After this PR, we use the OCR data from entire page OCR and pass the
OCR tokens to inference repo, which means we only do one OCR for the
entire document.

**Tech details:**
* Combined env `ENTIRE_PAGE_OCR` and `TABLE_OCR` to `OCR_AGENT`, this
means we use the same OCR agent for entire page and tables since we only
do one OCR.
* Bump inference repo to `0.7.9`, which allow table model in inference
to use pre-computed OCR data from unst repo. Please check in
[PR](https://github.com/Unstructured-IO/unstructured-inference/pull/256).
* All notebooks lint are made by `make tidy`
* This PR also fixes
[issue](https://github.com/Unstructured-IO/unstructured/issues/1564),
I've added test for the issue in
`test_pdf.py::test_partition_pdf_hi_table_extraction_with_languages`
* Add same scaling logic to image [similar to previous Table
OCR](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L109C1-L113),
but now scaling is applied to entire image

### Test
* Not much to test manually, except that table extraction still works
* But due to the change in scaling and the use of pre-computed OCR data from the
entire page, there are some slight (better) changes in the table output. Here is a
comparison of the test outputs I found from the same test
`test_partition_image_with_table_extraction`:

screen shot for table in `layout-parser-paper-with-table.jpg`:
<img width="343" alt="expected"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/278d7665-d212-433d-9a05-872c4502725c">
before refactor:
<img width="709" alt="before"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/347fbc3b-f52b-45b5-97e9-6f633eaa0d5e">
after refactor:
<img width="705" alt="after"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/b3cbd809-cf67-4e75-945a-5cbd06b33b2d">

### TODO
(added as a ticket) There is still some cleanup to do in the inference repo,
since the unst repo now has duplicate logic, but we can keep it as a
fallback plan. If we want to remove anything OCR-related in inference, here
are the items that are deprecated and can be removed:
*
[`get_tokens`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L77)
(already noted in code)
* parameter `extract_tables` in inference
*
[`interpret_table_block`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/inference/layoutelement.py#L88)
*
[`load_agent`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L197)
* env `TABLE_OCR` 

### Note
if we want to fallback for an additional table OCR (may need this for
using paddle for table), we need to:
* pass `infer_table_structure` to inference with `extract_tables`
parameter
* stop passing `infer_table_structure` to `ocr.py`

---------

Co-authored-by: Yao You <yao@unstructured.io>
											
										
										
											2023-10-20 20:24:23 -04:00
+								    "# Choose Which Hugging Face Model You Want to Use\n",
-												Went through this demo notebook with Matt. Decision was made to add it to our collection of examples for use later. (#484)


											
										
										
											2023-04-17 11:53:25 -04:00
+								    "sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n",
 								    "\n",
-												Chore (refactor): support table extraction with pre-computed ocr data (#1801)

### Summary

Table OCR refactor, move the OCR part for table model in inference repo
to unst repo.
* Before this PR, table model extracts OCR tokens with texts and
bounding box and fills the tokens to the table structure in inference
repo. This means we need to do an additional OCR for tables.
* After this PR, we use the OCR data from entire page OCR and pass the
OCR tokens to inference repo, which means we only do one OCR for the
entire document.

**Tech details:**
* Combined env `ENTIRE_PAGE_OCR` and `TABLE_OCR` to `OCR_AGENT`, this
means we use the same OCR agent for entire page and tables since we only
do one OCR.
* Bump inference repo to `0.7.9`, which allow table model in inference
to use pre-computed OCR data from unst repo. Please check in
[PR](https://github.com/Unstructured-IO/unstructured-inference/pull/256).
* All notebooks lint are made by `make tidy`
* This PR also fixes
[issue](https://github.com/Unstructured-IO/unstructured/issues/1564),
I've added test for the issue in
`test_pdf.py::test_partition_pdf_hi_table_extraction_with_languages`
* Add same scaling logic to image [similar to previous Table
OCR](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L109C1-L113),
but now scaling is applied to entire image

### Test
* Not much to test manually, except that table extraction still works
* But due to the change in scaling and the use of pre-computed OCR data from the
entire page, there are some slight (better) changes in the table output. Here is a
comparison of the test outputs I found from the same test
`test_partition_image_with_table_extraction`:

screen shot for table in `layout-parser-paper-with-table.jpg`:
<img width="343" alt="expected"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/278d7665-d212-433d-9a05-872c4502725c">
before refactor:
<img width="709" alt="before"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/347fbc3b-f52b-45b5-97e9-6f633eaa0d5e">
after refactor:
<img width="705" alt="after"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/b3cbd809-cf67-4e75-945a-5cbd06b33b2d">

### TODO
(added as a ticket) There is still some cleanup to do in the inference repo,
since the unst repo now has duplicate logic, but we can keep it as a
fallback plan. If we want to remove anything OCR-related in inference, here
are the items that are deprecated and can be removed:
*
[`get_tokens`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L77)
(already noted in code)
* parameter `extract_tables` in inference
*
[`interpret_table_block`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/inference/layoutelement.py#L88)
*
[`load_agent`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L197)
* env `TABLE_OCR` 

### Note
if we want to fallback for an additional table OCR (may need this for
using paddle for table), we need to:
* pass `infer_table_structure` to inference with `extract_tables`
parameter
* stop passing `infer_table_structure` to `ocr.py`

---------

Co-authored-by: Yao You <yao@unstructured.io>
											
										
										
											2023-10-20 20:24:23 -04:00
+								    "# Initialize Model\n",
-												Went through this demo notebook with Matt. Decision was made to add it to our collection of examples for use later. (#484)


											
										
										
											2023-04-17 11:53:25 -04:00
+								    "topic_model = BERTopic(embedding_model=sentence_model, top_n_words=10, nr_topics=10, verbose=True)"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "### Run Document Text Through Topic Model To Get Major Topics Discussed in Narrative Texts"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 10,
 								   "metadata": {},
 								   "outputs": [
 								    {
 								     "data": {
 								      "application/vnd.jupyter.widget-view+json": {
 								       "model_id": "a6ebe3cb185049bd8d37742f2451cbe0",
 								       "version_major": 2,
 								       "version_minor": 0
 								      },
 								      "text/plain": [
 								       "Batches:   0%|          | 0/54 [00:00<?, ?it/s]"
 								      ]
 								     },
 								     "metadata": {},
 								     "output_type": "display_data"
 								    },
 								    {
 								     "name": "stderr",
 								     "output_type": "stream",
 								     "text": [
 								      "2023-04-14 14:27:29,129 - BERTopic - Transformed documents to Embeddings\n",
 								      "2023-04-14 14:27:33,621 - BERTopic - Reduced dimensionality\n",
 								      "2023-04-14 14:27:33,647 - BERTopic - Clustered reduced embeddings\n",
 								      "2023-04-14 14:27:34,255 - BERTopic - Reduced number of topics from 32 to 10\n"
 								     ]
 								    }
 								   ],
 								   "source": [
-												Chore (refactor): support table extraction with pre-computed ocr data (#1801)

### Summary

Table OCR refactor, move the OCR part for table model in inference repo
to unst repo.
* Before this PR, table model extracts OCR tokens with texts and
bounding box and fills the tokens to the table structure in inference
repo. This means we need to do an additional OCR for tables.
* After this PR, we use the OCR data from entire page OCR and pass the
OCR tokens to inference repo, which means we only do one OCR for the
entire document.

**Tech details:**
* Combined env `ENTIRE_PAGE_OCR` and `TABLE_OCR` to `OCR_AGENT`, this
means we use the same OCR agent for entire page and tables since we only
do one OCR.
* Bump inference repo to `0.7.9`, which allow table model in inference
to use pre-computed OCR data from unst repo. Please check in
[PR](https://github.com/Unstructured-IO/unstructured-inference/pull/256).
* All notebooks lint are made by `make tidy`
* This PR also fixes
[issue](https://github.com/Unstructured-IO/unstructured/issues/1564),
I've added test for the issue in
`test_pdf.py::test_partition_pdf_hi_table_extraction_with_languages`
* Add same scaling logic to image [similar to previous Table
OCR](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L109C1-L113),
but now scaling is applied to entire image

### Test
* Not much to test manually, except that table extraction still works
* But due to the change in scaling and the use of pre-computed OCR data from the
entire page, there are some slight (better) changes in the table output. Here is a
comparison of the test outputs I found from the same test
`test_partition_image_with_table_extraction`:

screen shot for table in `layout-parser-paper-with-table.jpg`:
<img width="343" alt="expected"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/278d7665-d212-433d-9a05-872c4502725c">
before refactor:
<img width="709" alt="before"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/347fbc3b-f52b-45b5-97e9-6f633eaa0d5e">
after refactor:
<img width="705" alt="after"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/b3cbd809-cf67-4e75-945a-5cbd06b33b2d">

### TODO
(added as a ticket) There is still some cleanup to do in the inference repo,
since the unst repo now has duplicate logic, but we can keep it as a
fallback plan. If we want to remove anything OCR-related in inference, here
are the items that are deprecated and can be removed:
*
[`get_tokens`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L77)
(already noted in code)
* parameter `extract_tables` in inference
*
[`interpret_table_block`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/inference/layoutelement.py#L88)
*
[`load_agent`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L197)
* env `TABLE_OCR` 

### Note
if we want to fallback for an additional table OCR (may need this for
using paddle for table), we need to:
* pass `infer_table_structure` to inference with `extract_tables`
parameter
* stop passing `infer_table_structure` to `ocr.py`

---------

Co-authored-by: Yao You <yao@unstructured.io>
											
										
										
											2023-10-20 20:24:23 -04:00
+								    "# Fit Topic Model and Transform List of Paper Narrative Texts Into Topic and Probabilities\n",
-												Went through this demo notebook with Matt. Decision was made to add it to our collection of examples for use later. (#484)


											
										
										
											2023-04-17 11:53:25 -04:00
+								    "topic_model.fit(cleaned_paper_texts)\n",
 								    "\n",
-												Chore (refactor): support table extraction with pre-computed ocr data (#1801)

### Summary

Table OCR refactor, move the OCR part for table model in inference repo
to unst repo.
* Before this PR, table model extracts OCR tokens with texts and
bounding box and fills the tokens to the table structure in inference
repo. This means we need to do an additional OCR for tables.
* After this PR, we use the OCR data from entire page OCR and pass the
OCR tokens to inference repo, which means we only do one OCR for the
entire document.

**Tech details:**
* Combined env `ENTIRE_PAGE_OCR` and `TABLE_OCR` to `OCR_AGENT`, this
means we use the same OCR agent for entire page and tables since we only
do one OCR.
* Bump inference repo to `0.7.9`, which allow table model in inference
to use pre-computed OCR data from unst repo. Please check in
[PR](https://github.com/Unstructured-IO/unstructured-inference/pull/256).
* All notebooks lint are made by `make tidy`
* This PR also fixes
[issue](https://github.com/Unstructured-IO/unstructured/issues/1564),
I've added test for the issue in
`test_pdf.py::test_partition_pdf_hi_table_extraction_with_languages`
* Add same scaling logic to image [similar to previous Table
OCR](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L109C1-L113),
but now scaling is applied to entire image

### Test
* Not much to manually testing expect table extraction still works
* But due to change on scaling and use pre-computed OCR data from entire
page, there are some slight (better) changes on table output, here is an
comparison on test outputs i found from the same test
`test_partition_image_with_table_extraction`:

screen shot for table in `layout-parser-paper-with-table.jpg`:
<img width="343" alt="expected"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/278d7665-d212-433d-9a05-872c4502725c">
before refactor:
<img width="709" alt="before"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/347fbc3b-f52b-45b5-97e9-6f633eaa0d5e">
after refactor:
<img width="705" alt="after"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/b3cbd809-cf67-4e75-945a-5cbd06b33b2d">

### TODO
(added as a ticket) Still have some clean up to do in inference repo
since now unst repo have duplicate logic, but can keep them as a fall
back plan. If we want to remove anything OCR related in inference, here
are items that is deprecated and can be removed:
*
[`get_tokens`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L77)
(already noted in code)
* parameter `extract_tables` in inference
*
[`interpret_table_block`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/inference/layoutelement.py#L88)
*
[`load_agent`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L197)
* env `TABLE_OCR` 

### Note
if we want to fallback for an additional table OCR (may need this for
using paddle for table), we need to:
* pass `infer_table_structure` to inference with `extract_tables`
parameter
* stop passing `infer_table_structure` to `ocr.py`

---------

Co-authored-by: Yao You <yao@unstructured.io>
											
										
										
											2023-10-20 20:24:23 -04:00
+								    "# Store Document-Topic Info\n",
-												Went through this demo notebook with Matt. Decision was made to add it to our collection of examples for use later. (#484)


											
										
										
											2023-04-17 11:53:25 -04:00
+								    "doc_topic_info = topic_model.get_document_info(cleaned_paper_texts)\n",
 								    "\n",
-												Chore (refactor): support table extraction with pre-computed ocr data (#1801)

### Summary

Table OCR refactor, move the OCR part for table model in inference repo
to unst repo.
* Before this PR, table model extracts OCR tokens with texts and
bounding box and fills the tokens to the table structure in inference
repo. This means we need to do an additional OCR for tables.
* After this PR, we use the OCR data from entire page OCR and pass the
OCR tokens to inference repo, which means we only do one OCR for the
entire document.

**Tech details:**
* Combined env `ENTIRE_PAGE_OCR` and `TABLE_OCR` to `OCR_AGENT`, this
means we use the same OCR agent for entire page and tables since we only
do one OCR.
* Bump inference repo to `0.7.9`, which allow table model in inference
to use pre-computed OCR data from unst repo. Please check in
[PR](https://github.com/Unstructured-IO/unstructured-inference/pull/256).
* All notebooks lint are made by `make tidy`
* This PR also fixes
[issue](https://github.com/Unstructured-IO/unstructured/issues/1564),
I've added test for the issue in
`test_pdf.py::test_partition_pdf_hi_table_extraction_with_languages`
* Add same scaling logic to image [similar to previous Table
OCR](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L109C1-L113),
but now scaling is applied to entire image

### Test
* Not much to manually testing expect table extraction still works
* But due to change on scaling and use pre-computed OCR data from entire
page, there are some slight (better) changes on table output, here is an
comparison on test outputs i found from the same test
`test_partition_image_with_table_extraction`:

screen shot for table in `layout-parser-paper-with-table.jpg`:
<img width="343" alt="expected"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/278d7665-d212-433d-9a05-872c4502725c">
before refactor:
<img width="709" alt="before"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/347fbc3b-f52b-45b5-97e9-6f633eaa0d5e">
after refactor:
<img width="705" alt="after"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/b3cbd809-cf67-4e75-945a-5cbd06b33b2d">

### TODO
(added as a ticket) Still have some clean up to do in inference repo
since now unst repo have duplicate logic, but can keep them as a fall
back plan. If we want to remove anything OCR related in inference, here
are items that is deprecated and can be removed:
*
[`get_tokens`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L77)
(already noted in code)
* parameter `extract_tables` in inference
*
[`interpret_table_block`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/inference/layoutelement.py#L88)
*
[`load_agent`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L197)
* env `TABLE_OCR` 

### Note
if we want to fallback for an additional table OCR (may need this for
using paddle for table), we need to:
* pass `infer_table_structure` to inference with `extract_tables`
parameter
* stop passing `infer_table_structure` to `ocr.py`

---------

Co-authored-by: Yao You <yao@unstructured.io>
											
										
										
											2023-10-20 20:24:23 -04:00
+								    "# Store Topic Info\n",
-												Went through this demo notebook with Matt. Decision was made to add it to our collection of examples for use later. (#484)


											
										
										
											2023-04-17 11:53:25 -04:00
+								    "topic_info = pd.DataFrame(topic_model.get_topics())\n",
 								    "topic_info = topic_info.applymap(lambda x: x[0])\n",
-												Chore (refactor): support table extraction with pre-computed ocr data (#1801)

### Summary

Table OCR refactor, move the OCR part for table model in inference repo
to unst repo.
* Before this PR, table model extracts OCR tokens with texts and
bounding box and fills the tokens to the table structure in inference
repo. This means we need to do an additional OCR for tables.
* After this PR, we use the OCR data from entire page OCR and pass the
OCR tokens to inference repo, which means we only do one OCR for the
entire document.

**Tech details:**
* Combined env `ENTIRE_PAGE_OCR` and `TABLE_OCR` to `OCR_AGENT`, this
means we use the same OCR agent for entire page and tables since we only
do one OCR.
* Bump inference repo to `0.7.9`, which allow table model in inference
to use pre-computed OCR data from unst repo. Please check in
[PR](https://github.com/Unstructured-IO/unstructured-inference/pull/256).
* All notebooks lint are made by `make tidy`
* This PR also fixes
[issue](https://github.com/Unstructured-IO/unstructured/issues/1564),
I've added test for the issue in
`test_pdf.py::test_partition_pdf_hi_table_extraction_with_languages`
* Add same scaling logic to image [similar to previous Table
OCR](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L109C1-L113),
but now scaling is applied to entire image

### Test
* Not much to manually testing expect table extraction still works
* But due to change on scaling and use pre-computed OCR data from entire
page, there are some slight (better) changes on table output, here is an
comparison on test outputs i found from the same test
`test_partition_image_with_table_extraction`:

screen shot for table in `layout-parser-paper-with-table.jpg`:
<img width="343" alt="expected"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/278d7665-d212-433d-9a05-872c4502725c">
before refactor:
<img width="709" alt="before"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/347fbc3b-f52b-45b5-97e9-6f633eaa0d5e">
after refactor:
<img width="705" alt="after"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/b3cbd809-cf67-4e75-945a-5cbd06b33b2d">

### TODO
(added as a ticket) Still have some clean up to do in inference repo
since now unst repo have duplicate logic, but can keep them as a fall
back plan. If we want to remove anything OCR related in inference, here
are items that is deprecated and can be removed:
*
[`get_tokens`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L77)
(already noted in code)
* parameter `extract_tables` in inference
*
[`interpret_table_block`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/inference/layoutelement.py#L88)
*
[`load_agent`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L197)
* env `TABLE_OCR` 

### Note
if we want to fallback for an additional table OCR (may need this for
using paddle for table), we need to:
* pass `infer_table_structure` to inference with `extract_tables`
parameter
* stop passing `infer_table_structure` to `ocr.py`

---------

Co-authored-by: Yao You <yao@unstructured.io>
											
										
										
											2023-10-20 20:24:23 -04:00
+								    "topic_info.columns = [\"topic_{}\".format(col + 1) for col in topic_info.columns]"
-												Went through this demo notebook with Matt. Decision was made to add it to our collection of examples for use later. (#484)


											
										
										
											2023-04-17 11:53:25 -04:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "### Checkout Keywords for Each Topic"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 11,
 								   "metadata": {},
 								   "outputs": [
 								    {
 								     "data": {
 								      "text/html": [
 								       "<div>\n",
 								       "<style scoped>\n",
 								       "    .dataframe tbody tr th:only-of-type {\n",
 								       "        vertical-align: middle;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe tbody tr th {\n",
 								       "        vertical-align: top;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe thead th {\n",
 								       "        text-align: right;\n",
 								       "    }\n",
 								       "</style>\n",
 								       "<table border=\"1\" class=\"dataframe\">\n",
 								       "  <thead>\n",
 								       "    <tr style=\"text-align: right;\">\n",
 								       "      <th></th>\n",
 								       "      <th>topic_0</th>\n",
 								       "      <th>topic_1</th>\n",
 								       "      <th>topic_2</th>\n",
 								       "      <th>topic_3</th>\n",
 								       "      <th>topic_4</th>\n",
 								       "      <th>topic_5</th>\n",
 								       "      <th>topic_6</th>\n",
 								       "      <th>topic_7</th>\n",
 								       "      <th>topic_8</th>\n",
 								       "      <th>topic_9</th>\n",
 								       "    </tr>\n",
 								       "  </thead>\n",
 								       "  <tbody>\n",
 								       "    <tr>\n",
 								       "      <th>0</th>\n",
 								       "      <td>neural</td>\n",
 								       "      <td>language</td>\n",
 								       "      <td>state</td>\n",
 								       "      <td>function</td>\n",
 								       "      <td>cost</td>\n",
 								       "      <td>publication</td>\n",
 								       "      <td>graph</td>\n",
 								       "      <td>llama</td>\n",
 								       "      <td>tangkhul</td>\n",
 								       "      <td>want</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>1</th>\n",
 								       "      <td>network</td>\n",
 								       "      <td>natural</td>\n",
 								       "      <td>rnn</td>\n",
 								       "      <td>distribution</td>\n",
 								       "      <td>function</td>\n",
 								       "      <td>april</td>\n",
 								       "      <td>computation</td>\n",
 								       "      <td>like</td>\n",
 								       "      <td>compound</td>\n",
 								       "      <td>edu</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>2</th>\n",
 								       "      <td>function</td>\n",
 								       "      <td>model</td>\n",
 								       "      <td>memory</td>\n",
 								       "      <td>output</td>\n",
 								       "      <td>sgd</td>\n",
 								       "      <td>syst</td>\n",
 								       "      <td>node</td>\n",
 								       "      <td>south</td>\n",
 								       "      <td>root</td>\n",
 								       "      <td>dsontagcoursesinferenceslidespseudolikelihoodn...</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>3</th>\n",
 								       "      <td>networks</td>\n",
 								       "      <td>word</td>\n",
 								       "      <td>vector</td>\n",
 								       "      <td>class</td>\n",
 								       "      <td>training</td>\n",
 								       "      <td>technol</td>\n",
 								       "      <td>nodes</td>\n",
 								       "      <td>animal</td>\n",
 								       "      <td>morphological</td>\n",
 								       "      <td>regardlessly</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>4</th>\n",
 								       "      <td>one</td>\n",
 								       "      <td>planning</td>\n",
 								       "      <td>input</td>\n",
 								       "      <td>tanh</td>\n",
 								       "      <td>expected</td>\n",
 								       "      <td>date</td>\n",
 								       "      <td>backward</td>\n",
 								       "      <td>america</td>\n",
 								       "      <td>verbs</td>\n",
 								       "      <td>satisfied</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>5</th>\n",
 								       "      <td>input</td>\n",
 								       "      <td>words</td>\n",
 								       "      <td>network</td>\n",
 								       "      <td>data</td>\n",
 								       "      <td>optimization</td>\n",
 								       "      <td>vol</td>\n",
 								       "      <td>function</td>\n",
 								       "      <td>translation</td>\n",
 								       "      <td>noun</td>\n",
 								       "      <td>november</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>6</th>\n",
 								       "      <td>vector</td>\n",
 								       "      <td>based</td>\n",
 								       "      <td>recurrent</td>\n",
 								       "      <td>yˆ</td>\n",
 								       "      <td>algorithm</td>\n",
 								       "      <td>intell</td>\n",
 								       "      <td>backpropagation</td>\n",
 								       "      <td>french</td>\n",
 								       "      <td>roots</td>\n",
 								       "      <td>tune</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>7</th>\n",
 								       "      <td>language</td>\n",
 								       "      <td>processing</td>\n",
 								       "      <td>sequence</td>\n",
 								       "      <td>loss</td>\n",
 								       "      <td>set</td>\n",
 								       "      <td>acm</td>\n",
 								       "      <td>algorithm</td>\n",
 								       "      <td>cute</td>\n",
 								       "      <td>adjectives</td>\n",
 								       "      <td>return</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>8</th>\n",
 								       "      <td>model</td>\n",
 								       "      <td>models</td>\n",
 								       "      <td>neural</td>\n",
 								       "      <td>activation</td>\n",
 								       "      <td>validation</td>\n",
 								       "      <td>article</td>\n",
 								       "      <td>parameters</td>\n",
 								       "      <td>google</td>\n",
 								       "      <td>formation</td>\n",
 								       "      <td>fully</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>9</th>\n",
 								       "      <td>training</td>\n",
 								       "      <td>data</td>\n",
 								       "      <td>lstm</td>\n",
 								       "      <td>softmax</td>\n",
 								       "      <td>rate</td>\n",
 								       "      <td>trans</td>\n",
 								       "      <td>output</td>\n",
 								       "      <td>domesticated</td>\n",
 								       "      <td>language</td>\n",
 								       "      <td>results</td>\n",
 								       "    </tr>\n",
 								       "  </tbody>\n",
 								       "</table>\n",
 								       "</div>"
 								      ],
 								      "text/plain": [
 								       "    topic_0     topic_1    topic_2       topic_3       topic_4      topic_5  \\\n",
 								       "0    neural    language      state      function          cost  publication   \n",
 								       "1   network     natural        rnn  distribution      function        april   \n",
 								       "2  function       model     memory        output           sgd         syst   \n",
 								       "3  networks        word     vector         class      training      technol   \n",
 								       "4       one    planning      input          tanh      expected         date   \n",
 								       "5     input       words    network          data  optimization          vol   \n",
 								       "6    vector       based  recurrent            yˆ     algorithm       intell   \n",
 								       "7  language  processing   sequence          loss           set          acm   \n",
 								       "8     model      models     neural    activation    validation      article   \n",
 								       "9  training        data       lstm       softmax          rate        trans   \n",
 								       "\n",
 								       "           topic_6       topic_7        topic_8  \\\n",
 								       "0            graph         llama       tangkhul   \n",
 								       "1      computation          like       compound   \n",
 								       "2             node         south           root   \n",
 								       "3            nodes        animal  morphological   \n",
 								       "4         backward       america          verbs   \n",
 								       "5         function   translation           noun   \n",
 								       "6  backpropagation        french          roots   \n",
 								       "7        algorithm          cute     adjectives   \n",
 								       "8       parameters        google      formation   \n",
 								       "9           output  domesticated       language   \n",
 								       "\n",
 								       "                                             topic_9  \n",
 								       "0                                               want  \n",
 								       "1                                                edu  \n",
 								       "2  dsontagcoursesinferenceslidespseudolikelihoodn...  \n",
 								       "3                                       regardlessly  \n",
 								       "4                                          satisfied  \n",
 								       "5                                           november  \n",
 								       "6                                               tune  \n",
 								       "7                                             return  \n",
 								       "8                                              fully  \n",
 								       "9                                            results  "
 								      ]
 								     },
 								     "metadata": {},
 								     "output_type": "display_data"
 								    }
 								   ],
 								   "source": [
 								    "display(topic_info)"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "### Visualize Topics"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 13,
 								   "metadata": {},
 								   "outputs": [
 								    {
 								     "data": {
 								      "application/vnd.plotly.v1+json": {
 								       "config": {
 								        "plotlyServerURL": "https://plot.ly"
 								       },
 								       "data": [
 								        {
 								         "customdata": [
 								          [
 ,
 								           "language | natural | model | word | planning",
 
 								          ],
 								          [
 ,
 								           "state | rnn | memory | vector | input",
 
 								          ],
 								          [
 ,
 								           "function | distribution | output | class | tanh",
 
 								          ],
 								          [
 ,
 								           "cost | function | sgd | training | expected",
 
 								          ],
 								          [
 ,
 								           "publication | april | syst | technol | date",
 
 								          ],
 								          [
 ,
 								           "graph | computation | node | nodes | backward",
 
 								          ],
 								          [
 ,
 								           "llama | like | south | animal | america",
 
 								          ],
 								          [
 ,
 								           "tangkhul | compound | root | morphological | verbs",
 
 								          ],
 								          [
 ,
 								           "want | edu | dsontagcoursesinferenceslidespseudolikelihoodnotespdf | regardlessly | satisfied",
 
 								          ]
 								         ],
 								         "hovertemplate": "<b>Topic %{customdata[0]}</b><br>%{customdata[1]}<br>Size: %{customdata[2]}",
 								         "legendgroup": "",
 								         "marker": {
 								          "color": "#B0BEC5",
 								          "line": {
 								           "color": "DarkSlateGrey",
 								           "width": 2
 								          },
 								          "size": [
 ,
 ,
 ,
 ,
 ,
 ,
 ,
 ,
 
 								          ],
 								          "sizemode": "area",
 								          "sizeref": 0.451875,
 								          "symbol": "circle"
 								         },
 								         "mode": "markers",
 								         "name": "",
 								         "orientation": "v",
 								         "showlegend": false,
 								         "type": "scatter",
 								         "x": [
 .759990692138672,
 .329012870788574,
 .99558162689209,
 .891719818115234,
 .191701889038086,
 .449606895446777,
 .662773132324219,
 .039092063903809,
 .023329734802246
 								         ],
 								         "xaxis": "x",
 								         "y": [
 .6729466915130615,
 .2927768230438232,
 .36309289932251,
 .59792423248291,
 .721500873565674,
 .3096089363098145,
 .3371052742004395,
 .8039934635162354,
 .149565696716309
 								         ],
 								         "yaxis": "y"
 								        }
 								       ],
 								       "layout": {
 								        "annotations": [
 								         {
 								          "showarrow": false,
 								          "text": "D1",
 								          "x": 8.03216586112976,
 								          "y": 3.929808777570724,
 								          "yshift": 10
 								         },
 								         {
 								          "showarrow": false,
 								          "text": "D2",
 								          "x": 12.503077578544616,
 								          "xshift": 10,
 								          "y": 6.437612867355346
 								         }
 								        ],
 								        "height": 650,
 								        "hoverlabel": {
 								         "bgcolor": "white",
 								         "font": {
 								          "family": "Rockwell",
 								          "size": 16
 								         }
 								        },
 								        "legend": {
 								         "itemsizing": "constant",
 								         "tracegroupgap": 0
 								        },
 								        "margin": {
 								         "t": 60
 								        },
 								        "shapes": [
 								         {
 								          "line": {
 								           "color": "#CFD8DC",
 								           "width": 2
 								          },
 								          "type": "line",
 								          "x0": 12.503077578544616,
 								          "x1": 12.503077578544616,
 								          "y0": 1.4220046877861023,
 								          "y1": 6.437612867355346
 								         },
 								         {
 								          "line": {
 								           "color": "#9E9E9E",
 								           "width": 2
 								          },
 								          "type": "line",
 								          "x0": 8.03216586112976,
 								          "x1": 16.973989295959473,
 								          "y0": 3.929808777570724,
 								          "y1": 3.929808777570724
 								         }
 								        ],
 								        "sliders": [
 								         {
 								          "active": 0,
 								          "pad": {
 								           "t": 50
 								          },
 								          "steps": [
 								           {
 								            "args": [
 								             {
 								              "marker.color": [
 								               [
 								                "red",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5"
 								               ]
 								              ]
 								             }
 								            ],
 								            "label": "Topic 0",
 								            "method": "update"
 								           },
 								           {
 								            "args": [
 								             {
 								              "marker.color": [
 								               [
 								                "#B0BEC5",
 								                "red",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5"
 								               ]
 								              ]
 								             }
 								            ],
 								            "label": "Topic 1",
 								            "method": "update"
 								           },
 								           {
 								            "args": [
 								             {
 								              "marker.color": [
 								               [
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "red",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5"
 								               ]
 								              ]
 								             }
 								            ],
 								            "label": "Topic 2",
 								            "method": "update"
 								           },
 								           {
 								            "args": [
 								             {
 								              "marker.color": [
 								               [
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "red",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5"
 								               ]
 								              ]
 								             }
 								            ],
 								            "label": "Topic 3",
 								            "method": "update"
 								           },
 								           {
 								            "args": [
 								             {
 								              "marker.color": [
 								               [
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "red",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5"
 								               ]
 								              ]
 								             }
 								            ],
 								            "label": "Topic 4",
 								            "method": "update"
 								           },
 								           {
 								            "args": [
 								             {
 								              "marker.color": [
 								               [
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "red",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5"
 								               ]
 								              ]
 								             }
 								            ],
 								            "label": "Topic 5",
 								            "method": "update"
 								           },
 								           {
 								            "args": [
 								             {
 								              "marker.color": [
 								               [
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "red",
 								                "#B0BEC5",
 								                "#B0BEC5"
 								               ]
 								              ]
 								             }
 								            ],
 								            "label": "Topic 6",
 								            "method": "update"
 								           },
 								           {
 								            "args": [
 								             {
 								              "marker.color": [
 								               [
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "red",
 								                "#B0BEC5"
 								               ]
 								              ]
 								             }
 								            ],
 								            "label": "Topic 7",
 								            "method": "update"
 								           },
 								           {
 								            "args": [
 								             {
 								              "marker.color": [
 								               [
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "#B0BEC5",
 								                "red"
 								               ]
 								              ]
 								             }
 								            ],
 								            "label": "Topic 8",
 								            "method": "update"
 								           }
 								          ]
 								         }
 								        ],
 								        "template": {
 								         "data": {
 								          "bar": [
 								           {
 								            "error_x": {
 								             "color": "rgb(36,36,36)"
 								            },
 								            "error_y": {
 								             "color": "rgb(36,36,36)"
 								            },
 								            "marker": {
 								             "line": {
 								              "color": "white",
 								              "width": 0.5
 								             },
 								             "pattern": {
 								              "fillmode": "overlay",
 								              "size": 10,
 								              "solidity": 0.2
 								             }
 								            },
 								            "type": "bar"
 								           }
 								          ],
 								          "barpolar": [
 								           {
 								            "marker": {
 								             "line": {
 								              "color": "white",
 								              "width": 0.5
 								             },
 								             "pattern": {
 								              "fillmode": "overlay",
 								              "size": 10,
 								              "solidity": 0.2
 								             }
 								            },
 								            "type": "barpolar"
 								           }
 								          ],
 								          "carpet": [
 								           {
 								            "aaxis": {
 								             "endlinecolor": "rgb(36,36,36)",
 								             "gridcolor": "white",
 								             "linecolor": "white",
 								             "minorgridcolor": "white",
 								             "startlinecolor": "rgb(36,36,36)"
 								            },
 								            "baxis": {
 								             "endlinecolor": "rgb(36,36,36)",
 								             "gridcolor": "white",
 								             "linecolor": "white",
 								             "minorgridcolor": "white",
 								             "startlinecolor": "rgb(36,36,36)"
 								            },
 								            "type": "carpet"
 								           }
 								          ],
 								          "choropleth": [
 								           {
 								            "colorbar": {
 								             "outlinewidth": 1,
 								             "tickcolor": "rgb(36,36,36)",
 								             "ticks": "outside"
 								            },
 								            "type": "choropleth"
 								           }
 								          ],
 								          "contour": [
 								           {
 								            "colorbar": {
 								             "outlinewidth": 1,
 								             "tickcolor": "rgb(36,36,36)",
 								             "ticks": "outside"
 								            },
 								            "colorscale": [
 								             [
 ,
 								              "#440154"
 								             ],
 								             [
 .1111111111111111,
 								              "#482878"
 								             ],
 								             [
 .2222222222222222,
 								              "#3e4989"
 								             ],
 								             [
 .3333333333333333,
 								              "#31688e"
 								             ],
 								             [
 .4444444444444444,
 								              "#26828e"
 								             ],
 								             [
 .5555555555555556,
 								              "#1f9e89"
 								             ],
 								             [
 .6666666666666666,
 								              "#35b779"
 								             ],
 								             [
 .7777777777777778,
 								              "#6ece58"
 								             ],
 								             [
 .8888888888888888,
 								              "#b5de2b"
 								             ],
 								             [
 ,
 								              "#fde725"
 								             ]
 								            ],
 								            "type": "contour"
 								           }
 								          ],
 								          "contourcarpet": [
 								           {
 								            "colorbar": {
 								             "outlinewidth": 1,
 								             "tickcolor": "rgb(36,36,36)",
 								             "ticks": "outside"
 								            },
 								            "type": "contourcarpet"
 								           }
 								          ],
 								          "heatmap": [
 								           {
 								            "colorbar": {
 								             "outlinewidth": 1,
 								             "tickcolor": "rgb(36,36,36)",
 								             "ticks": "outside"
 								            },
 								            "colorscale": [
 								             [
 ,
 								              "#440154"
 								             ],
 								             [
 .1111111111111111,
 								              "#482878"
 								             ],
 								             [
 .2222222222222222,
 								              "#3e4989"
 								             ],
 								             [
 .3333333333333333,
 								              "#31688e"
 								             ],
 								             [
 .4444444444444444,
 								              "#26828e"
 								             ],
 								             [
 .5555555555555556,
 								              "#1f9e89"
 								             ],
 								             [
 .6666666666666666,
 								              "#35b779"
 								             ],
 								             [
 .7777777777777778,
 								              "#6ece58"
 								             ],
 								             [
 .8888888888888888,
 								              "#b5de2b"
 								             ],
 								             [
 ,
 								              "#fde725"
 								             ]
 								            ],
 								            "type": "heatmap"
 								           }
 								          ],
 								          "heatmapgl": [
 								           {
 								            "colorbar": {
 								             "outlinewidth": 1,
 								             "tickcolor": "rgb(36,36,36)",
 								             "ticks": "outside"
 								            },
 								            "colorscale": [
 								             [
 ,
 								              "#440154"
 								             ],
 								             [
 .1111111111111111,
 								              "#482878"
 								             ],
 								             [
 .2222222222222222,
 								              "#3e4989"
 								             ],
 								             [
 .3333333333333333,
 								              "#31688e"
 								             ],
 								             [
 .4444444444444444,
 								              "#26828e"
 								             ],
 								             [
 .5555555555555556,
 								              "#1f9e89"
 								             ],
 								             [
 .6666666666666666,
 								              "#35b779"
 								             ],
 								             [
 .7777777777777778,
 								              "#6ece58"
 								             ],
 								             [
 .8888888888888888,
 								              "#b5de2b"
 								             ],
 								             [
 ,
 								              "#fde725"
 								             ]
 								            ],
 								            "type": "heatmapgl"
 								           }
 								          ],
 								          "histogram": [
 								           {
 								            "marker": {
 								             "line": {
 								              "color": "white",
 								              "width": 0.6
 								             }
 								            },
 								            "type": "histogram"
 								           }
 								          ],
 								          "histogram2d": [
 								           {
 								            "colorbar": {
 								             "outlinewidth": 1,
 								             "tickcolor": "rgb(36,36,36)",
 								             "ticks": "outside"
 								            },
 								            "colorscale": [
 								             [
 ,
 								              "#440154"
 								             ],
 								             [
 .1111111111111111,
 								              "#482878"
 								             ],
 								             [
 .2222222222222222,
 								              "#3e4989"
 								             ],
 								             [
 .3333333333333333,
 								              "#31688e"
 								             ],
 								             [
 .4444444444444444,
 								              "#26828e"
 								             ],
 								             [
 .5555555555555556,
 								              "#1f9e89"
 								             ],
 								             [
 .6666666666666666,
 								              "#35b779"
 								             ],
 								             [
 .7777777777777778,
 								              "#6ece58"
 								             ],
 								             [
 .8888888888888888,
 								              "#b5de2b"
 								             ],
 								             [
 ,
 								              "#fde725"
 								             ]
 								            ],
 								            "type": "histogram2d"
 								           }
 								          ],
 								          "histogram2dcontour": [
 								           {
 								            "colorbar": {
 								             "outlinewidth": 1,
 								             "tickcolor": "rgb(36,36,36)",
 								             "ticks": "outside"
 								            },
 								            "colorscale": [
 								             [
 ,
 								              "#440154"
 								             ],
 								             [
 .1111111111111111,
 								              "#482878"
 								             ],
 								             [
 .2222222222222222,
 								              "#3e4989"
 								             ],
 								             [
 .3333333333333333,
 								              "#31688e"
 								             ],
 								             [
 .4444444444444444,
 								              "#26828e"
 								             ],
 								             [
 .5555555555555556,
 								              "#1f9e89"
 								             ],
 								             [
 .6666666666666666,
 								              "#35b779"
 								             ],
 								             [
 .7777777777777778,
 								              "#6ece58"
 								             ],
 								             [
 .8888888888888888,
 								              "#b5de2b"
 								             ],
 								             [
 ,
 								              "#fde725"
 								             ]
 								            ],
 								            "type": "histogram2dcontour"
 								           }
 								          ],
 								          "mesh3d": [
 								           {
 								            "colorbar": {
 								             "outlinewidth": 1,
 								             "tickcolor": "rgb(36,36,36)",
 								             "ticks": "outside"
 								            },
 								            "type": "mesh3d"
 								           }
 								          ],
 								          "parcoords": [
 								           {
 								            "line": {
 								             "colorbar": {
 								              "outlinewidth": 1,
 								              "tickcolor": "rgb(36,36,36)",
 								              "ticks": "outside"
 								             }
 								            },
 								            "type": "parcoords"
 								           }
 								          ],
 								          "pie": [
 								           {
 								            "automargin": true,
 								            "type": "pie"
 								           }
 								          ],
 								          "scatter": [
 								           {
 								            "fillpattern": {
 								             "fillmode": "overlay",
 								             "size": 10,
 								             "solidity": 0.2
 								            },
 								            "type": "scatter"
 								           }
 								          ],
 								          "scatter3d": [
 								           {
 								            "line": {
 								             "colorbar": {
 								              "outlinewidth": 1,
 								              "tickcolor": "rgb(36,36,36)",
 								              "ticks": "outside"
 								             }
 								            },
 								            "marker": {
 								             "colorbar": {
 								              "outlinewidth": 1,
 								              "tickcolor": "rgb(36,36,36)",
 								              "ticks": "outside"
 								             }
 								            },
 								            "type": "scatter3d"
 								           }
 								          ],
 								          "scattercarpet": [
 								           {
 								            "marker": {
 								             "colorbar": {
 								              "outlinewidth": 1,
 								              "tickcolor": "rgb(36,36,36)",
 								              "ticks": "outside"
 								             }
 								            },
 								            "type": "scattercarpet"
 								           }
 								          ],
 								          "scattergeo": [
 								           {
 								            "marker": {
 								             "colorbar": {
 								              "outlinewidth": 1,
 								              "tickcolor": "rgb(36,36,36)",
 								              "ticks": "outside"
 								             }
 								            },
 								            "type": "scattergeo"
 								           }
 								          ],
 								          "scattergl": [
 								           {
 								            "marker": {
 								             "colorbar": {
 								              "outlinewidth": 1,
 								              "tickcolor": "rgb(36,36,36)",
 								              "ticks": "outside"
 								             }
 								            },
 								            "type": "scattergl"
 								           }
 								          ],
 								          "scattermapbox": [
 								           {
 								            "marker": {
 								             "colorbar": {
 								              "outlinewidth": 1,
 								              "tickcolor": "rgb(36,36,36)",
 								              "ticks": "outside"
 								             }
 								            },
 								            "type": "scattermapbox"
 								           }
 								          ],
 								          "scatterpolar": [
 								           {
 								            "marker": {
 								             "colorbar": {
 								              "outlinewidth": 1,
 								              "tickcolor": "rgb(36,36,36)",
 								              "ticks": "outside"
 								             }
 								            },
 								            "type": "scatterpolar"
 								           }
 								          ],
 								          "scatterpolargl": [
 								           {
 								            "marker": {
 								             "colorbar": {
 								              "outlinewidth": 1,
 								              "tickcolor": "rgb(36,36,36)",
 								              "ticks": "outside"
 								             }
 								            },
 								            "type": "scatterpolargl"
 								           }
 								          ],
 								          "scatterternary": [
 								           {
 								            "marker": {
 								             "colorbar": {
 								              "outlinewidth": 1,
 								              "tickcolor": "rgb(36,36,36)",
 								              "ticks": "outside"
 								             }
 								            },
 								            "type": "scatterternary"
 								           }
 								          ],
 								          "surface": [
 								           {
 								            "colorbar": {
 								             "outlinewidth": 1,
 								             "tickcolor": "rgb(36,36,36)",
 								             "ticks": "outside"
 								            },
 								            "colorscale": [
 								             [
 ,
 								              "#440154"
 								             ],
 								             [
 .1111111111111111,
 								              "#482878"
 								             ],
 								             [
 .2222222222222222,
 								              "#3e4989"
 								             ],
 								             [
 .3333333333333333,
 								              "#31688e"
 								             ],
 								             [
 .4444444444444444,
 								              "#26828e"
 								             ],
 								             [
 .5555555555555556,
 								              "#1f9e89"
 								             ],
 								             [
 .6666666666666666,
 								              "#35b779"
 								             ],
 								             [
 .7777777777777778,
 								              "#6ece58"
 								             ],
 								             [
 .8888888888888888,
 								              "#b5de2b"
 								             ],
 								             [
 ,
 								              "#fde725"
 								             ]
 								            ],
 								            "type": "surface"
 								           }
 								          ],
 								          "table": [
 								           {
 								            "cells": {
 								             "fill": {
 								              "color": "rgb(237,237,237)"
 								             },
 								             "line": {
 								              "color": "white"
 								             }
 								            },
 								            "header": {
 								             "fill": {
 								              "color": "rgb(217,217,217)"
 								             },
 								             "line": {
 								              "color": "white"
 								             }
 								            },
 								            "type": "table"
 								           }
 								          ]
 								         },
 								         "layout": {
 								          "annotationdefaults": {
 								           "arrowhead": 0,
 								           "arrowwidth": 1
 								          },
 								          "autotypenumbers": "strict",
 								          "coloraxis": {
 								           "colorbar": {
 								            "outlinewidth": 1,
 								            "tickcolor": "rgb(36,36,36)",
 								            "ticks": "outside"
 								           }
 								          },
 								          "colorscale": {
 								           "diverging": [
 								            [
 ,
 								             "rgb(103,0,31)"
 								            ],
 								            [
 .1,
 								             "rgb(178,24,43)"
 								            ],
 								            [
 .2,
 								             "rgb(214,96,77)"
 								            ],
 								            [
 .3,
 								             "rgb(244,165,130)"
 								            ],
 								            [
 .4,
 								             "rgb(253,219,199)"
 								            ],
 								            [
 .5,
 								             "rgb(247,247,247)"
 								            ],
 								            [
 .6,
 								             "rgb(209,229,240)"
 								            ],
 								            [
 .7,
 								             "rgb(146,197,222)"
 								            ],
 								            [
 .8,
 								             "rgb(67,147,195)"
 								            ],
 								            [
 .9,
 								             "rgb(33,102,172)"
 								            ],
 								            [
 ,
 								             "rgb(5,48,97)"
 								            ]
 								           ],
 								           "sequential": [
 								            [
 ,
 								             "#440154"
 								            ],
 								            [
 .1111111111111111,
 								             "#482878"
 								            ],
 								            [
 .2222222222222222,
 								             "#3e4989"
 								            ],
 								            [
 .3333333333333333,
 								             "#31688e"
 								            ],
 								            [
 .4444444444444444,
 								             "#26828e"
 								            ],
 								            [
 .5555555555555556,
 								             "#1f9e89"
 								            ],
 								            [
 .6666666666666666,
 								             "#35b779"
 								            ],
 								            [
 .7777777777777778,
 								             "#6ece58"
 								            ],
 								            [
 .8888888888888888,
 								             "#b5de2b"
 								            ],
 								            [
 ,
 								             "#fde725"
 								            ]
 								           ],
 								           "sequentialminus": [
 								            [
 ,
 								             "#440154"
 								            ],
 								            [
 .1111111111111111,
 								             "#482878"
 								            ],
 								            [
 .2222222222222222,
 								             "#3e4989"
 								            ],
 								            [
 .3333333333333333,
 								             "#31688e"
 								            ],
 								            [
 .4444444444444444,
 								             "#26828e"
 								            ],
 								            [
 .5555555555555556,
 								             "#1f9e89"
 								            ],
 								            [
 .6666666666666666,
 								             "#35b779"
 								            ],
 								            [
 .7777777777777778,
 								             "#6ece58"
 								            ],
 								            [
 .8888888888888888,
 								             "#b5de2b"
 								            ],
 								            [
 ,
 								             "#fde725"
 								            ]
 								           ]
 								          },
 								          "colorway": [
 								           "#1F77B4",
 								           "#FF7F0E",
 								           "#2CA02C",
 								           "#D62728",
 								           "#9467BD",
 								           "#8C564B",
 								           "#E377C2",
 								           "#7F7F7F",
 								           "#BCBD22",
 								           "#17BECF"
 								          ],
 								          "font": {
 								           "color": "rgb(36,36,36)"
 								          },
 								          "geo": {
 								           "bgcolor": "white",
 								           "lakecolor": "white",
 								           "landcolor": "white",
 								           "showlakes": true,
 								           "showland": true,
 								           "subunitcolor": "white"
 								          },
 								          "hoverlabel": {
 								           "align": "left"
 								          },
 								          "hovermode": "closest",
 								          "mapbox": {
 								           "style": "light"
 								          },
 								          "paper_bgcolor": "white",
 								          "plot_bgcolor": "white",
 								          "polar": {
 								           "angularaxis": {
 								            "gridcolor": "rgb(232,232,232)",
 								            "linecolor": "rgb(36,36,36)",
 								            "showgrid": false,
 								            "showline": true,
 								            "ticks": "outside"
 								           },
 								           "bgcolor": "white",
 								           "radialaxis": {
 								            "gridcolor": "rgb(232,232,232)",
 								            "linecolor": "rgb(36,36,36)",
 								            "showgrid": false,
 								            "showline": true,
 								            "ticks": "outside"
 								           }
 								          },
 								          "scene": {
 								           "xaxis": {
 								            "backgroundcolor": "white",
 								            "gridcolor": "rgb(232,232,232)",
 								            "gridwidth": 2,
 								            "linecolor": "rgb(36,36,36)",
 								            "showbackground": true,
 								            "showgrid": false,
 								            "showline": true,
 								            "ticks": "outside",
 								            "zeroline": false,
 								            "zerolinecolor": "rgb(36,36,36)"
 								           },
 								           "yaxis": {
 								            "backgroundcolor": "white",
 								            "gridcolor": "rgb(232,232,232)",
 								            "gridwidth": 2,
 								            "linecolor": "rgb(36,36,36)",
 								            "showbackground": true,
 								            "showgrid": false,
 								            "showline": true,
 								            "ticks": "outside",
 								            "zeroline": false,
 								            "zerolinecolor": "rgb(36,36,36)"
 								           },
 								           "zaxis": {
 								            "backgroundcolor": "white",
 								            "gridcolor": "rgb(232,232,232)",
 								            "gridwidth": 2,
 								            "linecolor": "rgb(36,36,36)",
 								            "showbackground": true,
 								            "showgrid": false,
 								            "showline": true,
 								            "ticks": "outside",
 								            "zeroline": false,
 								            "zerolinecolor": "rgb(36,36,36)"
 								           }
 								          },
 								          "shapedefaults": {
 								           "fillcolor": "black",
 								           "line": {
 								            "width": 0
 								           },
 								           "opacity": 0.3
 								          },
 								          "ternary": {
 								           "aaxis": {
 								            "gridcolor": "rgb(232,232,232)",
 								            "linecolor": "rgb(36,36,36)",
 								            "showgrid": false,
 								            "showline": true,
 								            "ticks": "outside"
 								           },
 								           "baxis": {
 								            "gridcolor": "rgb(232,232,232)",
 								            "linecolor": "rgb(36,36,36)",
 								            "showgrid": false,
 								            "showline": true,
 								            "ticks": "outside"
 								           },
 								           "bgcolor": "white",
 								           "caxis": {
 								            "gridcolor": "rgb(232,232,232)",
 								            "linecolor": "rgb(36,36,36)",
 								            "showgrid": false,
 								            "showline": true,
 								            "ticks": "outside"
 								           }
 								          },
 								          "title": {
 								           "x": 0.05
 								          },
 								          "xaxis": {
 								           "automargin": true,
 								           "gridcolor": "rgb(232,232,232)",
 								           "linecolor": "rgb(36,36,36)",
 								           "showgrid": false,
 								           "showline": true,
 								           "ticks": "outside",
 								           "title": {
 								            "standoff": 15
 								           },
 								           "zeroline": false,
 								           "zerolinecolor": "rgb(36,36,36)"
 								          },
 								          "yaxis": {
 								           "automargin": true,
 								           "gridcolor": "rgb(232,232,232)",
 								           "linecolor": "rgb(36,36,36)",
 								           "showgrid": false,
 								           "showline": true,
 								           "ticks": "outside",
 								           "title": {
 								            "standoff": 15
 								           },
 								           "zeroline": false,
 								           "zerolinecolor": "rgb(36,36,36)"
 								          }
 								         }
 								        },
 								        "title": {
 								         "font": {
 								          "color": "Black",
 								          "size": 22
 								         },
 								         "text": "<b>Intertopic Distance Map</b>",
 								         "x": 0.5,
 								         "xanchor": "center",
 								         "y": 0.95,
 								         "yanchor": "top"
 								        },
 								        "width": 650,
 								        "xaxis": {
 								         "anchor": "y",
 								         "domain": [
 ,
 
 								         ],
 								         "range": [
 .03216586112976,
 .973989295959473
 								         ],
 								         "title": {
 								          "text": ""
 								         },
 								         "visible": false
 								        },
 								        "yaxis": {
 								         "anchor": "x",
 								         "domain": [
 ,
 
 								         ],
 								         "range": [
 .4220046877861023,
 .437612867355346
 								         ],
 								         "title": {
 								          "text": ""
 								         },
 								         "visible": false
 								        }
 								       }
 								      }
 								     },
 								     "metadata": {},
 								     "output_type": "display_data"
 								    }
 								   ],
 								   "source": [
 								    "topic_model.visualize_topics()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": null,
 								   "metadata": {},
 								   "outputs": [],
 								   "source": []
 								  }
 								 ],
 								 "metadata": {
 								  "kernelspec": {
 								   "display_name": "Python 3 (ipykernel)",
 								   "language": "python",
 								   "name": "python3"
 								  },
 								  "language_info": {
 								   "codemirror_mode": {
 								    "name": "ipython",
 								    "version": 3
 								   },
 								   "file_extension": ".py",
 								   "mimetype": "text/x-python",
 								   "name": "python",
 								   "nbconvert_exporter": "python",
 								   "pygments_lexer": "ipython3",
 								   "version": "3.8.15"
 								  }
 								 },
 								 "nbformat": 4,
 								 "nbformat_minor": 2
 								}