unstructured/examples/arxiv-topic-modelling/topic_model.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Using [Unstructured.io](https://www.unstructured.io/) to process arXiv Papers and Perform Topic Modelling! "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Import General Use Packages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import arxiv  # Interact with arXiv api to scrape papers\n",
    "from sentence_transformers import (\n",
    "    SentenceTransformer,\n",
    ")  # Use Hugging Face Embedding for Topic Modelling\n",
    "from bertopic import BERTopic  # Package for Topic Modelling\n",
    "from tqdm import tqdm  # Progress Bar When Iterating\n",
    "import glob  # Identify Files in Directory\n",
    "import os  # Delete Files in Directory\n",
    "import pandas as pd  # Dataframe Manipulation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Import [Unstructured](https://unstructured-io.github.io/unstructured/installing.html) Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from unstructured.partition.auto import partition  # Base Function to Partition PDF\n",
    "from unstructured.staging.base import (\n",
    "    convert_to_dict,\n",
    ")  # Convert List Unstructured Elements Into List of Dicts for Easy Parsing\n",
    "from unstructured.cleaners.core import (\n",
    "    clean,\n",
    "    remove_punctuation,\n",
    "    clean_non_ascii_chars,\n",
    ")  # Cleaning Functions\n",
    "import re  # Create Custom Cleaning Function\n",
    "import nltk  # Toolkit for more advanced pre-processing\n",
    "from nltk.corpus import stopwords  # list of stopwords to remove\n",
    "from typing import List  # Type Hinting"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Setup NLTK"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package stopwords to\n",
      "[nltk_data]     /Users/pravinsanthanam/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nltk.download(\"stopwords\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create Function to Extract PDFs About Machine Learning from arXiv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_arxiv_paper_texts(query: str, max_results: int = 100) -> List[str]:\n",
    "    \"\"\"Function to Use arXiv API to Fetch Papers Related to Query, Download and Pre-Process\n",
    "\n",
    "    Args:\n",
    "        query (str): query for arXiv API\n",
    "        max_results (int, optional): Number of Papers to get back. Defaults to 100.\n",
    "\n",
    "    Returns:\n",
    "        paper_texts (list[str]): Return list of narrative texts for each paper\n",
    "    \"\"\"\n",
    "    # Get List of Arxiv Papers Matching Our Query\n",
    "    arxiv_papers = list(\n",
    "        arxiv.Search(\n",
    "            query=query,\n",
    "            max_results=max_results,\n",
    "            sort_by=arxiv.SortCriterion.Relevance,\n",
    "            sort_order=arxiv.SortOrder.Descending,\n",
    "        ).results()\n",
    "    )\n",
    "\n",
    "    # Loop Through PDFs, Download and Pre-Process and Then Delete\n",
    "    paper_texts = []\n",
    "    for paper in tqdm(arxiv_papers):\n",
    "        paper.download_pdf()\n",
    "        pdf_file = glob.glob(\"*.pdf\")[0]\n",
    "        elements = partition(pdf_file)  # Partition PDF Using Unstructured\n",
    "        isd = convert_to_dict(elements)  # Convert List of Elements to List of Dictionaries\n",
    "        narrative_texts = [\n",
    "            element[\"text\"] for element in isd if element[\"type\"] == \"NarrativeText\"\n",
    "        ]  # Only Keep Narrative Text and Combine Into One String\n",
    "        os.remove(pdf_file)  # Delete PDF\n",
    "        paper_texts += narrative_texts\n",
    "    return paper_texts"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Run Scrape + PreProcess Function to Get List of Paper Text To Feed Through Topic Modelling Algorithm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 10/10 [04:59<00:00, 29.92s/it]\n"
     ]
    }
   ],
   "source": [
    "paper_texts = get_arxiv_paper_texts(query=\"natural language processing\", max_results=10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Run Narrative Texts Through Custom Cleaner Function Using Unstructured"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of Narrative Texts to Run Through Topic Modelling: 1711\n"
     ]
    }
   ],
   "source": [
    "# Stopwords to Remove\n",
    "stop_words = set(stopwords.words(\"english\"))\n",
    "\n",
    "\n",
    "# Function to Apply Whatever Cleaning Functionality to Each Narrative Text Element\n",
    "def custom_clean_function(narrative_text: str) -> str:\n",
    "    \"\"\"Apply Mix of Unstructured Cleaning Functions With Some Custom Functionality to Pre-Process Narrative Text\n",
    "\n",
    "    Args:\n",
    "        narrative_text (str): Narrative Text or Any Other Sentence\n",
    "\n",
    "    Returns:\n",
    "        cleaned_text (str): Text after going through all the cleaning procedures\n",
    "    \"\"\"\n",
    "    remove_numbers = lambda text: re.sub(\n",
    "        r\"\\d+\", \"\", text\n",
    "    )  # lambda function to remove all punctuation\n",
    "    cleaned_text = remove_numbers(narrative_text)  # Apply Custom Lambda\n",
    "    cleaned_text = clean(\n",
    "        cleaned_text,\n",
    "        extra_whitespace=True,\n",
    "        dashes=True,\n",
    "        bullets=True,\n",
    "        trailing_punctuation=True,\n",
    "        lowercase=True,\n",
    "    )  # Apply Basic Clean Function With All the Options\n",
    "    cleaned_text = remove_punctuation(cleaned_text)  # Remove all punctuation\n",
    "    cleaned_text = \" \".join(\n",
    "        [word for word in cleaned_text.split() if word not in stop_words]\n",
    "    )  # remove stop words\n",
    "    return cleaned_text\n",
    "\n",
    "\n",
    "# Apply Function to Paper Texts\n",
    "cleaned_paper_texts = [custom_clean_function(text) for text in paper_texts]\n",
    "\n",
    "# Count Narratve Texts\n",
    "print(\n",
    "    \"Number of Narrative Texts to Run Through Topic Modelling: {}\".format(len(cleaned_paper_texts))\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Setup [BerTopic](https://maartengr.github.io/BERTopic/index.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Choose Which Hugging Face Model You Want to Use\n",
    "sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n",
    "\n",
    "# Initialize Model\n",
    "topic_model = BERTopic(embedding_model=sentence_model, top_n_words=10, nr_topics=10, verbose=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Run Document Text Through Topic Model To Get Major Topics Discussed in Narrative Texts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a6ebe3cb185049bd8d37742f2451cbe0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Batches:   0%|          | 0/54 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2023-04-14 14:27:29,129 - BERTopic - Transformed documents to Embeddings\n",
      "2023-04-14 14:27:33,621 - BERTopic - Reduced dimensionality\n",
      "2023-04-14 14:27:33,647 - BERTopic - Clustered reduced embeddings\n",
      "2023-04-14 14:27:34,255 - BERTopic - Reduced number of topics from 32 to 10\n"
     ]
    }
   ],
   "source": [
    "# Fit Topic Model and Transform List of Paper Narrative Texts Into Topic and Probabilities\n",
    "topic_model.fit(cleaned_paper_texts)\n",
    "\n",
    "# Store Document-Topic Info\n",
    "doc_topic_info = topic_model.get_document_info(cleaned_paper_texts)\n",
    "\n",
    "# Store Topic Info\n",
    "topic_info = pd.DataFrame(topic_model.get_topics())\n",
    "topic_info = topic_info.applymap(lambda x: x[0])\n",
    "topic_info.columns = [\"topic_{}\".format(col + 1) for col in topic_info.columns]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Checkout Keywords for Each Topic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>topic_0</th>\n",
       "      <th>topic_1</th>\n",
       "      <th>topic_2</th>\n",
       "      <th>topic_3</th>\n",
       "      <th>topic_4</th>\n",
       "      <th>topic_5</th>\n",
       "      <th>topic_6</th>\n",
       "      <th>topic_7</th>\n",
       "      <th>topic_8</th>\n",
       "      <th>topic_9</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>neural</td>\n",
       "      <td>language</td>\n",
       "      <td>state</td>\n",
       "      <td>function</td>\n",
       "      <td>cost</td>\n",
       "      <td>publication</td>\n",
       "      <td>graph</td>\n",
       "      <td>llama</td>\n",
       "      <td>tangkhul</td>\n",
       "      <td>want</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>network</td>\n",
       "      <td>natural</td>\n",
       "      <td>rnn</td>\n",
       "      <td>distribution</td>\n",
       "      <td>function</td>\n",
       "      <td>april</td>\n",
       "      <td>computation</td>\n",
       "      <td>like</td>\n",
       "      <td>compound</td>\n",
       "      <td>edu</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>function</td>\n",
       "      <td>model</td>\n",
       "      <td>memory</td>\n",
       "      <td>output</td>\n",
       "      <td>sgd</td>\n",
       "      <td>syst</td>\n",
       "      <td>node</td>\n",
       "      <td>south</td>\n",
       "      <td>root</td>\n",
       "      <td>dsontagcoursesinferenceslidespseudolikelihoodn...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>networks</td>\n",
       "      <td>word</td>\n",
       "      <td>vector</td>\n",
       "      <td>class</td>\n",
       "      <td>training</td>\n",
       "      <td>technol</td>\n",
       "      <td>nodes</td>\n",
       "      <td>animal</td>\n",
       "      <td>morphological</td>\n",
       "      <td>regardlessly</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>one</td>\n",
       "      <td>planning</td>\n",
       "      <td>input</td>\n",
       "      <td>tanh</td>\n",
       "      <td>expected</td>\n",
       "      <td>date</td>\n",
       "      <td>backward</td>\n",
       "      <td>america</td>\n",
       "      <td>verbs</td>\n",
       "      <td>satisfied</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>input</td>\n",
       "      <td>words</td>\n",
       "      <td>network</td>\n",
       "      <td>data</td>\n",
       "      <td>optimization</td>\n",
       "      <td>vol</td>\n",
       "      <td>function</td>\n",
       "      <td>translation</td>\n",
       "      <td>noun</td>\n",
       "      <td>november</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>vector</td>\n",
       "      <td>based</td>\n",
       "      <td>recurrent</td>\n",
       "      <td>yˆ</td>\n",
       "      <td>algorithm</td>\n",
       "      <td>intell</td>\n",
       "      <td>backpropagation</td>\n",
       "      <td>french</td>\n",
       "      <td>roots</td>\n",
       "      <td>tune</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>language</td>\n",
       "      <td>processing</td>\n",
       "      <td>sequence</td>\n",
       "      <td>loss</td>\n",
       "      <td>set</td>\n",
       "      <td>acm</td>\n",
       "      <td>algorithm</td>\n",
       "      <td>cute</td>\n",
       "      <td>adjectives</td>\n",
       "      <td>return</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>model</td>\n",
       "      <td>models</td>\n",
       "      <td>neural</td>\n",
       "      <td>activation</td>\n",
       "      <td>validation</td>\n",
       "      <td>article</td>\n",
       "      <td>parameters</td>\n",
       "      <td>google</td>\n",
       "      <td>formation</td>\n",
       "      <td>fully</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>training</td>\n",
       "      <td>data</td>\n",
       "      <td>lstm</td>\n",
       "      <td>softmax</td>\n",
       "      <td>rate</td>\n",
       "      <td>trans</td>\n",
       "      <td>output</td>\n",
       "      <td>domesticated</td>\n",
       "      <td>language</td>\n",
       "      <td>results</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    topic_0     topic_1    topic_2       topic_3       topic_4      topic_5  \\\n",
       "0    neural    language      state      function          cost  publication   \n",
       "1   network     natural        rnn  distribution      function        april   \n",
       "2  function       model     memory        output           sgd         syst   \n",
       "3  networks        word     vector         class      training      technol   \n",
       "4       one    planning      input          tanh      expected         date   \n",
       "5     input       words    network          data  optimization          vol   \n",
       "6    vector       based  recurrent            yˆ     algorithm       intell   \n",
       "7  language  processing   sequence          loss           set          acm   \n",
       "8     model      models     neural    activation    validation      article   \n",
       "9  training        data       lstm       softmax          rate        trans   \n",
       "\n",
       "           topic_6       topic_7        topic_8  \\\n",
       "0            graph         llama       tangkhul   \n",
       "1      computation          like       compound   \n",
       "2             node         south           root   \n",
       "3            nodes        animal  morphological   \n",
       "4         backward       america          verbs   \n",
       "5         function   translation           noun   \n",
       "6  backpropagation        french          roots   \n",
       "7        algorithm          cute     adjectives   \n",
       "8       parameters        google      formation   \n",
       "9           output  domesticated       language   \n",
       "\n",
       "                                             topic_9  \n",
       "0                                               want  \n",
       "1                                                edu  \n",
       "2  dsontagcoursesinferenceslidespseudolikelihoodn...  \n",
       "3                                       regardlessly  \n",
       "4                                          satisfied  \n",
       "5                                           november  \n",
       "6                                               tune  \n",
       "7                                             return  \n",
       "8                                              fully  \n",
       "9                                            results  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(topic_info)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Visualize Topics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.plotly.v1+json": {
       "config": {
        "plotlyServerURL": "https://plot.ly"
       },
       "data": [
        {
         "customdata": [
          [
           0,
           "language | natural | model | word | planning",
           723
          ],
          [
           1,
           "state | rnn | memory | vector | input",
           198
          ],
          [
           2,
           "function | distribution | output | class | tanh",
           122
          ],
          [
           3,
           "cost | function | sgd | training | expected",
           61
          ],
          [
           4,
           "publication | april | syst | technol | date",
           57
          ],
          [
           5,
           "graph | computation | node | nodes | backward",
           46
          ],
          [
           6,
           "llama | like | south | animal | america",
           29
          ],
          [
           7,
           "tangkhul | compound | root | morphological | verbs",
           17
          ],
          [
           8,
           "want | edu | dsontagcoursesinferenceslidespseudolikelihoodnotespdf | regardlessly | satisfied",
           13
          ]
         ],
         "hovertemplate": "<b>Topic %{customdata[0]}</b><br>%{customdata[1]}<br>Size: %{customdata[2]}",
         "legendgroup": "",
         "marker": {
          "color": "#B0BEC5",
          "line": {
           "color": "DarkSlateGrey",
           "width": 2
          },
          "size": [
           723,
           198,
           122,
           61,
           57,
           46,
           29,
           17,
           13
          ],
          "sizemode": "area",
          "sizeref": 0.451875,
          "symbol": "circle"
         },
         "mode": "markers",
         "name": "",
         "orientation": "v",
         "showlegend": false,
         "type": "scatter",
         "x": [
          14.759990692138672,
          14.329012870788574,
          10.99558162689209,
          9.891719818115234,
          11.191701889038086,
          9.449606895446777,
          11.662773132324219,
          14.039092063903809,
          12.023329734802246
         ],
         "xaxis": "x",
         "y": [
          1.6729466915130615,
          2.2927768230438232,
          5.36309289932251,
          5.59792423248291,
          4.721500873565674,
          5.3096089363098145,
          5.3371052742004395,
          1.8039934635162354,
          4.149565696716309
         ],
         "yaxis": "y"
        }
       ],
       "layout": {
        "annotations": [
         {
          "showarrow": false,
          "text": "D1",
          "x": 8.03216586112976,
          "y": 3.929808777570724,
          "yshift": 10
         },
         {
          "showarrow": false,
          "text": "D2",
          "x": 12.503077578544616,
          "xshift": 10,
          "y": 6.437612867355346
         }
        ],
        "height": 650,
        "hoverlabel": {
         "bgcolor": "white",
         "font": {
          "family": "Rockwell",
          "size": 16
         }
        },
        "legend": {
         "itemsizing": "constant",
         "tracegroupgap": 0
        },
        "margin": {
         "t": 60
        },
        "shapes": [
         {
          "line": {
           "color": "#CFD8DC",
           "width": 2
          },
          "type": "line",
          "x0": 12.503077578544616,
          "x1": 12.503077578544616,
          "y0": 1.4220046877861023,
          "y1": 6.437612867355346
         },
         {
          "line": {
           "color": "#9E9E9E",
           "width": 2
          },
          "type": "line",
          "x0": 8.03216586112976,
          "x1": 16.973989295959473,
          "y0": 3.929808777570724,
          "y1": 3.929808777570724
         }
        ],
        "sliders": [
         {
          "active": 0,
          "pad": {
           "t": 50
          },
          "steps": [
           {
            "args": [
             {
              "marker.color": [
               [
                "red",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5"
               ]
              ]
             }
            ],
            "label": "Topic 0",
            "method": "update"
           },
           {
            "args": [
             {
              "marker.color": [
               [
                "#B0BEC5",
                "red",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5"
               ]
              ]
             }
            ],
            "label": "Topic 1",
            "method": "update"
           },
           {
            "args": [
             {
              "marker.color": [
               [
                "#B0BEC5",
                "#B0BEC5",
                "red",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5"
               ]
              ]
             }
            ],
            "label": "Topic 2",
            "method": "update"
           },
           {
            "args": [
             {
              "marker.color": [
               [
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "red",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5"
               ]
              ]
             }
            ],
            "label": "Topic 3",
            "method": "update"
           },
           {
            "args": [
             {
              "marker.color": [
               [
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "red",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5"
               ]
              ]
             }
            ],
            "label": "Topic 4",
            "method": "update"
           },
           {
            "args": [
             {
              "marker.color": [
               [
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "red",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5"
               ]
              ]
             }
            ],
            "label": "Topic 5",
            "method": "update"
           },
           {
            "args": [
             {
              "marker.color": [
               [
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "red",
                "#B0BEC5",
                "#B0BEC5"
               ]
              ]
             }
            ],
            "label": "Topic 6",
            "method": "update"
           },
           {
            "args": [
             {
              "marker.color": [
               [
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "red",
                "#B0BEC5"
               ]
              ]
             }
            ],
            "label": "Topic 7",
            "method": "update"
           },
           {
            "args": [
             {
              "marker.color": [
               [
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "#B0BEC5",
                "red"
               ]
              ]
             }
            ],
            "label": "Topic 8",
            "method": "update"
           }
          ]
         }
        ],
        "template": {
         "data": {
          "bar": [
           {
            "error_x": {
             "color": "rgb(36,36,36)"
            },
            "error_y": {
             "color": "rgb(36,36,36)"
            },
            "marker": {
             "line": {
              "color": "white",
              "width": 0.5
             },
             "pattern": {
              "fillmode": "overlay",
              "size": 10,
              "solidity": 0.2
             }
            },
            "type": "bar"
           }
          ],
          "barpolar": [
           {
            "marker": {
             "line": {
              "color": "white",
              "width": 0.5
             },
             "pattern": {
              "fillmode": "overlay",
              "size": 10,
              "solidity": 0.2
             }
            },
            "type": "barpolar"
           }
          ],
          "carpet": [
           {
            "aaxis": {
             "endlinecolor": "rgb(36,36,36)",
             "gridcolor": "white",
             "linecolor": "white",
             "minorgridcolor": "white",
             "startlinecolor": "rgb(36,36,36)"
            },
            "baxis": {
             "endlinecolor": "rgb(36,36,36)",
             "gridcolor": "white",
             "linecolor": "white",
             "minorgridcolor": "white",
             "startlinecolor": "rgb(36,36,36)"
            },
            "type": "carpet"
           }
          ],
          "choropleth": [
           {
            "colorbar": {
             "outlinewidth": 1,
             "tickcolor": "rgb(36,36,36)",
             "ticks": "outside"
            },
            "type": "choropleth"
           }
          ],
          "contour": [
           {
            "colorbar": {
             "outlinewidth": 1,
             "tickcolor": "rgb(36,36,36)",
             "ticks": "outside"
            },
            "colorscale": [
             [
              0,
              "#440154"
             ],
             [
              0.1111111111111111,
              "#482878"
             ],
             [
              0.2222222222222222,
              "#3e4989"
             ],
             [
              0.3333333333333333,
              "#31688e"
             ],
             [
              0.4444444444444444,
              "#26828e"
             ],
             [
              0.5555555555555556,
              "#1f9e89"
             ],
             [
              0.6666666666666666,
              "#35b779"
             ],
             [
              0.7777777777777778,
              "#6ece58"
             ],
             [
              0.8888888888888888,
              "#b5de2b"
             ],
             [
              1,
              "#fde725"
             ]
            ],
            "type": "contour"
           }
          ],
          "contourcarpet": [
           {
            "colorbar": {
             "outlinewidth": 1,
             "tickcolor": "rgb(36,36,36)",
             "ticks": "outside"
            },
            "type": "contourcarpet"
           }
          ],
          "heatmap": [
           {
            "colorbar": {
             "outlinewidth": 1,
             "tickcolor": "rgb(36,36,36)",
             "ticks": "outside"
            },
            "colorscale": [
             [
              0,
              "#440154"
             ],
             [
              0.1111111111111111,
              "#482878"
             ],
             [
              0.2222222222222222,
              "#3e4989"
             ],
             [
              0.3333333333333333,
              "#31688e"
             ],
             [
              0.4444444444444444,
              "#26828e"
             ],
             [
              0.5555555555555556,
              "#1f9e89"
             ],
             [
              0.6666666666666666,
              "#35b779"
             ],
             [
              0.7777777777777778,
              "#6ece58"
             ],
             [
              0.8888888888888888,
              "#b5de2b"
             ],
             [
              1,
              "#fde725"
             ]
            ],
            "type": "heatmap"
           }
          ],
          "heatmapgl": [
           {
            "colorbar": {
             "outlinewidth": 1,
             "tickcolor": "rgb(36,36,36)",
             "ticks": "outside"
            },
            "colorscale": [
             [
              0,
              "#440154"
             ],
             [
              0.1111111111111111,
              "#482878"
             ],
             [
              0.2222222222222222,
              "#3e4989"
             ],
             [
              0.3333333333333333,
              "#31688e"
             ],
             [
              0.4444444444444444,
              "#26828e"
             ],
             [
              0.5555555555555556,
              "#1f9e89"
             ],
             [
              0.6666666666666666,
              "#35b779"
             ],
             [
              0.7777777777777778,
              "#6ece58"
             ],
             [
              0.8888888888888888,
              "#b5de2b"
             ],
             [
              1,
              "#fde725"
             ]
            ],
            "type": "heatmapgl"
           }
          ],
          "histogram": [
           {
            "marker": {
             "line": {
              "color": "white",
              "width": 0.6
             }
            },
            "type": "histogram"
           }
          ],
          "histogram2d": [
           {
            "colorbar": {
             "outlinewidth": 1,
             "tickcolor": "rgb(36,36,36)",
             "ticks": "outside"
            },
            "colorscale": [
             [
              0,
              "#440154"
             ],
             [
              0.1111111111111111,
              "#482878"
             ],
             [
              0.2222222222222222,
              "#3e4989"
             ],
             [
              0.3333333333333333,
              "#31688e"
             ],
             [
              0.4444444444444444,
              "#26828e"
             ],
             [
              0.5555555555555556,
              "#1f9e89"
             ],
             [
              0.6666666666666666,
              "#35b779"
             ],
             [
              0.7777777777777778,
              "#6ece58"
             ],
             [
              0.8888888888888888,
              "#b5de2b"
             ],
             [
              1,
              "#fde725"
             ]
            ],
            "type": "histogram2d"
           }
          ],
          "histogram2dcontour": [
           {
            "colorbar": {
             "outlinewidth": 1,
             "tickcolor": "rgb(36,36,36)",
             "ticks": "outside"
            },
            "colorscale": [
             [
              0,
              "#440154"
             ],
             [
              0.1111111111111111,
              "#482878"
             ],
             [
              0.2222222222222222,
              "#3e4989"
             ],
             [
              0.3333333333333333,
              "#31688e"
             ],
             [
              0.4444444444444444,
              "#26828e"
             ],
             [
              0.5555555555555556,
              "#1f9e89"
             ],
             [
              0.6666666666666666,
              "#35b779"
             ],
             [
              0.7777777777777778,
              "#6ece58"
             ],
             [
              0.8888888888888888,
              "#b5de2b"
             ],
             [
              1,
              "#fde725"
             ]
            ],
            "type": "histogram2dcontour"
           }
          ],
          "mesh3d": [
           {
            "colorbar": {
             "outlinewidth": 1,
             "tickcolor": "rgb(36,36,36)",
             "ticks": "outside"
            },
            "type": "mesh3d"
           }
          ],
          "parcoords": [
           {
            "line": {
             "colorbar": {
              "outlinewidth": 1,
              "tickcolor": "rgb(36,36,36)",
              "ticks": "outside"
             }
            },
            "type": "parcoords"
           }
          ],
          "pie": [
           {
            "automargin": true,
            "type": "pie"
           }
          ],
          "scatter": [
           {
            "fillpattern": {
             "fillmode": "overlay",
             "size": 10,
             "solidity": 0.2
            },
            "type": "scatter"
           }
          ],
          "scatter3d": [
           {
            "line": {
             "colorbar": {
              "outlinewidth": 1,
              "tickcolor": "rgb(36,36,36)",
              "ticks": "outside"
             }
            },
            "marker": {
             "colorbar": {
              "outlinewidth": 1,
              "tickcolor": "rgb(36,36,36)",
              "ticks": "outside"
             }
            },
            "type": "scatter3d"
           }
          ],
          "scattercarpet": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 1,
              "tickcolor": "rgb(36,36,36)",
              "ticks": "outside"
             }
            },
            "type": "scattercarpet"
           }
          ],
          "scattergeo": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 1,
              "tickcolor": "rgb(36,36,36)",
              "ticks": "outside"
             }
            },
            "type": "scattergeo"
           }
          ],
          "scattergl": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 1,
              "tickcolor": "rgb(36,36,36)",
              "ticks": "outside"
             }
            },
            "type": "scattergl"
           }
          ],
          "scattermapbox": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 1,
              "tickcolor": "rgb(36,36,36)",
              "ticks": "outside"
             }
            },
            "type": "scattermapbox"
           }
          ],
          "scatterpolar": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 1,
              "tickcolor": "rgb(36,36,36)",
              "ticks": "outside"
             }
            },
            "type": "scatterpolar"
           }
          ],
          "scatterpolargl": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 1,
              "tickcolor": "rgb(36,36,36)",
              "ticks": "outside"
             }
            },
            "type": "scatterpolargl"
           }
          ],
          "scatterternary": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 1,
              "tickcolor": "rgb(36,36,36)",
              "ticks": "outside"
             }
            },
            "type": "scatterternary"
           }
          ],
          "surface": [
           {
            "colorbar": {
             "outlinewidth": 1,
             "tickcolor": "rgb(36,36,36)",
             "ticks": "outside"
            },
            "colorscale": [
             [
              0,
              "#440154"
             ],
             [
              0.1111111111111111,
              "#482878"
             ],
             [
              0.2222222222222222,
              "#3e4989"
             ],
             [
              0.3333333333333333,
              "#31688e"
             ],
             [
              0.4444444444444444,
              "#26828e"
             ],
             [
              0.5555555555555556,
              "#1f9e89"
             ],
             [
              0.6666666666666666,
              "#35b779"
             ],
             [
              0.7777777777777778,
              "#6ece58"
             ],
             [
              0.8888888888888888,
              "#b5de2b"
             ],
             [
              1,
              "#fde725"
             ]
            ],
            "type": "surface"
           }
          ],
          "table": [
           {
            "cells": {
             "fill": {
              "color": "rgb(237,237,237)"
             },
             "line": {
              "color": "white"
             }
            },
            "header": {
             "fill": {
              "color": "rgb(217,217,217)"
             },
             "line": {
              "color": "white"
             }
            },
            "type": "table"
           }
          ]
         },
         "layout": {
          "annotationdefaults": {
           "arrowhead": 0,
           "arrowwidth": 1
          },
          "autotypenumbers": "strict",
          "coloraxis": {
           "colorbar": {
            "outlinewidth": 1,
            "tickcolor": "rgb(36,36,36)",
            "ticks": "outside"
           }
          },
          "colorscale": {
           "diverging": [
            [
             0,
             "rgb(103,0,31)"
            ],
            [
             0.1,
             "rgb(178,24,43)"
            ],
            [
             0.2,
             "rgb(214,96,77)"
            ],
            [
             0.3,
             "rgb(244,165,130)"
            ],
            [
             0.4,
             "rgb(253,219,199)"
            ],
            [
             0.5,
             "rgb(247,247,247)"
            ],
            [
             0.6,
             "rgb(209,229,240)"
            ],
            [
             0.7,
             "rgb(146,197,222)"
            ],
            [
             0.8,
             "rgb(67,147,195)"
            ],
            [
             0.9,
             "rgb(33,102,172)"
            ],
            [
             1,
             "rgb(5,48,97)"
            ]
           ],
           "sequential": [
            [
             0,
             "#440154"
            ],
            [
             0.1111111111111111,
             "#482878"
            ],
            [
             0.2222222222222222,
             "#3e4989"
            ],
            [
             0.3333333333333333,
             "#31688e"
            ],
            [
             0.4444444444444444,
             "#26828e"
            ],
            [
             0.5555555555555556,
             "#1f9e89"
            ],
            [
             0.6666666666666666,
             "#35b779"
            ],
            [
             0.7777777777777778,
             "#6ece58"
            ],
            [
             0.8888888888888888,
             "#b5de2b"
            ],
            [
             1,
             "#fde725"
            ]
           ],
           "sequentialminus": [
            [
             0,
             "#440154"
            ],
            [
             0.1111111111111111,
             "#482878"
            ],
            [
             0.2222222222222222,
             "#3e4989"
            ],
            [
             0.3333333333333333,
             "#31688e"
            ],
            [
             0.4444444444444444,
             "#26828e"
            ],
            [
             0.5555555555555556,
             "#1f9e89"
            ],
            [
             0.6666666666666666,
             "#35b779"
            ],
            [
             0.7777777777777778,
             "#6ece58"
            ],
            [
             0.8888888888888888,
             "#b5de2b"
            ],
            [
             1,
             "#fde725"
            ]
           ]
          },
          "colorway": [
           "#1F77B4",
           "#FF7F0E",
           "#2CA02C",
           "#D62728",
           "#9467BD",
           "#8C564B",
           "#E377C2",
           "#7F7F7F",
           "#BCBD22",
           "#17BECF"
          ],
          "font": {
           "color": "rgb(36,36,36)"
          },
          "geo": {
           "bgcolor": "white",
           "lakecolor": "white",
           "landcolor": "white",
           "showlakes": true,
           "showland": true,
           "subunitcolor": "white"
          },
          "hoverlabel": {
           "align": "left"
          },
          "hovermode": "closest",
          "mapbox": {
           "style": "light"
          },
          "paper_bgcolor": "white",
          "plot_bgcolor": "white",
          "polar": {
           "angularaxis": {
            "gridcolor": "rgb(232,232,232)",
            "linecolor": "rgb(36,36,36)",
            "showgrid": false,
            "showline": true,
            "ticks": "outside"
           },
           "bgcolor": "white",
           "radialaxis": {
            "gridcolor": "rgb(232,232,232)",
            "linecolor": "rgb(36,36,36)",
            "showgrid": false,
            "showline": true,
            "ticks": "outside"
           }
          },
          "scene": {
           "xaxis": {
            "backgroundcolor": "white",
            "gridcolor": "rgb(232,232,232)",
            "gridwidth": 2,
            "linecolor": "rgb(36,36,36)",
            "showbackground": true,
            "showgrid": false,
            "showline": true,
            "ticks": "outside",
            "zeroline": false,
            "zerolinecolor": "rgb(36,36,36)"
           },
           "yaxis": {
            "backgroundcolor": "white",
            "gridcolor": "rgb(232,232,232)",
            "gridwidth": 2,
            "linecolor": "rgb(36,36,36)",
            "showbackground": true,
            "showgrid": false,
            "showline": true,
            "ticks": "outside",
            "zeroline": false,
            "zerolinecolor": "rgb(36,36,36)"
           },
           "zaxis": {
            "backgroundcolor": "white",
            "gridcolor": "rgb(232,232,232)",
            "gridwidth": 2,
            "linecolor": "rgb(36,36,36)",
            "showbackground": true,
            "showgrid": false,
            "showline": true,
            "ticks": "outside",
            "zeroline": false,
            "zerolinecolor": "rgb(36,36,36)"
           }
          },
          "shapedefaults": {
           "fillcolor": "black",
           "line": {
            "width": 0
           },
           "opacity": 0.3
          },
          "ternary": {
           "aaxis": {
            "gridcolor": "rgb(232,232,232)",
            "linecolor": "rgb(36,36,36)",
            "showgrid": false,
            "showline": true,
            "ticks": "outside"
           },
           "baxis": {
            "gridcolor": "rgb(232,232,232)",
            "linecolor": "rgb(36,36,36)",
            "showgrid": false,
            "showline": true,
            "ticks": "outside"
           },
           "bgcolor": "white",
           "caxis": {
            "gridcolor": "rgb(232,232,232)",
            "linecolor": "rgb(36,36,36)",
            "showgrid": false,
            "showline": true,
            "ticks": "outside"
           }
          },
          "title": {
           "x": 0.05
          },
          "xaxis": {
           "automargin": true,
           "gridcolor": "rgb(232,232,232)",
           "linecolor": "rgb(36,36,36)",
           "showgrid": false,
           "showline": true,
           "ticks": "outside",
           "title": {
            "standoff": 15
           },
           "zeroline": false,
           "zerolinecolor": "rgb(36,36,36)"
          },
          "yaxis": {
           "automargin": true,
           "gridcolor": "rgb(232,232,232)",
           "linecolor": "rgb(36,36,36)",
           "showgrid": false,
           "showline": true,
           "ticks": "outside",
           "title": {
            "standoff": 15
           },
           "zeroline": false,
           "zerolinecolor": "rgb(36,36,36)"
          }
         }
        },
        "title": {
         "font": {
          "color": "Black",
          "size": 22
         },
         "text": "<b>Intertopic Distance Map</b>",
         "x": 0.5,
         "xanchor": "center",
         "y": 0.95,
         "yanchor": "top"
        },
        "width": 650,
        "xaxis": {
         "anchor": "y",
         "domain": [
          0,
          1
         ],
         "range": [
          8.03216586112976,
          16.973989295959473
         ],
         "title": {
          "text": ""
         },
         "visible": false
        },
        "yaxis": {
         "anchor": "x",
         "domain": [
          0,
          1
         ],
         "range": [
          1.4220046877861023,
          6.437612867355346
         ],
         "title": {
          "text": ""
         },
         "visible": false
        }
       }
      }
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "topic_model.visualize_topics()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}