mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-31 01:54:25 +00:00 
			
		
		
		
	 d9c035edb1
			
		
	
	
		d9c035edb1
		
			
		
	
	
	
	
		
			
			### Summary We no longer use the "bricks" terminology for partitioning functions, etc. in the library. This PR updates various references to bricks within the repo and the docs. This is just an initial pass to swap the terminology out, it'll likely be helpful to reorganize the docs a bit as well. --------- Co-authored-by: qued <64741807+qued@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
		
			
				
	
	
		
			1847 lines
		
	
	
		
			47 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			1847 lines
		
	
	
		
			47 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| {
 | ||
|  "cells": [
 | ||
|   {
 | ||
|    "cell_type": "markdown",
 | ||
|    "metadata": {},
 | ||
|    "source": [
 | ||
|     "# Using [Unstructured.io](https://www.unstructured.io/) to process arXiv Papers and Perform Topic Modelling! "
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "markdown",
 | ||
|    "metadata": {},
 | ||
|    "source": [
 | ||
|     "### Import General Use Packages"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 1,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [],
 | ||
|    "source": [
 | ||
|     "import arxiv  # Interact with arXiv api to scrape papers\n",
 | ||
|     "from sentence_transformers import (\n",
 | ||
|     "    SentenceTransformer,\n",
 | ||
|     ")  # Use Hugging Face Embedding for Topic Modelling\n",
 | ||
|     "from bertopic import BERTopic  # Package for Topic Modelling\n",
 | ||
|     "from tqdm import tqdm  # Progress Bar When Iterating\n",
 | ||
|     "import glob  # Identify Files in Directory\n",
 | ||
|     "import os  # Delete Files in Directory\n",
 | ||
|     "import pandas as pd  # Dataframe Manipulation"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "markdown",
 | ||
|    "metadata": {},
 | ||
|    "source": [
 | ||
|     "### Import [Unstructured](https://unstructured-io.github.io/unstructured/installing.html) Functions"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 2,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [],
 | ||
|    "source": [
 | ||
|     "from unstructured.partition.auto import partition  # Base Function to Partition PDF\n",
 | ||
|     "from unstructured.staging.base import (\n",
 | ||
|     "    convert_to_dict,\n",
 | ||
|     ")  # Convert List Unstructured Elements Into List of Dicts for Easy Parsing\n",
 | ||
|     "from unstructured.cleaners.core import (\n",
 | ||
|     "    clean,\n",
 | ||
|     "    remove_punctuation,\n",
 | ||
|     "    clean_non_ascii_chars,\n",
 | ||
|     ")  # Cleaning Functions\n",
 | ||
|     "import re  # Create Custom Cleaning Function\n",
 | ||
|     "import nltk  # Toolkit for more advanced pre-processing\n",
 | ||
|     "from nltk.corpus import stopwords  # list of stopwords to remove\n",
 | ||
|     "from typing import List  # Type Hinting"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "markdown",
 | ||
|    "metadata": {},
 | ||
|    "source": [
 | ||
|     "### Setup NLTK"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 3,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [
 | ||
|     {
 | ||
|      "name": "stderr",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "[nltk_data] Downloading package stopwords to\n",
 | ||
|       "[nltk_data]     /Users/pravinsanthanam/nltk_data...\n",
 | ||
|       "[nltk_data]   Package stopwords is already up-to-date!\n"
 | ||
|      ]
 | ||
|     },
 | ||
|     {
 | ||
|      "data": {
 | ||
|       "text/plain": [
 | ||
|        "True"
 | ||
|       ]
 | ||
|      },
 | ||
|      "execution_count": 3,
 | ||
|      "metadata": {},
 | ||
|      "output_type": "execute_result"
 | ||
|     }
 | ||
|    ],
 | ||
|    "source": [
 | ||
|     "nltk.download(\"stopwords\")"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "markdown",
 | ||
|    "metadata": {},
 | ||
|    "source": [
 | ||
|     "### Create Function to Extract PDFs About Machine Learning from arXiv"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 6,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [],
 | ||
|    "source": [
 | ||
|     "def get_arxiv_paper_texts(query: str, max_results: int = 100) -> List[str]:\n",
 | ||
|     "    \"\"\"Function to Use arXiv API to Fetch Papers Related to Query, Download and Pre-Process\n",
 | ||
|     "\n",
 | ||
|     "    Args:\n",
 | ||
|     "        query (str): query for arXiv API\n",
 | ||
|     "        max_results (int, optional): Number of Papers to get back. Defaults to 100.\n",
 | ||
|     "\n",
 | ||
|     "    Returns:\n",
 | ||
|     "        paper_texts (list[str]): Return list of narrative texts for each paper\n",
 | ||
|     "    \"\"\"\n",
 | ||
|     "    # Get List of Arxiv Papers Matching Our Query\n",
 | ||
|     "    arxiv_papers = list(\n",
 | ||
|     "        arxiv.Search(\n",
 | ||
|     "            query=query,\n",
 | ||
|     "            max_results=max_results,\n",
 | ||
|     "            sort_by=arxiv.SortCriterion.Relevance,\n",
 | ||
|     "            sort_order=arxiv.SortOrder.Descending,\n",
 | ||
|     "        ).results()\n",
 | ||
|     "    )\n",
 | ||
|     "\n",
 | ||
|     "    # Loop Through PDFs, Download and Pre-Process and Then Delete\n",
 | ||
|     "    paper_texts = []\n",
 | ||
|     "    for paper in tqdm(arxiv_papers):\n",
 | ||
|     "        paper.download_pdf()\n",
 | ||
|     "        pdf_file = glob.glob(\"*.pdf\")[0]\n",
 | ||
|     "        elements = partition(pdf_file)  # Partition PDF Using Unstructured\n",
 | ||
|     "        isd = convert_to_dict(elements)  # Convert List of Elements to List of Dictionaries\n",
 | ||
|     "        narrative_texts = [\n",
 | ||
|     "            element[\"text\"] for element in isd if element[\"type\"] == \"NarrativeText\"\n",
 | ||
|     "        ]  # Only Keep the Text of Narrative Text Elements (as a List of Strings)\n",
 | ||
|     "        os.remove(pdf_file)  # Delete PDF\n",
 | ||
|     "        paper_texts += narrative_texts\n",
 | ||
|     "    return paper_texts"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "markdown",
 | ||
|    "metadata": {},
 | ||
|    "source": [
 | ||
|     "### Run Scrape + PreProcess Function to Get List of Paper Text To Feed Through Topic Modelling Algorithm"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 7,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [
 | ||
|     {
 | ||
|      "name": "stderr",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "100%|██████████| 10/10 [04:59<00:00, 29.92s/it]\n"
 | ||
|      ]
 | ||
|     }
 | ||
|    ],
 | ||
|    "source": [
 | ||
|     "paper_texts = get_arxiv_paper_texts(query=\"natural language processing\", max_results=10)"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "markdown",
 | ||
|    "metadata": {},
 | ||
|    "source": [
 | ||
|     "### Run Narrative Texts Through Custom Cleaner Function Using Unstructured"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 8,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [
 | ||
|     {
 | ||
|      "name": "stdout",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "Number of Narrative Texts to Run Through Topic Modelling: 1711\n"
 | ||
|      ]
 | ||
|     }
 | ||
|    ],
 | ||
|    "source": [
 | ||
|     "# Stopwords to Remove\n",
 | ||
|     "stop_words = set(stopwords.words(\"english\"))\n",
 | ||
|     "\n",
 | ||
|     "\n",
 | ||
|     "# Function to Apply Whatever Cleaning Functionality to Each Narrative Text Element\n",
 | ||
|     "def custom_clean_function(narrative_text: str) -> str:\n",
 | ||
|     "    \"\"\"Apply Mix of Unstructured Cleaning Functions With Some Custom Functionality to Pre-Process Narrative Text\n",
 | ||
|     "\n",
 | ||
|     "    Args:\n",
 | ||
|     "        narrative_text (str): Narrative Text or Any Other Sentence\n",
 | ||
|     "\n",
 | ||
|     "    Returns:\n",
 | ||
|     "        cleaned_text (str): Text after going through all the cleaning procedures\n",
 | ||
|     "    \"\"\"\n",
 | ||
|     "    remove_numbers = lambda text: re.sub(\n",
 | ||
|     "        r\"\\d+\", \"\", text\n",
 | ||
|     "    )  # lambda function to remove all numbers (runs of digits)\n",
 | ||
|     "    cleaned_text = remove_numbers(narrative_text)  # Apply Custom Lambda\n",
 | ||
|     "    cleaned_text = clean(\n",
 | ||
|     "        cleaned_text,\n",
 | ||
|     "        extra_whitespace=True,\n",
 | ||
|     "        dashes=True,\n",
 | ||
|     "        bullets=True,\n",
 | ||
|     "        trailing_punctuation=True,\n",
 | ||
|     "        lowercase=True,\n",
 | ||
|     "    )  # Apply Basic Clean Function With All the Options\n",
 | ||
|     "    cleaned_text = remove_punctuation(cleaned_text)  # Remove all punctuation\n",
 | ||
|     "    cleaned_text = \" \".join(\n",
 | ||
|     "        [word for word in cleaned_text.split() if word not in stop_words]\n",
 | ||
|     "    )  # remove stop words\n",
 | ||
|     "    return cleaned_text\n",
 | ||
|     "\n",
 | ||
|     "\n",
 | ||
|     "# Apply Function to Paper Texts\n",
 | ||
|     "cleaned_paper_texts = [custom_clean_function(text) for text in paper_texts]\n",
 | ||
|     "\n",
 | ||
|     "# Count Narrative Texts\n",
 | ||
|     "print(\n",
 | ||
|     "    \"Number of Narrative Texts to Run Through Topic Modelling: {}\".format(len(cleaned_paper_texts))\n",
 | ||
|     ")"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "markdown",
 | ||
|    "metadata": {},
 | ||
|    "source": [
 | ||
|     "### Setup [BERTopic](https://maartengr.github.io/BERTopic/index.html)"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 9,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [],
 | ||
|    "source": [
 | ||
|     "# Choose Which Hugging Face Model You Want to Use\n",
 | ||
|     "sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n",
 | ||
|     "\n",
 | ||
|     "# Initialize Model\n",
 | ||
|     "topic_model = BERTopic(embedding_model=sentence_model, top_n_words=10, nr_topics=10, verbose=True)"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "markdown",
 | ||
|    "metadata": {},
 | ||
|    "source": [
 | ||
|     "### Run Document Text Through Topic Model To Get Major Topics Discussed in Narrative Texts"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 10,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [
 | ||
|     {
 | ||
|      "data": {
 | ||
|       "application/vnd.jupyter.widget-view+json": {
 | ||
|        "model_id": "a6ebe3cb185049bd8d37742f2451cbe0",
 | ||
|        "version_major": 2,
 | ||
|        "version_minor": 0
 | ||
|       },
 | ||
|       "text/plain": [
 | ||
|        "Batches:   0%|          | 0/54 [00:00<?, ?it/s]"
 | ||
|       ]
 | ||
|      },
 | ||
|      "metadata": {},
 | ||
|      "output_type": "display_data"
 | ||
|     },
 | ||
|     {
 | ||
|      "name": "stderr",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "2023-04-14 14:27:29,129 - BERTopic - Transformed documents to Embeddings\n",
 | ||
|       "2023-04-14 14:27:33,621 - BERTopic - Reduced dimensionality\n",
 | ||
|       "2023-04-14 14:27:33,647 - BERTopic - Clustered reduced embeddings\n",
 | ||
|       "2023-04-14 14:27:34,255 - BERTopic - Reduced number of topics from 32 to 10\n"
 | ||
|      ]
 | ||
|     }
 | ||
|    ],
 | ||
|    "source": [
 | ||
|     "# Fit Topic Model and Transform List of Paper Narrative Texts Into Topic and Probabilities\n",
 | ||
|     "topic_model.fit(cleaned_paper_texts)\n",
 | ||
|     "\n",
 | ||
|     "# Store Document-Topic Info\n",
 | ||
|     "doc_topic_info = topic_model.get_document_info(cleaned_paper_texts)\n",
 | ||
|     "\n",
 | ||
|     "# Store Topic Info\n",
 | ||
|     "topic_info = pd.DataFrame(topic_model.get_topics())\n",
 | ||
|     "topic_info = topic_info.applymap(lambda x: x[0])\n",
 | ||
|     "topic_info.columns = [\"topic_{}\".format(col + 1) for col in topic_info.columns]"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "markdown",
 | ||
|    "metadata": {},
 | ||
|    "source": [
 | ||
|     "### Check Out Keywords for Each Topic"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 11,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [
 | ||
|     {
 | ||
|      "data": {
 | ||
|       "text/html": [
 | ||
|        "<div>\n",
 | ||
|        "<style scoped>\n",
 | ||
|        "    .dataframe tbody tr th:only-of-type {\n",
 | ||
|        "        vertical-align: middle;\n",
 | ||
|        "    }\n",
 | ||
|        "\n",
 | ||
|        "    .dataframe tbody tr th {\n",
 | ||
|        "        vertical-align: top;\n",
 | ||
|        "    }\n",
 | ||
|        "\n",
 | ||
|        "    .dataframe thead th {\n",
 | ||
|        "        text-align: right;\n",
 | ||
|        "    }\n",
 | ||
|        "</style>\n",
 | ||
|        "<table border=\"1\" class=\"dataframe\">\n",
 | ||
|        "  <thead>\n",
 | ||
|        "    <tr style=\"text-align: right;\">\n",
 | ||
|        "      <th></th>\n",
 | ||
|        "      <th>topic_0</th>\n",
 | ||
|        "      <th>topic_1</th>\n",
 | ||
|        "      <th>topic_2</th>\n",
 | ||
|        "      <th>topic_3</th>\n",
 | ||
|        "      <th>topic_4</th>\n",
 | ||
|        "      <th>topic_5</th>\n",
 | ||
|        "      <th>topic_6</th>\n",
 | ||
|        "      <th>topic_7</th>\n",
 | ||
|        "      <th>topic_8</th>\n",
 | ||
|        "      <th>topic_9</th>\n",
 | ||
|        "    </tr>\n",
 | ||
|        "  </thead>\n",
 | ||
|        "  <tbody>\n",
 | ||
|        "    <tr>\n",
 | ||
|        "      <th>0</th>\n",
 | ||
|        "      <td>neural</td>\n",
 | ||
|        "      <td>language</td>\n",
 | ||
|        "      <td>state</td>\n",
 | ||
|        "      <td>function</td>\n",
 | ||
|        "      <td>cost</td>\n",
 | ||
|        "      <td>publication</td>\n",
 | ||
|        "      <td>graph</td>\n",
 | ||
|        "      <td>llama</td>\n",
 | ||
|        "      <td>tangkhul</td>\n",
 | ||
|        "      <td>want</td>\n",
 | ||
|        "    </tr>\n",
 | ||
|        "    <tr>\n",
 | ||
|        "      <th>1</th>\n",
 | ||
|        "      <td>network</td>\n",
 | ||
|        "      <td>natural</td>\n",
 | ||
|        "      <td>rnn</td>\n",
 | ||
|        "      <td>distribution</td>\n",
 | ||
|        "      <td>function</td>\n",
 | ||
|        "      <td>april</td>\n",
 | ||
|        "      <td>computation</td>\n",
 | ||
|        "      <td>like</td>\n",
 | ||
|        "      <td>compound</td>\n",
 | ||
|        "      <td>edu</td>\n",
 | ||
|        "    </tr>\n",
 | ||
|        "    <tr>\n",
 | ||
|        "      <th>2</th>\n",
 | ||
|        "      <td>function</td>\n",
 | ||
|        "      <td>model</td>\n",
 | ||
|        "      <td>memory</td>\n",
 | ||
|        "      <td>output</td>\n",
 | ||
|        "      <td>sgd</td>\n",
 | ||
|        "      <td>syst</td>\n",
 | ||
|        "      <td>node</td>\n",
 | ||
|        "      <td>south</td>\n",
 | ||
|        "      <td>root</td>\n",
 | ||
|        "      <td>dsontagcoursesinferenceslidespseudolikelihoodn...</td>\n",
 | ||
|        "    </tr>\n",
 | ||
|        "    <tr>\n",
 | ||
|        "      <th>3</th>\n",
 | ||
|        "      <td>networks</td>\n",
 | ||
|        "      <td>word</td>\n",
 | ||
|        "      <td>vector</td>\n",
 | ||
|        "      <td>class</td>\n",
 | ||
|        "      <td>training</td>\n",
 | ||
|        "      <td>technol</td>\n",
 | ||
|        "      <td>nodes</td>\n",
 | ||
|        "      <td>animal</td>\n",
 | ||
|        "      <td>morphological</td>\n",
 | ||
|        "      <td>regardlessly</td>\n",
 | ||
|        "    </tr>\n",
 | ||
|        "    <tr>\n",
 | ||
|        "      <th>4</th>\n",
 | ||
|        "      <td>one</td>\n",
 | ||
|        "      <td>planning</td>\n",
 | ||
|        "      <td>input</td>\n",
 | ||
|        "      <td>tanh</td>\n",
 | ||
|        "      <td>expected</td>\n",
 | ||
|        "      <td>date</td>\n",
 | ||
|        "      <td>backward</td>\n",
 | ||
|        "      <td>america</td>\n",
 | ||
|        "      <td>verbs</td>\n",
 | ||
|        "      <td>satisfied</td>\n",
 | ||
|        "    </tr>\n",
 | ||
|        "    <tr>\n",
 | ||
|        "      <th>5</th>\n",
 | ||
|        "      <td>input</td>\n",
 | ||
|        "      <td>words</td>\n",
 | ||
|        "      <td>network</td>\n",
 | ||
|        "      <td>data</td>\n",
 | ||
|        "      <td>optimization</td>\n",
 | ||
|        "      <td>vol</td>\n",
 | ||
|        "      <td>function</td>\n",
 | ||
|        "      <td>translation</td>\n",
 | ||
|        "      <td>noun</td>\n",
 | ||
|        "      <td>november</td>\n",
 | ||
|        "    </tr>\n",
 | ||
|        "    <tr>\n",
 | ||
|        "      <th>6</th>\n",
 | ||
|        "      <td>vector</td>\n",
 | ||
|        "      <td>based</td>\n",
 | ||
|        "      <td>recurrent</td>\n",
 | ||
|        "      <td>yˆ</td>\n",
 | ||
|        "      <td>algorithm</td>\n",
 | ||
|        "      <td>intell</td>\n",
 | ||
|        "      <td>backpropagation</td>\n",
 | ||
|        "      <td>french</td>\n",
 | ||
|        "      <td>roots</td>\n",
 | ||
|        "      <td>tune</td>\n",
 | ||
|        "    </tr>\n",
 | ||
|        "    <tr>\n",
 | ||
|        "      <th>7</th>\n",
 | ||
|        "      <td>language</td>\n",
 | ||
|        "      <td>processing</td>\n",
 | ||
|        "      <td>sequence</td>\n",
 | ||
|        "      <td>loss</td>\n",
 | ||
|        "      <td>set</td>\n",
 | ||
|        "      <td>acm</td>\n",
 | ||
|        "      <td>algorithm</td>\n",
 | ||
|        "      <td>cute</td>\n",
 | ||
|        "      <td>adjectives</td>\n",
 | ||
|        "      <td>return</td>\n",
 | ||
|        "    </tr>\n",
 | ||
|        "    <tr>\n",
 | ||
|        "      <th>8</th>\n",
 | ||
|        "      <td>model</td>\n",
 | ||
|        "      <td>models</td>\n",
 | ||
|        "      <td>neural</td>\n",
 | ||
|        "      <td>activation</td>\n",
 | ||
|        "      <td>validation</td>\n",
 | ||
|        "      <td>article</td>\n",
 | ||
|        "      <td>parameters</td>\n",
 | ||
|        "      <td>google</td>\n",
 | ||
|        "      <td>formation</td>\n",
 | ||
|        "      <td>fully</td>\n",
 | ||
|        "    </tr>\n",
 | ||
|        "    <tr>\n",
 | ||
|        "      <th>9</th>\n",
 | ||
|        "      <td>training</td>\n",
 | ||
|        "      <td>data</td>\n",
 | ||
|        "      <td>lstm</td>\n",
 | ||
|        "      <td>softmax</td>\n",
 | ||
|        "      <td>rate</td>\n",
 | ||
|        "      <td>trans</td>\n",
 | ||
|        "      <td>output</td>\n",
 | ||
|        "      <td>domesticated</td>\n",
 | ||
|        "      <td>language</td>\n",
 | ||
|        "      <td>results</td>\n",
 | ||
|        "    </tr>\n",
 | ||
|        "  </tbody>\n",
 | ||
|        "</table>\n",
 | ||
|        "</div>"
 | ||
|       ],
 | ||
|       "text/plain": [
 | ||
|        "    topic_0     topic_1    topic_2       topic_3       topic_4      topic_5  \\\n",
 | ||
|        "0    neural    language      state      function          cost  publication   \n",
 | ||
|        "1   network     natural        rnn  distribution      function        april   \n",
 | ||
|        "2  function       model     memory        output           sgd         syst   \n",
 | ||
|        "3  networks        word     vector         class      training      technol   \n",
 | ||
|        "4       one    planning      input          tanh      expected         date   \n",
 | ||
|        "5     input       words    network          data  optimization          vol   \n",
 | ||
|        "6    vector       based  recurrent            yˆ     algorithm       intell   \n",
 | ||
|        "7  language  processing   sequence          loss           set          acm   \n",
 | ||
|        "8     model      models     neural    activation    validation      article   \n",
 | ||
|        "9  training        data       lstm       softmax          rate        trans   \n",
 | ||
|        "\n",
 | ||
|        "           topic_6       topic_7        topic_8  \\\n",
 | ||
|        "0            graph         llama       tangkhul   \n",
 | ||
|        "1      computation          like       compound   \n",
 | ||
|        "2             node         south           root   \n",
 | ||
|        "3            nodes        animal  morphological   \n",
 | ||
|        "4         backward       america          verbs   \n",
 | ||
|        "5         function   translation           noun   \n",
 | ||
|        "6  backpropagation        french          roots   \n",
 | ||
|        "7        algorithm          cute     adjectives   \n",
 | ||
|        "8       parameters        google      formation   \n",
 | ||
|        "9           output  domesticated       language   \n",
 | ||
|        "\n",
 | ||
|        "                                             topic_9  \n",
 | ||
|        "0                                               want  \n",
 | ||
|        "1                                                edu  \n",
 | ||
|        "2  dsontagcoursesinferenceslidespseudolikelihoodn...  \n",
 | ||
|        "3                                       regardlessly  \n",
 | ||
|        "4                                          satisfied  \n",
 | ||
|        "5                                           november  \n",
 | ||
|        "6                                               tune  \n",
 | ||
|        "7                                             return  \n",
 | ||
|        "8                                              fully  \n",
 | ||
|        "9                                            results  "
 | ||
|       ]
 | ||
|      },
 | ||
|      "metadata": {},
 | ||
|      "output_type": "display_data"
 | ||
|     }
 | ||
|    ],
 | ||
|    "source": [
 | ||
|     "display(topic_info)"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "markdown",
 | ||
|    "metadata": {},
 | ||
|    "source": [
 | ||
|     "### Visualize Topics"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 13,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [
 | ||
|     {
 | ||
|      "data": {
 | ||
|       "application/vnd.plotly.v1+json": {
 | ||
|        "config": {
 | ||
|         "plotlyServerURL": "https://plot.ly"
 | ||
|        },
 | ||
|        "data": [
 | ||
|         {
 | ||
|          "customdata": [
 | ||
|           [
 | ||
|            0,
 | ||
|            "language | natural | model | word | planning",
 | ||
|            723
 | ||
|           ],
 | ||
|           [
 | ||
|            1,
 | ||
|            "state | rnn | memory | vector | input",
 | ||
|            198
 | ||
|           ],
 | ||
|           [
 | ||
|            2,
 | ||
|            "function | distribution | output | class | tanh",
 | ||
|            122
 | ||
|           ],
 | ||
|           [
 | ||
|            3,
 | ||
|            "cost | function | sgd | training | expected",
 | ||
|            61
 | ||
|           ],
 | ||
|           [
 | ||
|            4,
 | ||
|            "publication | april | syst | technol | date",
 | ||
|            57
 | ||
|           ],
 | ||
|           [
 | ||
|            5,
 | ||
|            "graph | computation | node | nodes | backward",
 | ||
|            46
 | ||
|           ],
 | ||
|           [
 | ||
|            6,
 | ||
|            "llama | like | south | animal | america",
 | ||
|            29
 | ||
|           ],
 | ||
|           [
 | ||
|            7,
 | ||
|            "tangkhul | compound | root | morphological | verbs",
 | ||
|            17
 | ||
|           ],
 | ||
|           [
 | ||
|            8,
 | ||
|            "want | edu | dsontagcoursesinferenceslidespseudolikelihoodnotespdf | regardlessly | satisfied",
 | ||
|            13
 | ||
|           ]
 | ||
|          ],
 | ||
|          "hovertemplate": "<b>Topic %{customdata[0]}</b><br>%{customdata[1]}<br>Size: %{customdata[2]}",
 | ||
|          "legendgroup": "",
 | ||
|          "marker": {
 | ||
|           "color": "#B0BEC5",
 | ||
|           "line": {
 | ||
|            "color": "DarkSlateGrey",
 | ||
|            "width": 2
 | ||
|           },
 | ||
|           "size": [
 | ||
|            723,
 | ||
|            198,
 | ||
|            122,
 | ||
|            61,
 | ||
|            57,
 | ||
|            46,
 | ||
|            29,
 | ||
|            17,
 | ||
|            13
 | ||
|           ],
 | ||
|           "sizemode": "area",
 | ||
|           "sizeref": 0.451875,
 | ||
|           "symbol": "circle"
 | ||
|          },
 | ||
|          "mode": "markers",
 | ||
|          "name": "",
 | ||
|          "orientation": "v",
 | ||
|          "showlegend": false,
 | ||
|          "type": "scatter",
 | ||
|          "x": [
 | ||
|           14.759990692138672,
 | ||
|           14.329012870788574,
 | ||
|           10.99558162689209,
 | ||
|           9.891719818115234,
 | ||
|           11.191701889038086,
 | ||
|           9.449606895446777,
 | ||
|           11.662773132324219,
 | ||
|           14.039092063903809,
 | ||
|           12.023329734802246
 | ||
|          ],
 | ||
|          "xaxis": "x",
 | ||
|          "y": [
 | ||
|           1.6729466915130615,
 | ||
|           2.2927768230438232,
 | ||
|           5.36309289932251,
 | ||
|           5.59792423248291,
 | ||
|           4.721500873565674,
 | ||
|           5.3096089363098145,
 | ||
|           5.3371052742004395,
 | ||
|           1.8039934635162354,
 | ||
|           4.149565696716309
 | ||
|          ],
 | ||
|          "yaxis": "y"
 | ||
|         }
 | ||
|        ],
 | ||
|        "layout": {
 | ||
|         "annotations": [
 | ||
|          {
 | ||
|           "showarrow": false,
 | ||
|           "text": "D1",
 | ||
|           "x": 8.03216586112976,
 | ||
|           "y": 3.929808777570724,
 | ||
|           "yshift": 10
 | ||
|          },
 | ||
|          {
 | ||
|           "showarrow": false,
 | ||
|           "text": "D2",
 | ||
|           "x": 12.503077578544616,
 | ||
|           "xshift": 10,
 | ||
|           "y": 6.437612867355346
 | ||
|          }
 | ||
|         ],
 | ||
|         "height": 650,
 | ||
|         "hoverlabel": {
 | ||
|          "bgcolor": "white",
 | ||
|          "font": {
 | ||
|           "family": "Rockwell",
 | ||
|           "size": 16
 | ||
|          }
 | ||
|         },
 | ||
|         "legend": {
 | ||
|          "itemsizing": "constant",
 | ||
|          "tracegroupgap": 0
 | ||
|         },
 | ||
|         "margin": {
 | ||
|          "t": 60
 | ||
|         },
 | ||
|         "shapes": [
 | ||
|          {
 | ||
|           "line": {
 | ||
|            "color": "#CFD8DC",
 | ||
|            "width": 2
 | ||
|           },
 | ||
|           "type": "line",
 | ||
|           "x0": 12.503077578544616,
 | ||
|           "x1": 12.503077578544616,
 | ||
|           "y0": 1.4220046877861023,
 | ||
|           "y1": 6.437612867355346
 | ||
|          },
 | ||
|          {
 | ||
|           "line": {
 | ||
|            "color": "#9E9E9E",
 | ||
|            "width": 2
 | ||
|           },
 | ||
|           "type": "line",
 | ||
|           "x0": 8.03216586112976,
 | ||
|           "x1": 16.973989295959473,
 | ||
|           "y0": 3.929808777570724,
 | ||
|           "y1": 3.929808777570724
 | ||
|          }
 | ||
|         ],
 | ||
|         "sliders": [
 | ||
|          {
 | ||
|           "active": 0,
 | ||
|           "pad": {
 | ||
|            "t": 50
 | ||
|           },
 | ||
|           "steps": [
 | ||
|            {
 | ||
|             "args": [
 | ||
|              {
 | ||
|               "marker.color": [
 | ||
|                [
 | ||
|                 "red",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5"
 | ||
|                ]
 | ||
|               ]
 | ||
|              }
 | ||
|             ],
 | ||
|             "label": "Topic 0",
 | ||
|             "method": "update"
 | ||
|            },
 | ||
|            {
 | ||
|             "args": [
 | ||
|              {
 | ||
|               "marker.color": [
 | ||
|                [
 | ||
|                 "#B0BEC5",
 | ||
|                 "red",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5"
 | ||
|                ]
 | ||
|               ]
 | ||
|              }
 | ||
|             ],
 | ||
|             "label": "Topic 1",
 | ||
|             "method": "update"
 | ||
|            },
 | ||
|            {
 | ||
|             "args": [
 | ||
|              {
 | ||
|               "marker.color": [
 | ||
|                [
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "red",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5"
 | ||
|                ]
 | ||
|               ]
 | ||
|              }
 | ||
|             ],
 | ||
|             "label": "Topic 2",
 | ||
|             "method": "update"
 | ||
|            },
 | ||
|            {
 | ||
|             "args": [
 | ||
|              {
 | ||
|               "marker.color": [
 | ||
|                [
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "red",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5"
 | ||
|                ]
 | ||
|               ]
 | ||
|              }
 | ||
|             ],
 | ||
|             "label": "Topic 3",
 | ||
|             "method": "update"
 | ||
|            },
 | ||
|            {
 | ||
|             "args": [
 | ||
|              {
 | ||
|               "marker.color": [
 | ||
|                [
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "red",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5"
 | ||
|                ]
 | ||
|               ]
 | ||
|              }
 | ||
|             ],
 | ||
|             "label": "Topic 4",
 | ||
|             "method": "update"
 | ||
|            },
 | ||
|            {
 | ||
|             "args": [
 | ||
|              {
 | ||
|               "marker.color": [
 | ||
|                [
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "red",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5"
 | ||
|                ]
 | ||
|               ]
 | ||
|              }
 | ||
|             ],
 | ||
|             "label": "Topic 5",
 | ||
|             "method": "update"
 | ||
|            },
 | ||
|            {
 | ||
|             "args": [
 | ||
|              {
 | ||
|               "marker.color": [
 | ||
|                [
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "red",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5"
 | ||
|                ]
 | ||
|               ]
 | ||
|              }
 | ||
|             ],
 | ||
|             "label": "Topic 6",
 | ||
|             "method": "update"
 | ||
|            },
 | ||
|            {
 | ||
|             "args": [
 | ||
|              {
 | ||
|               "marker.color": [
 | ||
|                [
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "red",
 | ||
|                 "#B0BEC5"
 | ||
|                ]
 | ||
|               ]
 | ||
|              }
 | ||
|             ],
 | ||
|             "label": "Topic 7",
 | ||
|             "method": "update"
 | ||
|            },
 | ||
|            {
 | ||
|             "args": [
 | ||
|              {
 | ||
|               "marker.color": [
 | ||
|                [
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "#B0BEC5",
 | ||
|                 "red"
 | ||
|                ]
 | ||
|               ]
 | ||
|              }
 | ||
|             ],
 | ||
|             "label": "Topic 8",
 | ||
|             "method": "update"
 | ||
|            }
 | ||
|           ]
 | ||
|          }
 | ||
|         ],
 | ||
|         "template": {
 | ||
|          "data": {
 | ||
|           "bar": [
 | ||
|            {
 | ||
|             "error_x": {
 | ||
|              "color": "rgb(36,36,36)"
 | ||
|             },
 | ||
|             "error_y": {
 | ||
|              "color": "rgb(36,36,36)"
 | ||
|             },
 | ||
|             "marker": {
 | ||
|              "line": {
 | ||
|               "color": "white",
 | ||
|               "width": 0.5
 | ||
|              },
 | ||
|              "pattern": {
 | ||
|               "fillmode": "overlay",
 | ||
|               "size": 10,
 | ||
|               "solidity": 0.2
 | ||
|              }
 | ||
|             },
 | ||
|             "type": "bar"
 | ||
|            }
 | ||
|           ],
 | ||
|           "barpolar": [
 | ||
|            {
 | ||
|             "marker": {
 | ||
|              "line": {
 | ||
|               "color": "white",
 | ||
|               "width": 0.5
 | ||
|              },
 | ||
|              "pattern": {
 | ||
|               "fillmode": "overlay",
 | ||
|               "size": 10,
 | ||
|               "solidity": 0.2
 | ||
|              }
 | ||
|             },
 | ||
|             "type": "barpolar"
 | ||
|            }
 | ||
|           ],
 | ||
|           "carpet": [
 | ||
|            {
 | ||
|             "aaxis": {
 | ||
|              "endlinecolor": "rgb(36,36,36)",
 | ||
|              "gridcolor": "white",
 | ||
|              "linecolor": "white",
 | ||
|              "minorgridcolor": "white",
 | ||
|              "startlinecolor": "rgb(36,36,36)"
 | ||
|             },
 | ||
|             "baxis": {
 | ||
|              "endlinecolor": "rgb(36,36,36)",
 | ||
|              "gridcolor": "white",
 | ||
|              "linecolor": "white",
 | ||
|              "minorgridcolor": "white",
 | ||
|              "startlinecolor": "rgb(36,36,36)"
 | ||
|             },
 | ||
|             "type": "carpet"
 | ||
|            }
 | ||
|           ],
 | ||
|           "choropleth": [
 | ||
|            {
 | ||
|             "colorbar": {
 | ||
|              "outlinewidth": 1,
 | ||
|              "tickcolor": "rgb(36,36,36)",
 | ||
|              "ticks": "outside"
 | ||
|             },
 | ||
|             "type": "choropleth"
 | ||
|            }
 | ||
|           ],
 | ||
|           "contour": [
 | ||
|            {
 | ||
|             "colorbar": {
 | ||
|              "outlinewidth": 1,
 | ||
|              "tickcolor": "rgb(36,36,36)",
 | ||
|              "ticks": "outside"
 | ||
|             },
 | ||
|             "colorscale": [
 | ||
|              [
 | ||
|               0,
 | ||
|               "#440154"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.1111111111111111,
 | ||
|               "#482878"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.2222222222222222,
 | ||
|               "#3e4989"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.3333333333333333,
 | ||
|               "#31688e"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.4444444444444444,
 | ||
|               "#26828e"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.5555555555555556,
 | ||
|               "#1f9e89"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.6666666666666666,
 | ||
|               "#35b779"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.7777777777777778,
 | ||
|               "#6ece58"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.8888888888888888,
 | ||
|               "#b5de2b"
 | ||
|              ],
 | ||
|              [
 | ||
|               1,
 | ||
|               "#fde725"
 | ||
|              ]
 | ||
|             ],
 | ||
|             "type": "contour"
 | ||
|            }
 | ||
|           ],
 | ||
|           "contourcarpet": [
 | ||
|            {
 | ||
|             "colorbar": {
 | ||
|              "outlinewidth": 1,
 | ||
|              "tickcolor": "rgb(36,36,36)",
 | ||
|              "ticks": "outside"
 | ||
|             },
 | ||
|             "type": "contourcarpet"
 | ||
|            }
 | ||
|           ],
 | ||
|           "heatmap": [
 | ||
|            {
 | ||
|             "colorbar": {
 | ||
|              "outlinewidth": 1,
 | ||
|              "tickcolor": "rgb(36,36,36)",
 | ||
|              "ticks": "outside"
 | ||
|             },
 | ||
|             "colorscale": [
 | ||
|              [
 | ||
|               0,
 | ||
|               "#440154"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.1111111111111111,
 | ||
|               "#482878"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.2222222222222222,
 | ||
|               "#3e4989"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.3333333333333333,
 | ||
|               "#31688e"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.4444444444444444,
 | ||
|               "#26828e"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.5555555555555556,
 | ||
|               "#1f9e89"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.6666666666666666,
 | ||
|               "#35b779"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.7777777777777778,
 | ||
|               "#6ece58"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.8888888888888888,
 | ||
|               "#b5de2b"
 | ||
|              ],
 | ||
|              [
 | ||
|               1,
 | ||
|               "#fde725"
 | ||
|              ]
 | ||
|             ],
 | ||
|             "type": "heatmap"
 | ||
|            }
 | ||
|           ],
 | ||
|           "heatmapgl": [
 | ||
|            {
 | ||
|             "colorbar": {
 | ||
|              "outlinewidth": 1,
 | ||
|              "tickcolor": "rgb(36,36,36)",
 | ||
|              "ticks": "outside"
 | ||
|             },
 | ||
|             "colorscale": [
 | ||
|              [
 | ||
|               0,
 | ||
|               "#440154"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.1111111111111111,
 | ||
|               "#482878"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.2222222222222222,
 | ||
|               "#3e4989"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.3333333333333333,
 | ||
|               "#31688e"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.4444444444444444,
 | ||
|               "#26828e"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.5555555555555556,
 | ||
|               "#1f9e89"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.6666666666666666,
 | ||
|               "#35b779"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.7777777777777778,
 | ||
|               "#6ece58"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.8888888888888888,
 | ||
|               "#b5de2b"
 | ||
|              ],
 | ||
|              [
 | ||
|               1,
 | ||
|               "#fde725"
 | ||
|              ]
 | ||
|             ],
 | ||
|             "type": "heatmapgl"
 | ||
|            }
 | ||
|           ],
 | ||
|           "histogram": [
 | ||
|            {
 | ||
|             "marker": {
 | ||
|              "line": {
 | ||
|               "color": "white",
 | ||
|               "width": 0.6
 | ||
|              }
 | ||
|             },
 | ||
|             "type": "histogram"
 | ||
|            }
 | ||
|           ],
 | ||
|           "histogram2d": [
 | ||
|            {
 | ||
|             "colorbar": {
 | ||
|              "outlinewidth": 1,
 | ||
|              "tickcolor": "rgb(36,36,36)",
 | ||
|              "ticks": "outside"
 | ||
|             },
 | ||
|             "colorscale": [
 | ||
|              [
 | ||
|               0,
 | ||
|               "#440154"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.1111111111111111,
 | ||
|               "#482878"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.2222222222222222,
 | ||
|               "#3e4989"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.3333333333333333,
 | ||
|               "#31688e"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.4444444444444444,
 | ||
|               "#26828e"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.5555555555555556,
 | ||
|               "#1f9e89"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.6666666666666666,
 | ||
|               "#35b779"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.7777777777777778,
 | ||
|               "#6ece58"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.8888888888888888,
 | ||
|               "#b5de2b"
 | ||
|              ],
 | ||
|              [
 | ||
|               1,
 | ||
|               "#fde725"
 | ||
|              ]
 | ||
|             ],
 | ||
|             "type": "histogram2d"
 | ||
|            }
 | ||
|           ],
 | ||
|           "histogram2dcontour": [
 | ||
|            {
 | ||
|             "colorbar": {
 | ||
|              "outlinewidth": 1,
 | ||
|              "tickcolor": "rgb(36,36,36)",
 | ||
|              "ticks": "outside"
 | ||
|             },
 | ||
|             "colorscale": [
 | ||
|              [
 | ||
|               0,
 | ||
|               "#440154"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.1111111111111111,
 | ||
|               "#482878"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.2222222222222222,
 | ||
|               "#3e4989"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.3333333333333333,
 | ||
|               "#31688e"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.4444444444444444,
 | ||
|               "#26828e"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.5555555555555556,
 | ||
|               "#1f9e89"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.6666666666666666,
 | ||
|               "#35b779"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.7777777777777778,
 | ||
|               "#6ece58"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.8888888888888888,
 | ||
|               "#b5de2b"
 | ||
|              ],
 | ||
|              [
 | ||
|               1,
 | ||
|               "#fde725"
 | ||
|              ]
 | ||
|             ],
 | ||
|             "type": "histogram2dcontour"
 | ||
|            }
 | ||
|           ],
 | ||
|           "mesh3d": [
 | ||
|            {
 | ||
|             "colorbar": {
 | ||
|              "outlinewidth": 1,
 | ||
|              "tickcolor": "rgb(36,36,36)",
 | ||
|              "ticks": "outside"
 | ||
|             },
 | ||
|             "type": "mesh3d"
 | ||
|            }
 | ||
|           ],
 | ||
|           "parcoords": [
 | ||
|            {
 | ||
|             "line": {
 | ||
|              "colorbar": {
 | ||
|               "outlinewidth": 1,
 | ||
|               "tickcolor": "rgb(36,36,36)",
 | ||
|               "ticks": "outside"
 | ||
|              }
 | ||
|             },
 | ||
|             "type": "parcoords"
 | ||
|            }
 | ||
|           ],
 | ||
|           "pie": [
 | ||
|            {
 | ||
|             "automargin": true,
 | ||
|             "type": "pie"
 | ||
|            }
 | ||
|           ],
 | ||
|           "scatter": [
 | ||
|            {
 | ||
|             "fillpattern": {
 | ||
|              "fillmode": "overlay",
 | ||
|              "size": 10,
 | ||
|              "solidity": 0.2
 | ||
|             },
 | ||
|             "type": "scatter"
 | ||
|            }
 | ||
|           ],
 | ||
|           "scatter3d": [
 | ||
|            {
 | ||
|             "line": {
 | ||
|              "colorbar": {
 | ||
|               "outlinewidth": 1,
 | ||
|               "tickcolor": "rgb(36,36,36)",
 | ||
|               "ticks": "outside"
 | ||
|              }
 | ||
|             },
 | ||
|             "marker": {
 | ||
|              "colorbar": {
 | ||
|               "outlinewidth": 1,
 | ||
|               "tickcolor": "rgb(36,36,36)",
 | ||
|               "ticks": "outside"
 | ||
|              }
 | ||
|             },
 | ||
|             "type": "scatter3d"
 | ||
|            }
 | ||
|           ],
 | ||
|           "scattercarpet": [
 | ||
|            {
 | ||
|             "marker": {
 | ||
|              "colorbar": {
 | ||
|               "outlinewidth": 1,
 | ||
|               "tickcolor": "rgb(36,36,36)",
 | ||
|               "ticks": "outside"
 | ||
|              }
 | ||
|             },
 | ||
|             "type": "scattercarpet"
 | ||
|            }
 | ||
|           ],
 | ||
|           "scattergeo": [
 | ||
|            {
 | ||
|             "marker": {
 | ||
|              "colorbar": {
 | ||
|               "outlinewidth": 1,
 | ||
|               "tickcolor": "rgb(36,36,36)",
 | ||
|               "ticks": "outside"
 | ||
|              }
 | ||
|             },
 | ||
|             "type": "scattergeo"
 | ||
|            }
 | ||
|           ],
 | ||
|           "scattergl": [
 | ||
|            {
 | ||
|             "marker": {
 | ||
|              "colorbar": {
 | ||
|               "outlinewidth": 1,
 | ||
|               "tickcolor": "rgb(36,36,36)",
 | ||
|               "ticks": "outside"
 | ||
|              }
 | ||
|             },
 | ||
|             "type": "scattergl"
 | ||
|            }
 | ||
|           ],
 | ||
|           "scattermapbox": [
 | ||
|            {
 | ||
|             "marker": {
 | ||
|              "colorbar": {
 | ||
|               "outlinewidth": 1,
 | ||
|               "tickcolor": "rgb(36,36,36)",
 | ||
|               "ticks": "outside"
 | ||
|              }
 | ||
|             },
 | ||
|             "type": "scattermapbox"
 | ||
|            }
 | ||
|           ],
 | ||
|           "scatterpolar": [
 | ||
|            {
 | ||
|             "marker": {
 | ||
|              "colorbar": {
 | ||
|               "outlinewidth": 1,
 | ||
|               "tickcolor": "rgb(36,36,36)",
 | ||
|               "ticks": "outside"
 | ||
|              }
 | ||
|             },
 | ||
|             "type": "scatterpolar"
 | ||
|            }
 | ||
|           ],
 | ||
|           "scatterpolargl": [
 | ||
|            {
 | ||
|             "marker": {
 | ||
|              "colorbar": {
 | ||
|               "outlinewidth": 1,
 | ||
|               "tickcolor": "rgb(36,36,36)",
 | ||
|               "ticks": "outside"
 | ||
|              }
 | ||
|             },
 | ||
|             "type": "scatterpolargl"
 | ||
|            }
 | ||
|           ],
 | ||
|           "scatterternary": [
 | ||
|            {
 | ||
|             "marker": {
 | ||
|              "colorbar": {
 | ||
|               "outlinewidth": 1,
 | ||
|               "tickcolor": "rgb(36,36,36)",
 | ||
|               "ticks": "outside"
 | ||
|              }
 | ||
|             },
 | ||
|             "type": "scatterternary"
 | ||
|            }
 | ||
|           ],
 | ||
|           "surface": [
 | ||
|            {
 | ||
|             "colorbar": {
 | ||
|              "outlinewidth": 1,
 | ||
|              "tickcolor": "rgb(36,36,36)",
 | ||
|              "ticks": "outside"
 | ||
|             },
 | ||
|             "colorscale": [
 | ||
|              [
 | ||
|               0,
 | ||
|               "#440154"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.1111111111111111,
 | ||
|               "#482878"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.2222222222222222,
 | ||
|               "#3e4989"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.3333333333333333,
 | ||
|               "#31688e"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.4444444444444444,
 | ||
|               "#26828e"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.5555555555555556,
 | ||
|               "#1f9e89"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.6666666666666666,
 | ||
|               "#35b779"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.7777777777777778,
 | ||
|               "#6ece58"
 | ||
|              ],
 | ||
|              [
 | ||
|               0.8888888888888888,
 | ||
|               "#b5de2b"
 | ||
|              ],
 | ||
|              [
 | ||
|               1,
 | ||
|               "#fde725"
 | ||
|              ]
 | ||
|             ],
 | ||
|             "type": "surface"
 | ||
|            }
 | ||
|           ],
 | ||
|           "table": [
 | ||
|            {
 | ||
|             "cells": {
 | ||
|              "fill": {
 | ||
|               "color": "rgb(237,237,237)"
 | ||
|              },
 | ||
|              "line": {
 | ||
|               "color": "white"
 | ||
|              }
 | ||
|             },
 | ||
|             "header": {
 | ||
|              "fill": {
 | ||
|               "color": "rgb(217,217,217)"
 | ||
|              },
 | ||
|              "line": {
 | ||
|               "color": "white"
 | ||
|              }
 | ||
|             },
 | ||
|             "type": "table"
 | ||
|            }
 | ||
|           ]
 | ||
|          },
 | ||
|          "layout": {
 | ||
|           "annotationdefaults": {
 | ||
|            "arrowhead": 0,
 | ||
|            "arrowwidth": 1
 | ||
|           },
 | ||
|           "autotypenumbers": "strict",
 | ||
|           "coloraxis": {
 | ||
|            "colorbar": {
 | ||
|             "outlinewidth": 1,
 | ||
|             "tickcolor": "rgb(36,36,36)",
 | ||
|             "ticks": "outside"
 | ||
|            }
 | ||
|           },
 | ||
|           "colorscale": {
 | ||
|            "diverging": [
 | ||
|             [
 | ||
|              0,
 | ||
|              "rgb(103,0,31)"
 | ||
|             ],
 | ||
|             [
 | ||
|              0.1,
 | ||
|              "rgb(178,24,43)"
 | ||
|             ],
 | ||
|             [
 | ||
|              0.2,
 | ||
|              "rgb(214,96,77)"
 | ||
|             ],
 | ||
|             [
 | ||
|              0.3,
 | ||
|              "rgb(244,165,130)"
 | ||
|             ],
 | ||
|             [
 | ||
|              0.4,
 | ||
|              "rgb(253,219,199)"
 | ||
|             ],
 | ||
|             [
 | ||
|              0.5,
 | ||
|              "rgb(247,247,247)"
 | ||
|             ],
 | ||
|             [
 | ||
|              0.6,
 | ||
|              "rgb(209,229,240)"
 | ||
|             ],
 | ||
|             [
 | ||
|              0.7,
 | ||
|              "rgb(146,197,222)"
 | ||
|             ],
 | ||
|             [
 | ||
|              0.8,
 | ||
|              "rgb(67,147,195)"
 | ||
|             ],
 | ||
|             [
 | ||
|              0.9,
 | ||
|              "rgb(33,102,172)"
 | ||
|             ],
 | ||
|             [
 | ||
|              1,
 | ||
|              "rgb(5,48,97)"
 | ||
|             ]
 | ||
|            ],
 | ||
|            "sequential": [
 | ||
|             [
 | ||
|              0,
 | ||
|              "#440154"
 | ||
|             ],
 | ||
|             [
 | ||
|              0.1111111111111111,
 | ||
|              "#482878"
 | ||
|             ],
 | ||
|             [
 | ||
|              0.2222222222222222,
 | ||
|              "#3e4989"
 | ||
|             ],
 | ||
|             [
 | ||
|              0.3333333333333333,
 | ||
|              "#31688e"
 | ||
|             ],
 | ||
|             [
 | ||
|              0.4444444444444444,
 | ||
|              "#26828e"
 | ||
|             ],
 | ||
|             [
 | ||
|              0.5555555555555556,
 | ||
|              "#1f9e89"
 | ||
|             ],
 | ||
|             [
 | ||
|              0.6666666666666666,
 | ||
|              "#35b779"
 | ||
|             ],
 | ||
|             [
 | ||
|              0.7777777777777778,
 | ||
|              "#6ece58"
 | ||
|             ],
 | ||
|             [
 | ||
|              0.8888888888888888,
 | ||
|              "#b5de2b"
 | ||
|             ],
 | ||
|             [
 | ||
|              1,
 | ||
|              "#fde725"
 | ||
|             ]
 | ||
|            ],
 | ||
|            "sequentialminus": [
 | ||
|             [
 | ||
|              0,
 | ||
|              "#440154"
 | ||
|             ],
 | ||
|             [
 | ||
|              0.1111111111111111,
 | ||
|              "#482878"
 | ||
|             ],
 | ||
|             [
 | ||
|              0.2222222222222222,
 | ||
|              "#3e4989"
 | ||
|             ],
 | ||
|             [
 | ||
|              0.3333333333333333,
 | ||
|              "#31688e"
 | ||
|             ],
 | ||
|             [
 | ||
|              0.4444444444444444,
 | ||
|              "#26828e"
 | ||
|             ],
 | ||
|             [
 | ||
|              0.5555555555555556,
 | ||
|              "#1f9e89"
 | ||
|             ],
 | ||
|             [
 | ||
|              0.6666666666666666,
 | ||
|              "#35b779"
 | ||
|             ],
 | ||
|             [
 | ||
|              0.7777777777777778,
 | ||
|              "#6ece58"
 | ||
|             ],
 | ||
|             [
 | ||
|              0.8888888888888888,
 | ||
|              "#b5de2b"
 | ||
|             ],
 | ||
|             [
 | ||
|              1,
 | ||
|              "#fde725"
 | ||
|             ]
 | ||
|            ]
 | ||
|           },
 | ||
|           "colorway": [
 | ||
|            "#1F77B4",
 | ||
|            "#FF7F0E",
 | ||
|            "#2CA02C",
 | ||
|            "#D62728",
 | ||
|            "#9467BD",
 | ||
|            "#8C564B",
 | ||
|            "#E377C2",
 | ||
|            "#7F7F7F",
 | ||
|            "#BCBD22",
 | ||
|            "#17BECF"
 | ||
|           ],
 | ||
|           "font": {
 | ||
|            "color": "rgb(36,36,36)"
 | ||
|           },
 | ||
|           "geo": {
 | ||
|            "bgcolor": "white",
 | ||
|            "lakecolor": "white",
 | ||
|            "landcolor": "white",
 | ||
|            "showlakes": true,
 | ||
|            "showland": true,
 | ||
|            "subunitcolor": "white"
 | ||
|           },
 | ||
|           "hoverlabel": {
 | ||
|            "align": "left"
 | ||
|           },
 | ||
|           "hovermode": "closest",
 | ||
|           "mapbox": {
 | ||
|            "style": "light"
 | ||
|           },
 | ||
|           "paper_bgcolor": "white",
 | ||
|           "plot_bgcolor": "white",
 | ||
|           "polar": {
 | ||
|            "angularaxis": {
 | ||
|             "gridcolor": "rgb(232,232,232)",
 | ||
|             "linecolor": "rgb(36,36,36)",
 | ||
|             "showgrid": false,
 | ||
|             "showline": true,
 | ||
|             "ticks": "outside"
 | ||
|            },
 | ||
|            "bgcolor": "white",
 | ||
|            "radialaxis": {
 | ||
|             "gridcolor": "rgb(232,232,232)",
 | ||
|             "linecolor": "rgb(36,36,36)",
 | ||
|             "showgrid": false,
 | ||
|             "showline": true,
 | ||
|             "ticks": "outside"
 | ||
|            }
 | ||
|           },
 | ||
|           "scene": {
 | ||
|            "xaxis": {
 | ||
|             "backgroundcolor": "white",
 | ||
|             "gridcolor": "rgb(232,232,232)",
 | ||
|             "gridwidth": 2,
 | ||
|             "linecolor": "rgb(36,36,36)",
 | ||
|             "showbackground": true,
 | ||
|             "showgrid": false,
 | ||
|             "showline": true,
 | ||
|             "ticks": "outside",
 | ||
|             "zeroline": false,
 | ||
|             "zerolinecolor": "rgb(36,36,36)"
 | ||
|            },
 | ||
|            "yaxis": {
 | ||
|             "backgroundcolor": "white",
 | ||
|             "gridcolor": "rgb(232,232,232)",
 | ||
|             "gridwidth": 2,
 | ||
|             "linecolor": "rgb(36,36,36)",
 | ||
|             "showbackground": true,
 | ||
|             "showgrid": false,
 | ||
|             "showline": true,
 | ||
|             "ticks": "outside",
 | ||
|             "zeroline": false,
 | ||
|             "zerolinecolor": "rgb(36,36,36)"
 | ||
|            },
 | ||
|            "zaxis": {
 | ||
|             "backgroundcolor": "white",
 | ||
|             "gridcolor": "rgb(232,232,232)",
 | ||
|             "gridwidth": 2,
 | ||
|             "linecolor": "rgb(36,36,36)",
 | ||
|             "showbackground": true,
 | ||
|             "showgrid": false,
 | ||
|             "showline": true,
 | ||
|             "ticks": "outside",
 | ||
|             "zeroline": false,
 | ||
|             "zerolinecolor": "rgb(36,36,36)"
 | ||
|            }
 | ||
|           },
 | ||
|           "shapedefaults": {
 | ||
|            "fillcolor": "black",
 | ||
|            "line": {
 | ||
|             "width": 0
 | ||
|            },
 | ||
|            "opacity": 0.3
 | ||
|           },
 | ||
|           "ternary": {
 | ||
|            "aaxis": {
 | ||
|             "gridcolor": "rgb(232,232,232)",
 | ||
|             "linecolor": "rgb(36,36,36)",
 | ||
|             "showgrid": false,
 | ||
|             "showline": true,
 | ||
|             "ticks": "outside"
 | ||
|            },
 | ||
|            "baxis": {
 | ||
|             "gridcolor": "rgb(232,232,232)",
 | ||
|             "linecolor": "rgb(36,36,36)",
 | ||
|             "showgrid": false,
 | ||
|             "showline": true,
 | ||
|             "ticks": "outside"
 | ||
|            },
 | ||
|            "bgcolor": "white",
 | ||
|            "caxis": {
 | ||
|             "gridcolor": "rgb(232,232,232)",
 | ||
|             "linecolor": "rgb(36,36,36)",
 | ||
|             "showgrid": false,
 | ||
|             "showline": true,
 | ||
|             "ticks": "outside"
 | ||
|            }
 | ||
|           },
 | ||
|           "title": {
 | ||
|            "x": 0.05
 | ||
|           },
 | ||
|           "xaxis": {
 | ||
|            "automargin": true,
 | ||
|            "gridcolor": "rgb(232,232,232)",
 | ||
|            "linecolor": "rgb(36,36,36)",
 | ||
|            "showgrid": false,
 | ||
|            "showline": true,
 | ||
|            "ticks": "outside",
 | ||
|            "title": {
 | ||
|             "standoff": 15
 | ||
|            },
 | ||
|            "zeroline": false,
 | ||
|            "zerolinecolor": "rgb(36,36,36)"
 | ||
|           },
 | ||
|           "yaxis": {
 | ||
|            "automargin": true,
 | ||
|            "gridcolor": "rgb(232,232,232)",
 | ||
|            "linecolor": "rgb(36,36,36)",
 | ||
|            "showgrid": false,
 | ||
|            "showline": true,
 | ||
|            "ticks": "outside",
 | ||
|            "title": {
 | ||
|             "standoff": 15
 | ||
|            },
 | ||
|            "zeroline": false,
 | ||
|            "zerolinecolor": "rgb(36,36,36)"
 | ||
|           }
 | ||
|          }
 | ||
|         },
 | ||
|         "title": {
 | ||
|          "font": {
 | ||
|           "color": "Black",
 | ||
|           "size": 22
 | ||
|          },
 | ||
|          "text": "<b>Intertopic Distance Map</b>",
 | ||
|          "x": 0.5,
 | ||
|          "xanchor": "center",
 | ||
|          "y": 0.95,
 | ||
|          "yanchor": "top"
 | ||
|         },
 | ||
|         "width": 650,
 | ||
|         "xaxis": {
 | ||
|          "anchor": "y",
 | ||
|          "domain": [
 | ||
|           0,
 | ||
|           1
 | ||
|          ],
 | ||
|          "range": [
 | ||
|           8.03216586112976,
 | ||
|           16.973989295959473
 | ||
|          ],
 | ||
|          "title": {
 | ||
|           "text": ""
 | ||
|          },
 | ||
|          "visible": false
 | ||
|         },
 | ||
|         "yaxis": {
 | ||
|          "anchor": "x",
 | ||
|          "domain": [
 | ||
|           0,
 | ||
|           1
 | ||
|          ],
 | ||
|          "range": [
 | ||
|           1.4220046877861023,
 | ||
|           6.437612867355346
 | ||
|          ],
 | ||
|          "title": {
 | ||
|           "text": ""
 | ||
|          },
 | ||
|          "visible": false
 | ||
|         }
 | ||
|        }
 | ||
|       }
 | ||
|      },
 | ||
|      "metadata": {},
 | ||
|      "output_type": "display_data"
 | ||
|     }
 | ||
|    ],
 | ||
|    "source": [
 | ||
|     "topic_model.visualize_topics()"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": null,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [],
 | ||
|    "source": []
 | ||
|   }
 | ||
|  ],
 | ||
|  "metadata": {
 | ||
|   "kernelspec": {
 | ||
|    "display_name": "Python 3 (ipykernel)",
 | ||
|    "language": "python",
 | ||
|    "name": "python3"
 | ||
|   },
 | ||
|   "language_info": {
 | ||
|    "codemirror_mode": {
 | ||
|     "name": "ipython",
 | ||
|     "version": 3
 | ||
|    },
 | ||
|    "file_extension": ".py",
 | ||
|    "mimetype": "text/x-python",
 | ||
|    "name": "python",
 | ||
|    "nbconvert_exporter": "python",
 | ||
|    "pygments_lexer": "ipython3",
 | ||
|    "version": "3.8.15"
 | ||
|   }
 | ||
|  },
 | ||
|  "nbformat": 4,
 | ||
|  "nbformat_minor": 2
 | ||
| }
 |