Matt Robinson d9c035edb1
docs: no more bricks (#1967)
### Summary

We no longer use the "bricks" terminology for partitioning functions, etc
in the library. This PR updates various references to bricks within the
repo and the docs. This is just an initial pass to swap the terminology
out, it'll likely be helpful to reorganize the docs a bit as well.

---------

Co-authored-by: qued <64741807+qued@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
2023-11-02 09:43:26 -05:00

1847 lines
47 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Using [Unstructured.io](https://www.unstructured.io/) to process arXiv Papers and Perform Topic Modelling! "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Import General Use Packages"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import arxiv # Interact with arXiv api to scrape papers\n",
"from sentence_transformers import (\n",
" SentenceTransformer,\n",
") # Use Hugging Face Embedding for Topic Modelling\n",
"from bertopic import BERTopic # Package for Topic Modelling\n",
"from tqdm import tqdm # Progress Bar When Iterating\n",
"import glob # Identify Files in Directory\n",
"import os # Delete Files in Directory\n",
"import pandas as pd # Dataframe Manipulation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Import [Unstructured](https://unstructured-io.github.io/unstructured/installing.html) Functions"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from unstructured.partition.auto import partition # Base Function to Partition PDF\n",
"from unstructured.staging.base import (\n",
" convert_to_dict,\n",
") # Convert List Unstructured Elements Into List of Dicts for Easy Parsing\n",
"from unstructured.cleaners.core import (\n",
" clean,\n",
" remove_punctuation,\n",
" clean_non_ascii_chars,\n",
") # Cleaning Functions\n",
"import re # Create Custom Cleaning Function\n",
"import nltk # Toolkit for more advanced pre-processing\n",
"from nltk.corpus import stopwords # list of stopwords to remove\n",
"from typing import List # Type Hinting"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Setup NLTK"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] /Users/pravinsanthanam/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nltk.download(\"stopwords\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create Function to Extract PDFs About Machine Learning from arXiv"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def get_arxiv_paper_texts(query: str, max_results: int = 100) -> List[str]:\n",
" \"\"\"Function to Use arXiv API to Fetch Papers Related to Query, Download and Pre-Process\n",
"\n",
" Args:\n",
" query (str): query for arXiv API\n",
" max_results (int, optional): Number of Papers to get back. Defaults to 100.\n",
"\n",
" Returns:\n",
" paper_texts (list[str]): Return list of narrative texts for each paper\n",
" \"\"\"\n",
" # Get List of Arxiv Papers Matching Our Query\n",
" arxiv_papers = list(\n",
" arxiv.Search(\n",
" query=query,\n",
" max_results=max_results,\n",
" sort_by=arxiv.SortCriterion.Relevance,\n",
" sort_order=arxiv.SortOrder.Descending,\n",
" ).results()\n",
" )\n",
"\n",
" # Loop Through PDFs, Download and Pre-Process and Then Delete\n",
" paper_texts = []\n",
" for paper in tqdm(arxiv_papers):\n",
" paper.download_pdf()\n",
" pdf_file = glob.glob(\"*.pdf\")[0]\n",
" elements = partition(pdf_file) # Partition PDF Using Unstructured\n",
" isd = convert_to_dict(elements) # Convert List of Elements to List of Dictionaries\n",
" narrative_texts = [\n",
" element[\"text\"] for element in isd if element[\"type\"] == \"NarrativeText\"\n",
" ] # Only Keep Narrative Text and Combine Into One String\n",
" os.remove(pdf_file) # Delete PDF\n",
" paper_texts += narrative_texts\n",
" return paper_texts"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Run Scrape + PreProcess Function to Get List of Paper Text To Feed Through Topic Modelling Algorithm"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 10/10 [04:59<00:00, 29.92s/it]\n"
]
}
],
"source": [
"paper_texts = get_arxiv_paper_texts(query=\"natural language processing\", max_results=10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Run Narrative Texts Through Custom Cleaner Function Using Unstructured"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of Narrative Texts to Run Through Topic Modelling: 1711\n"
]
}
],
"source": [
"# Stopwords to Remove\n",
"stop_words = set(stopwords.words(\"english\"))\n",
"\n",
"\n",
"# Function to Apply Whatever Cleaning Functionality to Each Narrative Text Element\n",
"def custom_clean_function(narrative_text: str) -> str:\n",
" \"\"\"Apply Mix of Unstructured Cleaning Functions With Some Custom Functionality to Pre-Process Narrative Text\n",
"\n",
" Args:\n",
" narrative_text (str): Narrative Text or Any Other Sentence\n",
"\n",
" Returns:\n",
" cleaned_text (str): Text after going through all the cleaning procedures\n",
" \"\"\"\n",
" remove_numbers = lambda text: re.sub(\n",
" r\"\\d+\", \"\", text\n",
" ) # lambda function to remove all punctuation\n",
" cleaned_text = remove_numbers(narrative_text) # Apply Custom Lambda\n",
" cleaned_text = clean(\n",
" cleaned_text,\n",
" extra_whitespace=True,\n",
" dashes=True,\n",
" bullets=True,\n",
" trailing_punctuation=True,\n",
" lowercase=True,\n",
" ) # Apply Basic Clean Function With All the Options\n",
" cleaned_text = remove_punctuation(cleaned_text) # Remove all punctuation\n",
" cleaned_text = \" \".join(\n",
" [word for word in cleaned_text.split() if word not in stop_words]\n",
" ) # remove stop words\n",
" return cleaned_text\n",
"\n",
"\n",
"# Apply Function to Paper Texts\n",
"cleaned_paper_texts = [custom_clean_function(text) for text in paper_texts]\n",
"\n",
"# Count Narratve Texts\n",
"print(\n",
" \"Number of Narrative Texts to Run Through Topic Modelling: {}\".format(len(cleaned_paper_texts))\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Setup [BerTopic](https://maartengr.github.io/BERTopic/index.html)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Choose Which Hugging Face Model You Want to Use\n",
"sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n",
"\n",
"# Initialize Model\n",
"topic_model = BERTopic(embedding_model=sentence_model, top_n_words=10, nr_topics=10, verbose=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Run Document Text Through Topic Model To Get Major Topics Discussed in Narrative Texts"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a6ebe3cb185049bd8d37742f2451cbe0",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Batches: 0%| | 0/54 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2023-04-14 14:27:29,129 - BERTopic - Transformed documents to Embeddings\n",
"2023-04-14 14:27:33,621 - BERTopic - Reduced dimensionality\n",
"2023-04-14 14:27:33,647 - BERTopic - Clustered reduced embeddings\n",
"2023-04-14 14:27:34,255 - BERTopic - Reduced number of topics from 32 to 10\n"
]
}
],
"source": [
"# Fit Topic Model and Transform List of Paper Narrative Texts Into Topic and Probabilities\n",
"topic_model.fit(cleaned_paper_texts)\n",
"\n",
"# Store Document-Topic Info\n",
"doc_topic_info = topic_model.get_document_info(cleaned_paper_texts)\n",
"\n",
"# Store Topic Info\n",
"topic_info = pd.DataFrame(topic_model.get_topics())\n",
"topic_info = topic_info.applymap(lambda x: x[0])\n",
"topic_info.columns = [\"topic_{}\".format(col + 1) for col in topic_info.columns]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Checkout Keywords for Each Topic"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>topic_0</th>\n",
" <th>topic_1</th>\n",
" <th>topic_2</th>\n",
" <th>topic_3</th>\n",
" <th>topic_4</th>\n",
" <th>topic_5</th>\n",
" <th>topic_6</th>\n",
" <th>topic_7</th>\n",
" <th>topic_8</th>\n",
" <th>topic_9</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>neural</td>\n",
" <td>language</td>\n",
" <td>state</td>\n",
" <td>function</td>\n",
" <td>cost</td>\n",
" <td>publication</td>\n",
" <td>graph</td>\n",
" <td>llama</td>\n",
" <td>tangkhul</td>\n",
" <td>want</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>network</td>\n",
" <td>natural</td>\n",
" <td>rnn</td>\n",
" <td>distribution</td>\n",
" <td>function</td>\n",
" <td>april</td>\n",
" <td>computation</td>\n",
" <td>like</td>\n",
" <td>compound</td>\n",
" <td>edu</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>function</td>\n",
" <td>model</td>\n",
" <td>memory</td>\n",
" <td>output</td>\n",
" <td>sgd</td>\n",
" <td>syst</td>\n",
" <td>node</td>\n",
" <td>south</td>\n",
" <td>root</td>\n",
" <td>dsontagcoursesinferenceslidespseudolikelihoodn...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>networks</td>\n",
" <td>word</td>\n",
" <td>vector</td>\n",
" <td>class</td>\n",
" <td>training</td>\n",
" <td>technol</td>\n",
" <td>nodes</td>\n",
" <td>animal</td>\n",
" <td>morphological</td>\n",
" <td>regardlessly</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>one</td>\n",
" <td>planning</td>\n",
" <td>input</td>\n",
" <td>tanh</td>\n",
" <td>expected</td>\n",
" <td>date</td>\n",
" <td>backward</td>\n",
" <td>america</td>\n",
" <td>verbs</td>\n",
" <td>satisfied</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>input</td>\n",
" <td>words</td>\n",
" <td>network</td>\n",
" <td>data</td>\n",
" <td>optimization</td>\n",
" <td>vol</td>\n",
" <td>function</td>\n",
" <td>translation</td>\n",
" <td>noun</td>\n",
" <td>november</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>vector</td>\n",
" <td>based</td>\n",
" <td>recurrent</td>\n",
" <td>yˆ</td>\n",
" <td>algorithm</td>\n",
" <td>intell</td>\n",
" <td>backpropagation</td>\n",
" <td>french</td>\n",
" <td>roots</td>\n",
" <td>tune</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>language</td>\n",
" <td>processing</td>\n",
" <td>sequence</td>\n",
" <td>loss</td>\n",
" <td>set</td>\n",
" <td>acm</td>\n",
" <td>algorithm</td>\n",
" <td>cute</td>\n",
" <td>adjectives</td>\n",
" <td>return</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>model</td>\n",
" <td>models</td>\n",
" <td>neural</td>\n",
" <td>activation</td>\n",
" <td>validation</td>\n",
" <td>article</td>\n",
" <td>parameters</td>\n",
" <td>google</td>\n",
" <td>formation</td>\n",
" <td>fully</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>training</td>\n",
" <td>data</td>\n",
" <td>lstm</td>\n",
" <td>softmax</td>\n",
" <td>rate</td>\n",
" <td>trans</td>\n",
" <td>output</td>\n",
" <td>domesticated</td>\n",
" <td>language</td>\n",
" <td>results</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" topic_0 topic_1 topic_2 topic_3 topic_4 topic_5 \\\n",
"0 neural language state function cost publication \n",
"1 network natural rnn distribution function april \n",
"2 function model memory output sgd syst \n",
"3 networks word vector class training technol \n",
"4 one planning input tanh expected date \n",
"5 input words network data optimization vol \n",
"6 vector based recurrent yˆ algorithm intell \n",
"7 language processing sequence loss set acm \n",
"8 model models neural activation validation article \n",
"9 training data lstm softmax rate trans \n",
"\n",
" topic_6 topic_7 topic_8 \\\n",
"0 graph llama tangkhul \n",
"1 computation like compound \n",
"2 node south root \n",
"3 nodes animal morphological \n",
"4 backward america verbs \n",
"5 function translation noun \n",
"6 backpropagation french roots \n",
"7 algorithm cute adjectives \n",
"8 parameters google formation \n",
"9 output domesticated language \n",
"\n",
" topic_9 \n",
"0 want \n",
"1 edu \n",
"2 dsontagcoursesinferenceslidespseudolikelihoodn... \n",
"3 regardlessly \n",
"4 satisfied \n",
"5 november \n",
"6 tune \n",
"7 return \n",
"8 fully \n",
"9 results "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display(topic_info)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Visualize Topics"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"plotlyServerURL": "https://plot.ly"
},
"data": [
{
"customdata": [
[
0,
"language | natural | model | word | planning",
723
],
[
1,
"state | rnn | memory | vector | input",
198
],
[
2,
"function | distribution | output | class | tanh",
122
],
[
3,
"cost | function | sgd | training | expected",
61
],
[
4,
"publication | april | syst | technol | date",
57
],
[
5,
"graph | computation | node | nodes | backward",
46
],
[
6,
"llama | like | south | animal | america",
29
],
[
7,
"tangkhul | compound | root | morphological | verbs",
17
],
[
8,
"want | edu | dsontagcoursesinferenceslidespseudolikelihoodnotespdf | regardlessly | satisfied",
13
]
],
"hovertemplate": "<b>Topic %{customdata[0]}</b><br>%{customdata[1]}<br>Size: %{customdata[2]}",
"legendgroup": "",
"marker": {
"color": "#B0BEC5",
"line": {
"color": "DarkSlateGrey",
"width": 2
},
"size": [
723,
198,
122,
61,
57,
46,
29,
17,
13
],
"sizemode": "area",
"sizeref": 0.451875,
"symbol": "circle"
},
"mode": "markers",
"name": "",
"orientation": "v",
"showlegend": false,
"type": "scatter",
"x": [
14.759990692138672,
14.329012870788574,
10.99558162689209,
9.891719818115234,
11.191701889038086,
9.449606895446777,
11.662773132324219,
14.039092063903809,
12.023329734802246
],
"xaxis": "x",
"y": [
1.6729466915130615,
2.2927768230438232,
5.36309289932251,
5.59792423248291,
4.721500873565674,
5.3096089363098145,
5.3371052742004395,
1.8039934635162354,
4.149565696716309
],
"yaxis": "y"
}
],
"layout": {
"annotations": [
{
"showarrow": false,
"text": "D1",
"x": 8.03216586112976,
"y": 3.929808777570724,
"yshift": 10
},
{
"showarrow": false,
"text": "D2",
"x": 12.503077578544616,
"xshift": 10,
"y": 6.437612867355346
}
],
"height": 650,
"hoverlabel": {
"bgcolor": "white",
"font": {
"family": "Rockwell",
"size": 16
}
},
"legend": {
"itemsizing": "constant",
"tracegroupgap": 0
},
"margin": {
"t": 60
},
"shapes": [
{
"line": {
"color": "#CFD8DC",
"width": 2
},
"type": "line",
"x0": 12.503077578544616,
"x1": 12.503077578544616,
"y0": 1.4220046877861023,
"y1": 6.437612867355346
},
{
"line": {
"color": "#9E9E9E",
"width": 2
},
"type": "line",
"x0": 8.03216586112976,
"x1": 16.973989295959473,
"y0": 3.929808777570724,
"y1": 3.929808777570724
}
],
"sliders": [
{
"active": 0,
"pad": {
"t": 50
},
"steps": [
{
"args": [
{
"marker.color": [
[
"red",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5"
]
]
}
],
"label": "Topic 0",
"method": "update"
},
{
"args": [
{
"marker.color": [
[
"#B0BEC5",
"red",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5"
]
]
}
],
"label": "Topic 1",
"method": "update"
},
{
"args": [
{
"marker.color": [
[
"#B0BEC5",
"#B0BEC5",
"red",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5"
]
]
}
],
"label": "Topic 2",
"method": "update"
},
{
"args": [
{
"marker.color": [
[
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"red",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5"
]
]
}
],
"label": "Topic 3",
"method": "update"
},
{
"args": [
{
"marker.color": [
[
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"red",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5"
]
]
}
],
"label": "Topic 4",
"method": "update"
},
{
"args": [
{
"marker.color": [
[
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"red",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5"
]
]
}
],
"label": "Topic 5",
"method": "update"
},
{
"args": [
{
"marker.color": [
[
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"red",
"#B0BEC5",
"#B0BEC5"
]
]
}
],
"label": "Topic 6",
"method": "update"
},
{
"args": [
{
"marker.color": [
[
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"red",
"#B0BEC5"
]
]
}
],
"label": "Topic 7",
"method": "update"
},
{
"args": [
{
"marker.color": [
[
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"#B0BEC5",
"red"
]
]
}
],
"label": "Topic 8",
"method": "update"
}
]
}
],
"template": {
"data": {
"bar": [
{
"error_x": {
"color": "rgb(36,36,36)"
},
"error_y": {
"color": "rgb(36,36,36)"
},
"marker": {
"line": {
"color": "white",
"width": 0.5
},
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "bar"
}
],
"barpolar": [
{
"marker": {
"line": {
"color": "white",
"width": 0.5
},
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "barpolar"
}
],
"carpet": [
{
"aaxis": {
"endlinecolor": "rgb(36,36,36)",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "rgb(36,36,36)"
},
"baxis": {
"endlinecolor": "rgb(36,36,36)",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "rgb(36,36,36)"
},
"type": "carpet"
}
],
"choropleth": [
{
"colorbar": {
"outlinewidth": 1,
"tickcolor": "rgb(36,36,36)",
"ticks": "outside"
},
"type": "choropleth"
}
],
"contour": [
{
"colorbar": {
"outlinewidth": 1,
"tickcolor": "rgb(36,36,36)",
"ticks": "outside"
},
"colorscale": [
[
0,
"#440154"
],
[
0.1111111111111111,
"#482878"
],
[
0.2222222222222222,
"#3e4989"
],
[
0.3333333333333333,
"#31688e"
],
[
0.4444444444444444,
"#26828e"
],
[
0.5555555555555556,
"#1f9e89"
],
[
0.6666666666666666,
"#35b779"
],
[
0.7777777777777778,
"#6ece58"
],
[
0.8888888888888888,
"#b5de2b"
],
[
1,
"#fde725"
]
],
"type": "contour"
}
],
"contourcarpet": [
{
"colorbar": {
"outlinewidth": 1,
"tickcolor": "rgb(36,36,36)",
"ticks": "outside"
},
"type": "contourcarpet"
}
],
"heatmap": [
{
"colorbar": {
"outlinewidth": 1,
"tickcolor": "rgb(36,36,36)",
"ticks": "outside"
},
"colorscale": [
[
0,
"#440154"
],
[
0.1111111111111111,
"#482878"
],
[
0.2222222222222222,
"#3e4989"
],
[
0.3333333333333333,
"#31688e"
],
[
0.4444444444444444,
"#26828e"
],
[
0.5555555555555556,
"#1f9e89"
],
[
0.6666666666666666,
"#35b779"
],
[
0.7777777777777778,
"#6ece58"
],
[
0.8888888888888888,
"#b5de2b"
],
[
1,
"#fde725"
]
],
"type": "heatmap"
}
],
"heatmapgl": [
{
"colorbar": {
"outlinewidth": 1,
"tickcolor": "rgb(36,36,36)",
"ticks": "outside"
},
"colorscale": [
[
0,
"#440154"
],
[
0.1111111111111111,
"#482878"
],
[
0.2222222222222222,
"#3e4989"
],
[
0.3333333333333333,
"#31688e"
],
[
0.4444444444444444,
"#26828e"
],
[
0.5555555555555556,
"#1f9e89"
],
[
0.6666666666666666,
"#35b779"
],
[
0.7777777777777778,
"#6ece58"
],
[
0.8888888888888888,
"#b5de2b"
],
[
1,
"#fde725"
]
],
"type": "heatmapgl"
}
],
"histogram": [
{
"marker": {
"line": {
"color": "white",
"width": 0.6
}
},
"type": "histogram"
}
],
"histogram2d": [
{
"colorbar": {
"outlinewidth": 1,
"tickcolor": "rgb(36,36,36)",
"ticks": "outside"
},
"colorscale": [
[
0,
"#440154"
],
[
0.1111111111111111,
"#482878"
],
[
0.2222222222222222,
"#3e4989"
],
[
0.3333333333333333,
"#31688e"
],
[
0.4444444444444444,
"#26828e"
],
[
0.5555555555555556,
"#1f9e89"
],
[
0.6666666666666666,
"#35b779"
],
[
0.7777777777777778,
"#6ece58"
],
[
0.8888888888888888,
"#b5de2b"
],
[
1,
"#fde725"
]
],
"type": "histogram2d"
}
],
"histogram2dcontour": [
{
"colorbar": {
"outlinewidth": 1,
"tickcolor": "rgb(36,36,36)",
"ticks": "outside"
},
"colorscale": [
[
0,
"#440154"
],
[
0.1111111111111111,
"#482878"
],
[
0.2222222222222222,
"#3e4989"
],
[
0.3333333333333333,
"#31688e"
],
[
0.4444444444444444,
"#26828e"
],
[
0.5555555555555556,
"#1f9e89"
],
[
0.6666666666666666,
"#35b779"
],
[
0.7777777777777778,
"#6ece58"
],
[
0.8888888888888888,
"#b5de2b"
],
[
1,
"#fde725"
]
],
"type": "histogram2dcontour"
}
],
"mesh3d": [
{
"colorbar": {
"outlinewidth": 1,
"tickcolor": "rgb(36,36,36)",
"ticks": "outside"
},
"type": "mesh3d"
}
],
"parcoords": [
{
"line": {
"colorbar": {
"outlinewidth": 1,
"tickcolor": "rgb(36,36,36)",
"ticks": "outside"
}
},
"type": "parcoords"
}
],
"pie": [
{
"automargin": true,
"type": "pie"
}
],
"scatter": [
{
"fillpattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
},
"type": "scatter"
}
],
"scatter3d": [
{
"line": {
"colorbar": {
"outlinewidth": 1,
"tickcolor": "rgb(36,36,36)",
"ticks": "outside"
}
},
"marker": {
"colorbar": {
"outlinewidth": 1,
"tickcolor": "rgb(36,36,36)",
"ticks": "outside"
}
},
"type": "scatter3d"
}
],
"scattercarpet": [
{
"marker": {
"colorbar": {
"outlinewidth": 1,
"tickcolor": "rgb(36,36,36)",
"ticks": "outside"
}
},
"type": "scattercarpet"
}
],
"scattergeo": [
{
"marker": {
"colorbar": {
"outlinewidth": 1,
"tickcolor": "rgb(36,36,36)",
"ticks": "outside"
}
},
"type": "scattergeo"
}
],
"scattergl": [
{
"marker": {
"colorbar": {
"outlinewidth": 1,
"tickcolor": "rgb(36,36,36)",
"ticks": "outside"
}
},
"type": "scattergl"
}
],
"scattermapbox": [
{
"marker": {
"colorbar": {
"outlinewidth": 1,
"tickcolor": "rgb(36,36,36)",
"ticks": "outside"
}
},
"type": "scattermapbox"
}
],
"scatterpolar": [
{
"marker": {
"colorbar": {
"outlinewidth": 1,
"tickcolor": "rgb(36,36,36)",
"ticks": "outside"
}
},
"type": "scatterpolar"
}
],
"scatterpolargl": [
{
"marker": {
"colorbar": {
"outlinewidth": 1,
"tickcolor": "rgb(36,36,36)",
"ticks": "outside"
}
},
"type": "scatterpolargl"
}
],
"scatterternary": [
{
"marker": {
"colorbar": {
"outlinewidth": 1,
"tickcolor": "rgb(36,36,36)",
"ticks": "outside"
}
},
"type": "scatterternary"
}
],
"surface": [
{
"colorbar": {
"outlinewidth": 1,
"tickcolor": "rgb(36,36,36)",
"ticks": "outside"
},
"colorscale": [
[
0,
"#440154"
],
[
0.1111111111111111,
"#482878"
],
[
0.2222222222222222,
"#3e4989"
],
[
0.3333333333333333,
"#31688e"
],
[
0.4444444444444444,
"#26828e"
],
[
0.5555555555555556,
"#1f9e89"
],
[
0.6666666666666666,
"#35b779"
],
[
0.7777777777777778,
"#6ece58"
],
[
0.8888888888888888,
"#b5de2b"
],
[
1,
"#fde725"
]
],
"type": "surface"
}
],
"table": [
{
"cells": {
"fill": {
"color": "rgb(237,237,237)"
},
"line": {
"color": "white"
}
},
"header": {
"fill": {
"color": "rgb(217,217,217)"
},
"line": {
"color": "white"
}
},
"type": "table"
}
]
},
"layout": {
"annotationdefaults": {
"arrowhead": 0,
"arrowwidth": 1
},
"autotypenumbers": "strict",
"coloraxis": {
"colorbar": {
"outlinewidth": 1,
"tickcolor": "rgb(36,36,36)",
"ticks": "outside"
}
},
"colorscale": {
"diverging": [
[
0,
"rgb(103,0,31)"
],
[
0.1,
"rgb(178,24,43)"
],
[
0.2,
"rgb(214,96,77)"
],
[
0.3,
"rgb(244,165,130)"
],
[
0.4,
"rgb(253,219,199)"
],
[
0.5,
"rgb(247,247,247)"
],
[
0.6,
"rgb(209,229,240)"
],
[
0.7,
"rgb(146,197,222)"
],
[
0.8,
"rgb(67,147,195)"
],
[
0.9,
"rgb(33,102,172)"
],
[
1,
"rgb(5,48,97)"
]
],
"sequential": [
[
0,
"#440154"
],
[
0.1111111111111111,
"#482878"
],
[
0.2222222222222222,
"#3e4989"
],
[
0.3333333333333333,
"#31688e"
],
[
0.4444444444444444,
"#26828e"
],
[
0.5555555555555556,
"#1f9e89"
],
[
0.6666666666666666,
"#35b779"
],
[
0.7777777777777778,
"#6ece58"
],
[
0.8888888888888888,
"#b5de2b"
],
[
1,
"#fde725"
]
],
"sequentialminus": [
[
0,
"#440154"
],
[
0.1111111111111111,
"#482878"
],
[
0.2222222222222222,
"#3e4989"
],
[
0.3333333333333333,
"#31688e"
],
[
0.4444444444444444,
"#26828e"
],
[
0.5555555555555556,
"#1f9e89"
],
[
0.6666666666666666,
"#35b779"
],
[
0.7777777777777778,
"#6ece58"
],
[
0.8888888888888888,
"#b5de2b"
],
[
1,
"#fde725"
]
]
},
"colorway": [
"#1F77B4",
"#FF7F0E",
"#2CA02C",
"#D62728",
"#9467BD",
"#8C564B",
"#E377C2",
"#7F7F7F",
"#BCBD22",
"#17BECF"
],
"font": {
"color": "rgb(36,36,36)"
},
"geo": {
"bgcolor": "white",
"lakecolor": "white",
"landcolor": "white",
"showlakes": true,
"showland": true,
"subunitcolor": "white"
},
"hoverlabel": {
"align": "left"
},
"hovermode": "closest",
"mapbox": {
"style": "light"
},
"paper_bgcolor": "white",
"plot_bgcolor": "white",
"polar": {
"angularaxis": {
"gridcolor": "rgb(232,232,232)",
"linecolor": "rgb(36,36,36)",
"showgrid": false,
"showline": true,
"ticks": "outside"
},
"bgcolor": "white",
"radialaxis": {
"gridcolor": "rgb(232,232,232)",
"linecolor": "rgb(36,36,36)",
"showgrid": false,
"showline": true,
"ticks": "outside"
}
},
"scene": {
"xaxis": {
"backgroundcolor": "white",
"gridcolor": "rgb(232,232,232)",
"gridwidth": 2,
"linecolor": "rgb(36,36,36)",
"showbackground": true,
"showgrid": false,
"showline": true,
"ticks": "outside",
"zeroline": false,
"zerolinecolor": "rgb(36,36,36)"
},
"yaxis": {
"backgroundcolor": "white",
"gridcolor": "rgb(232,232,232)",
"gridwidth": 2,
"linecolor": "rgb(36,36,36)",
"showbackground": true,
"showgrid": false,
"showline": true,
"ticks": "outside",
"zeroline": false,
"zerolinecolor": "rgb(36,36,36)"
},
"zaxis": {
"backgroundcolor": "white",
"gridcolor": "rgb(232,232,232)",
"gridwidth": 2,
"linecolor": "rgb(36,36,36)",
"showbackground": true,
"showgrid": false,
"showline": true,
"ticks": "outside",
"zeroline": false,
"zerolinecolor": "rgb(36,36,36)"
}
},
"shapedefaults": {
"fillcolor": "black",
"line": {
"width": 0
},
"opacity": 0.3
},
"ternary": {
"aaxis": {
"gridcolor": "rgb(232,232,232)",
"linecolor": "rgb(36,36,36)",
"showgrid": false,
"showline": true,
"ticks": "outside"
},
"baxis": {
"gridcolor": "rgb(232,232,232)",
"linecolor": "rgb(36,36,36)",
"showgrid": false,
"showline": true,
"ticks": "outside"
},
"bgcolor": "white",
"caxis": {
"gridcolor": "rgb(232,232,232)",
"linecolor": "rgb(36,36,36)",
"showgrid": false,
"showline": true,
"ticks": "outside"
}
},
"title": {
"x": 0.05
},
"xaxis": {
"automargin": true,
"gridcolor": "rgb(232,232,232)",
"linecolor": "rgb(36,36,36)",
"showgrid": false,
"showline": true,
"ticks": "outside",
"title": {
"standoff": 15
},
"zeroline": false,
"zerolinecolor": "rgb(36,36,36)"
},
"yaxis": {
"automargin": true,
"gridcolor": "rgb(232,232,232)",
"linecolor": "rgb(36,36,36)",
"showgrid": false,
"showline": true,
"ticks": "outside",
"title": {
"standoff": 15
},
"zeroline": false,
"zerolinecolor": "rgb(36,36,36)"
}
}
},
"title": {
"font": {
"color": "Black",
"size": 22
},
"text": "<b>Intertopic Distance Map</b>",
"x": 0.5,
"xanchor": "center",
"y": 0.95,
"yanchor": "top"
},
"width": 650,
"xaxis": {
"anchor": "y",
"domain": [
0,
1
],
"range": [
8.03216586112976,
16.973989295959473
],
"title": {
"text": ""
},
"visible": false
},
"yaxis": {
"anchor": "x",
"domain": [
0,
1
],
"range": [
1.4220046877861023,
6.437612867355346
],
"title": {
"text": ""
},
"visible": false
}
}
}
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"topic_model.visualize_topics()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}