{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Using [Unstructured.io](https://www.unstructured.io/) to process arXiv Papers and Perform Topic Modelling! " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Import General Use Packages" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import arxiv # Interact with arXiv api to scrape papers\n", "from sentence_transformers import (\n", " SentenceTransformer,\n", ") # Use Hugging Face Embedding for Topic Modelling\n", "from bertopic import BERTopic # Package for Topic Modelling\n", "from tqdm import tqdm # Progress Bar When Iterating\n", "import glob # Identify Files in Directory\n", "import os # Delete Files in Directory\n", "import pandas as pd # Dataframe Manipulation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Import [Unstructured](https://unstructured-io.github.io/unstructured/installing.html) Functions" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from unstructured.partition.auto import partition # Base Function to Partition PDF\n", "from unstructured.staging.base import (\n", " convert_to_dict,\n", ") # Convert List Unstructured Elements Into List of Dicts for Easy Parsing\n", "from unstructured.cleaners.core import (\n", " clean,\n", " remove_punctuation,\n", " clean_non_ascii_chars,\n", ") # Cleaning Functions\n", "import re # Create Custom Cleaning Function\n", "import nltk # Toolkit for more advanced pre-processing\n", "from nltk.corpus import stopwords # list of stopwords to remove\n", "from typing import List # Type Hinting" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Setup NLTK" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to\n", "[nltk_data] /Users/pravinsanthanam/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nltk.download(\"stopwords\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create Function to Extract PDFs About Machine Learning from arXiv" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def get_arxiv_paper_texts(query: str, max_results: int = 100) -> List[str]:\n", " \"\"\"Function to Use arXiv API to Fetch Papers Related to Query, Download and Pre-Process\n", "\n", " Args:\n", " query (str): query for arXiv API\n", " max_results (int, optional): Number of Papers to get back. Defaults to 100.\n", "\n", " Returns:\n", " paper_texts (list[str]): Return list of narrative texts for each paper\n", " \"\"\"\n", " # Get List of Arxiv Papers Matching Our Query\n", " arxiv_papers = list(\n", " arxiv.Search(\n", " query=query,\n", " max_results=max_results,\n", " sort_by=arxiv.SortCriterion.Relevance,\n", " sort_order=arxiv.SortOrder.Descending,\n", " ).results()\n", " )\n", "\n", " # Loop Through PDFs, Download and Pre-Process and Then Delete\n", " paper_texts = []\n", " for paper in tqdm(arxiv_papers):\n", " paper.download_pdf()\n", " pdf_file = glob.glob(\"*.pdf\")[0]\n", " elements = partition(pdf_file) # Partition PDF Using Unstructured\n", " isd = convert_to_dict(elements) # Convert List of Elements to List of Dictionaries\n", " narrative_texts = [\n", " element[\"text\"] for element in isd if element[\"type\"] == \"NarrativeText\"\n", " ] # Only Keep Narrative Text and Combine Into One String\n", " os.remove(pdf_file) # Delete PDF\n", " paper_texts += narrative_texts\n", " return paper_texts" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Run Scrape + PreProcess Function to Get List of Paper Text To Feed Through Topic Modelling Algorithm" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 10/10 [04:59<00:00, 29.92s/it]\n" ] } ], "source": [ "paper_texts = get_arxiv_paper_texts(query=\"natural language processing\", max_results=10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Run Narrative Texts Through Custom Cleaner Function Using Unstructured" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of Narrative Texts to Run Through Topic Modelling: 1711\n" ] } ], "source": [ "# Stopwords to Remove\n", "stop_words = set(stopwords.words(\"english\"))\n", "\n", "\n", "# Function to Apply Whatever Cleaning Functionality to Each Narrative Text Element\n", "def custom_clean_function(narrative_text: str) -> str:\n", " \"\"\"Apply Mix of Unstructured Cleaning Functions With Some Custom Functionality to Pre-Process Narrative Text\n", "\n", " Args:\n", " narrative_text (str): Narrative Text or Any Other Sentence\n", "\n", " Returns:\n", " cleaned_text (str): Text after going through all the cleaning procedures\n", " \"\"\"\n", " remove_numbers = lambda text: re.sub(\n", " r\"\\d+\", \"\", text\n", " ) # lambda function to remove all punctuation\n", " cleaned_text = remove_numbers(narrative_text) # Apply Custom Lambda\n", " cleaned_text = clean(\n", " cleaned_text,\n", " extra_whitespace=True,\n", " dashes=True,\n", " bullets=True,\n", " trailing_punctuation=True,\n", " lowercase=True,\n", " ) # Apply Basic Clean Function With All the Options\n", " cleaned_text = remove_punctuation(cleaned_text) # Remove all punctuation\n", " cleaned_text = \" \".join(\n", " [word for word in cleaned_text.split() if word not in stop_words]\n", " ) # remove stop words\n", " return cleaned_text\n", "\n", "\n", "# Apply Function to Paper Texts\n", "cleaned_paper_texts = [custom_clean_function(text) for text in paper_texts]\n", "\n", "# Count Narratve Texts\n", "print(\n", " \"Number of Narrative Texts to Run Through Topic Modelling: {}\".format(len(cleaned_paper_texts))\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Setup [BerTopic](https://maartengr.github.io/BERTopic/index.html)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Choose Which Hugging Face Model You Want to Use\n", "sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n", "\n", "# Initialize Model\n", "topic_model = BERTopic(embedding_model=sentence_model, top_n_words=10, nr_topics=10, verbose=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Run Document Text Through Topic Model To Get Major Topics Discussed in Narrative Texts" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a6ebe3cb185049bd8d37742f2451cbe0", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Batches: 0%| | 0/54 [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
topic_0topic_1topic_2topic_3topic_4topic_5topic_6topic_7topic_8topic_9
0neurallanguagestatefunctioncostpublicationgraphllamatangkhulwant
1networknaturalrnndistributionfunctionaprilcomputationlikecompoundedu
2functionmodelmemoryoutputsgdsystnodesouthrootdsontagcoursesinferenceslidespseudolikelihoodn...
3networkswordvectorclasstrainingtechnolnodesanimalmorphologicalregardlessly
4oneplanninginputtanhexpecteddatebackwardamericaverbssatisfied
5inputwordsnetworkdataoptimizationvolfunctiontranslationnounnovember
6vectorbasedrecurrentalgorithmintellbackpropagationfrenchrootstune
7languageprocessingsequencelosssetacmalgorithmcuteadjectivesreturn
8modelmodelsneuralactivationvalidationarticleparametersgoogleformationfully
9trainingdatalstmsoftmaxratetransoutputdomesticatedlanguageresults
\n", "" ], "text/plain": [ " topic_0 topic_1 topic_2 topic_3 topic_4 topic_5 \\\n", "0 neural language state function cost publication \n", "1 network natural rnn distribution function april \n", "2 function model memory output sgd syst \n", "3 networks word vector class training technol \n", "4 one planning input tanh expected date \n", "5 input words network data optimization vol \n", "6 vector based recurrent yˆ algorithm intell \n", "7 language processing sequence loss set acm \n", "8 model models neural activation validation article \n", "9 training data lstm softmax rate trans \n", "\n", " topic_6 topic_7 topic_8 \\\n", "0 graph llama tangkhul \n", "1 computation like compound \n", "2 node south root \n", "3 nodes animal morphological \n", "4 backward america verbs \n", "5 function translation noun \n", "6 backpropagation french roots \n", "7 algorithm cute adjectives \n", "8 parameters google formation \n", "9 output domesticated language \n", "\n", " topic_9 \n", "0 want \n", "1 edu \n", "2 dsontagcoursesinferenceslidespseudolikelihoodn... \n", "3 regardlessly \n", "4 satisfied \n", "5 november \n", "6 tune \n", "7 return \n", "8 fully \n", "9 results " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display(topic_info)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Visualize Topics" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "application/vnd.plotly.v1+json": { "config": { "plotlyServerURL": "https://plot.ly" }, "data": [ { "customdata": [ [ 0, "language | natural | model | word | planning", 723 ], [ 1, "state | rnn | memory | vector | input", 198 ], [ 2, "function | distribution | output | class | tanh", 122 ], [ 3, "cost | function | sgd | training | expected", 61 ], [ 4, "publication | april | syst | technol | date", 57 ], [ 5, "graph | computation | node | nodes | backward", 46 ], [ 6, "llama | like | south | animal | america", 29 ], [ 7, "tangkhul | compound | root | morphological | verbs", 17 ], [ 8, "want | edu | dsontagcoursesinferenceslidespseudolikelihoodnotespdf | regardlessly | satisfied", 13 ] ], "hovertemplate": "Topic %{customdata[0]}
%{customdata[1]}
Size: %{customdata[2]}", "legendgroup": "", "marker": { "color": "#B0BEC5", "line": { "color": "DarkSlateGrey", "width": 2 }, "size": [ 723, 198, 122, 61, 57, 46, 29, 17, 13 ], "sizemode": "area", "sizeref": 0.451875, "symbol": "circle" }, "mode": "markers", "name": "", "orientation": "v", "showlegend": false, "type": "scatter", "x": [ 14.759990692138672, 14.329012870788574, 10.99558162689209, 9.891719818115234, 11.191701889038086, 9.449606895446777, 11.662773132324219, 14.039092063903809, 12.023329734802246 ], "xaxis": "x", "y": [ 1.6729466915130615, 2.2927768230438232, 5.36309289932251, 5.59792423248291, 4.721500873565674, 5.3096089363098145, 5.3371052742004395, 1.8039934635162354, 4.149565696716309 ], "yaxis": "y" } ], "layout": { "annotations": [ { "showarrow": false, "text": "D1", "x": 8.03216586112976, "y": 3.929808777570724, "yshift": 10 }, { "showarrow": false, "text": "D2", "x": 12.503077578544616, "xshift": 10, "y": 6.437612867355346 } ], "height": 650, "hoverlabel": { "bgcolor": "white", "font": { "family": "Rockwell", "size": 16 } }, "legend": { "itemsizing": "constant", "tracegroupgap": 0 }, "margin": { "t": 60 }, "shapes": [ { "line": { "color": "#CFD8DC", "width": 2 }, "type": "line", "x0": 12.503077578544616, "x1": 12.503077578544616, "y0": 1.4220046877861023, "y1": 6.437612867355346 }, { "line": { "color": "#9E9E9E", "width": 2 }, "type": "line", "x0": 8.03216586112976, "x1": 16.973989295959473, "y0": 3.929808777570724, "y1": 3.929808777570724 } ], "sliders": [ { "active": 0, "pad": { "t": 50 }, "steps": [ { "args": [ { "marker.color": [ [ "red", "#B0BEC5", "#B0BEC5", "#B0BEC5", "#B0BEC5", "#B0BEC5", "#B0BEC5", "#B0BEC5", "#B0BEC5" ] ] } ], "label": "Topic 0", "method": "update" }, { "args": [ { "marker.color": [ [ "#B0BEC5", "red", "#B0BEC5", "#B0BEC5", "#B0BEC5", "#B0BEC5", "#B0BEC5", "#B0BEC5", "#B0BEC5" ] ] } ], "label": "Topic 1", "method": "update" }, { "args": [ { "marker.color": [ [ "#B0BEC5", "#B0BEC5", "red", "#B0BEC5", "#B0BEC5", "#B0BEC5", "#B0BEC5", "#B0BEC5", "#B0BEC5" ] ] } ], "label": "Topic 2", "method": "update" }, { "args": [ { "marker.color": [ [ "#B0BEC5", "#B0BEC5", "#B0BEC5", "red", "#B0BEC5", "#B0BEC5", "#B0BEC5", "#B0BEC5", "#B0BEC5" ] ] } ], "label": "Topic 3", "method": "update" }, { "args": [ { "marker.color": [ [ "#B0BEC5", "#B0BEC5", "#B0BEC5", "#B0BEC5", "red", "#B0BEC5", "#B0BEC5", "#B0BEC5", "#B0BEC5" ] ] } ], "label": "Topic 4", "method": "update" }, { "args": [ { "marker.color": [ [ "#B0BEC5", "#B0BEC5", "#B0BEC5", "#B0BEC5", "#B0BEC5", "red", "#B0BEC5", "#B0BEC5", "#B0BEC5" ] ] } ], "label": "Topic 5", "method": "update" }, { "args": [ { "marker.color": [ [ "#B0BEC5", "#B0BEC5", "#B0BEC5", "#B0BEC5", "#B0BEC5", "#B0BEC5", "red", "#B0BEC5", "#B0BEC5" ] ] } ], "label": "Topic 6", "method": "update" }, { "args": [ { "marker.color": [ [ "#B0BEC5", "#B0BEC5", "#B0BEC5", "#B0BEC5", "#B0BEC5", "#B0BEC5", "#B0BEC5", "red", "#B0BEC5" ] ] } ], "label": "Topic 7", "method": "update" }, { "args": [ { "marker.color": [ [ "#B0BEC5", "#B0BEC5", "#B0BEC5", "#B0BEC5", "#B0BEC5", "#B0BEC5", "#B0BEC5", "#B0BEC5", "red" ] ] } ], "label": "Topic 8", "method": "update" } ] } ], "template": { "data": { "bar": [ { "error_x": { "color": "rgb(36,36,36)" }, "error_y": { "color": "rgb(36,36,36)" }, "marker": { "line": { "color": "white", "width": 0.5 }, "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "white", "width": 0.5 }, "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "barpolar" } ], "carpet": [ { "aaxis": { "endlinecolor": "rgb(36,36,36)", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "rgb(36,36,36)" }, "baxis": { "endlinecolor": "rgb(36,36,36)", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "rgb(36,36,36)" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 1, "tickcolor": "rgb(36,36,36)", "ticks": "outside" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 1, "tickcolor": "rgb(36,36,36)", "ticks": "outside" }, "colorscale": [ [ 0, "#440154" ], [ 0.1111111111111111, "#482878" ], [ 0.2222222222222222, "#3e4989" ], [ 0.3333333333333333, "#31688e" ], [ 0.4444444444444444, "#26828e" ], [ 0.5555555555555556, "#1f9e89" ], [ 0.6666666666666666, "#35b779" ], [ 0.7777777777777778, "#6ece58" ], [ 0.8888888888888888, "#b5de2b" ], [ 1, "#fde725" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 1, "tickcolor": "rgb(36,36,36)", "ticks": "outside" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 1, "tickcolor": "rgb(36,36,36)", "ticks": "outside" }, "colorscale": [ [ 0, "#440154" ], [ 0.1111111111111111, "#482878" ], [ 0.2222222222222222, "#3e4989" ], [ 0.3333333333333333, "#31688e" ], [ 0.4444444444444444, "#26828e" ], [ 0.5555555555555556, "#1f9e89" ], [ 0.6666666666666666, "#35b779" ], [ 0.7777777777777778, "#6ece58" ], [ 0.8888888888888888, "#b5de2b" ], [ 1, "#fde725" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 1, "tickcolor": "rgb(36,36,36)", "ticks": "outside" }, "colorscale": [ [ 0, "#440154" ], [ 0.1111111111111111, "#482878" ], [ 0.2222222222222222, "#3e4989" ], [ 0.3333333333333333, "#31688e" ], [ 0.4444444444444444, "#26828e" ], [ 0.5555555555555556, "#1f9e89" ], [ 0.6666666666666666, "#35b779" ], [ 0.7777777777777778, "#6ece58" ], [ 0.8888888888888888, "#b5de2b" ], [ 1, "#fde725" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "line": { "color": "white", "width": 0.6 } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 1, "tickcolor": "rgb(36,36,36)", "ticks": "outside" }, "colorscale": [ [ 0, "#440154" ], [ 0.1111111111111111, "#482878" ], [ 0.2222222222222222, "#3e4989" ], [ 0.3333333333333333, "#31688e" ], [ 0.4444444444444444, "#26828e" ], [ 0.5555555555555556, "#1f9e89" ], [ 0.6666666666666666, "#35b779" ], [ 0.7777777777777778, "#6ece58" ], [ 0.8888888888888888, "#b5de2b" ], [ 1, "#fde725" ] ], "type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 1, "tickcolor": "rgb(36,36,36)", "ticks": "outside" }, "colorscale": [ [ 0, "#440154" ], [ 0.1111111111111111, "#482878" ], [ 0.2222222222222222, "#3e4989" ], [ 0.3333333333333333, "#31688e" ], [ 0.4444444444444444, "#26828e" ], [ 0.5555555555555556, "#1f9e89" ], [ 0.6666666666666666, "#35b779" ], [ 0.7777777777777778, "#6ece58" ], [ 0.8888888888888888, "#b5de2b" ], [ 1, "#fde725" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 1, "tickcolor": "rgb(36,36,36)", "ticks": "outside" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 1, "tickcolor": "rgb(36,36,36)", "ticks": "outside" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "fillpattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 1, "tickcolor": "rgb(36,36,36)", "ticks": "outside" } }, "marker": { "colorbar": { "outlinewidth": 1, "tickcolor": "rgb(36,36,36)", "ticks": "outside" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 1, "tickcolor": "rgb(36,36,36)", "ticks": "outside" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 1, "tickcolor": "rgb(36,36,36)", "ticks": "outside" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 1, "tickcolor": "rgb(36,36,36)", "ticks": "outside" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { "outlinewidth": 1, "tickcolor": "rgb(36,36,36)", "ticks": "outside" } }, "type": "scattermapbox" } ], "scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 1, "tickcolor": "rgb(36,36,36)", "ticks": "outside" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 1, "tickcolor": "rgb(36,36,36)", "ticks": "outside" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 1, "tickcolor": "rgb(36,36,36)", "ticks": "outside" } }, "type": "scatterternary" } ], "surface": [ { "colorbar": { "outlinewidth": 1, "tickcolor": "rgb(36,36,36)", "ticks": "outside" }, "colorscale": [ [ 0, "#440154" ], [ 0.1111111111111111, "#482878" ], [ 0.2222222222222222, "#3e4989" ], [ 0.3333333333333333, "#31688e" ], [ 0.4444444444444444, "#26828e" ], [ 0.5555555555555556, "#1f9e89" ], [ 0.6666666666666666, "#35b779" ], [ 0.7777777777777778, "#6ece58" ], [ 0.8888888888888888, "#b5de2b" ], [ 1, "#fde725" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "rgb(237,237,237)" }, "line": { "color": "white" } }, "header": { "fill": { "color": "rgb(217,217,217)" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 1, "tickcolor": "rgb(36,36,36)", "ticks": "outside" } }, "colorscale": { "diverging": [ [ 0, "rgb(103,0,31)" ], [ 0.1, "rgb(178,24,43)" ], [ 0.2, "rgb(214,96,77)" ], [ 0.3, "rgb(244,165,130)" ], [ 0.4, "rgb(253,219,199)" ], [ 0.5, "rgb(247,247,247)" ], [ 0.6, "rgb(209,229,240)" ], [ 0.7, "rgb(146,197,222)" ], [ 0.8, "rgb(67,147,195)" ], [ 0.9, "rgb(33,102,172)" ], [ 1, "rgb(5,48,97)" ] ], "sequential": [ [ 0, "#440154" ], [ 0.1111111111111111, "#482878" ], [ 0.2222222222222222, "#3e4989" ], [ 0.3333333333333333, "#31688e" ], [ 0.4444444444444444, "#26828e" ], [ 0.5555555555555556, "#1f9e89" ], [ 0.6666666666666666, "#35b779" ], [ 0.7777777777777778, "#6ece58" ], [ 0.8888888888888888, "#b5de2b" ], [ 1, "#fde725" ] ], "sequentialminus": [ [ 0, "#440154" ], [ 0.1111111111111111, "#482878" ], [ 0.2222222222222222, "#3e4989" ], [ 0.3333333333333333, "#31688e" ], [ 0.4444444444444444, "#26828e" ], [ 0.5555555555555556, "#1f9e89" ], [ 0.6666666666666666, "#35b779" ], [ 0.7777777777777778, "#6ece58" ], [ 0.8888888888888888, "#b5de2b" ], [ 1, "#fde725" ] ] }, "colorway": [ "#1F77B4", "#FF7F0E", "#2CA02C", "#D62728", "#9467BD", "#8C564B", "#E377C2", "#7F7F7F", "#BCBD22", "#17BECF" ], "font": { "color": "rgb(36,36,36)" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "white", "showlakes": true, "showland": true, "subunitcolor": "white" }, "hoverlabel": { "align": "left" }, "hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "white", "polar": { "angularaxis": { "gridcolor": "rgb(232,232,232)", "linecolor": "rgb(36,36,36)", "showgrid": false, "showline": true, "ticks": "outside" }, "bgcolor": "white", "radialaxis": { "gridcolor": "rgb(232,232,232)", "linecolor": "rgb(36,36,36)", "showgrid": false, "showline": true, "ticks": "outside" } }, "scene": { "xaxis": { "backgroundcolor": "white", "gridcolor": "rgb(232,232,232)", "gridwidth": 2, "linecolor": "rgb(36,36,36)", "showbackground": true, "showgrid": false, "showline": true, "ticks": "outside", "zeroline": false, "zerolinecolor": "rgb(36,36,36)" }, "yaxis": { "backgroundcolor": "white", "gridcolor": "rgb(232,232,232)", "gridwidth": 2, "linecolor": "rgb(36,36,36)", "showbackground": true, "showgrid": false, "showline": true, "ticks": "outside", "zeroline": false, "zerolinecolor": "rgb(36,36,36)" }, "zaxis": { "backgroundcolor": "white", "gridcolor": "rgb(232,232,232)", "gridwidth": 2, "linecolor": "rgb(36,36,36)", "showbackground": true, "showgrid": false, "showline": true, "ticks": "outside", "zeroline": false, "zerolinecolor": "rgb(36,36,36)" } }, "shapedefaults": { "fillcolor": "black", "line": { "width": 0 }, "opacity": 0.3 }, "ternary": { "aaxis": { "gridcolor": "rgb(232,232,232)", "linecolor": "rgb(36,36,36)", "showgrid": false, "showline": true, "ticks": "outside" }, "baxis": { "gridcolor": "rgb(232,232,232)", "linecolor": "rgb(36,36,36)", "showgrid": false, "showline": true, "ticks": "outside" }, "bgcolor": "white", "caxis": { "gridcolor": "rgb(232,232,232)", "linecolor": "rgb(36,36,36)", "showgrid": false, "showline": true, "ticks": "outside" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "rgb(232,232,232)", "linecolor": "rgb(36,36,36)", "showgrid": false, "showline": true, "ticks": "outside", "title": { "standoff": 15 }, "zeroline": false, "zerolinecolor": "rgb(36,36,36)" }, "yaxis": { "automargin": true, "gridcolor": "rgb(232,232,232)", "linecolor": "rgb(36,36,36)", "showgrid": false, "showline": true, "ticks": "outside", "title": { "standoff": 15 }, "zeroline": false, "zerolinecolor": "rgb(36,36,36)" } } }, "title": { "font": { "color": "Black", "size": 22 }, "text": "Intertopic Distance Map", "x": 0.5, "xanchor": "center", "y": 0.95, "yanchor": "top" }, "width": 650, "xaxis": { "anchor": "y", "domain": [ 0, 1 ], "range": [ 8.03216586112976, 16.973989295959473 ], "title": { "text": "" }, "visible": false }, "yaxis": { "anchor": "x", "domain": [ 0, 1 ], "range": [ 1.4220046877861023, 6.437612867355346 ], "title": { "text": "" }, "visible": false } } } }, "metadata": {}, "output_type": "display_data" } ], "source": [ "topic_model.visualize_topics()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.15" } }, "nbformat": 4, "nbformat_minor": 2 }