{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Using [Unstructured.io](https://www.unstructured.io/) to process arXiv Papers and Perform Topic Modelling! " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Import General Use Packages" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import arxiv # Interact with arXiv api to scrape papers\n", "from sentence_transformers import (\n", " SentenceTransformer,\n", ") # Use Hugging Face Embedding for Topic Modelling\n", "from bertopic import BERTopic # Package for Topic Modelling\n", "from tqdm import tqdm # Progress Bar When Iterating\n", "import glob # Identify Files in Directory\n", "import os # Delete Files in Directory\n", "import pandas as pd # Dataframe Manipulation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Import [Unstructured](https://unstructured-io.github.io/unstructured/installing.html) Functions" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from unstructured.partition.auto import partition # Base Function to Partition PDF\n", "from unstructured.staging.base import (\n", " convert_to_dict,\n", ") # Convert List Unstructured Elements Into List of Dicts for Easy Parsing\n", "from unstructured.cleaners.core import (\n", " clean,\n", " remove_punctuation,\n", " clean_non_ascii_chars,\n", ") # Cleaning Functions\n", "import re # Create Custom Cleaning Function\n", "import nltk # Toolkit for more advanced pre-processing\n", "from nltk.corpus import stopwords # list of stopwords to remove\n", "from typing import List # Type Hinting" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Setup NLTK" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to\n", "[nltk_data] /Users/pravinsanthanam/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nltk.download(\"stopwords\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create Function to Extract PDFs About Machine Learning from arXiv" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def get_arxiv_paper_texts(query: str, max_results: int = 100) -> List[str]:\n", " \"\"\"Function to Use arXiv API to Fetch Papers Related to Query, Download and Pre-Process\n", "\n", " Args:\n", " query (str): query for arXiv API\n", " max_results (int, optional): Number of Papers to get back. Defaults to 100.\n", "\n", " Returns:\n", " paper_texts (list[str]): Return list of narrative texts for each paper\n", " \"\"\"\n", " # Get List of Arxiv Papers Matching Our Query\n", " arxiv_papers = list(\n", " arxiv.Search(\n", " query=query,\n", " max_results=max_results,\n", " sort_by=arxiv.SortCriterion.Relevance,\n", " sort_order=arxiv.SortOrder.Descending,\n", " ).results()\n", " )\n", "\n", " # Loop Through PDFs, Download and Pre-Process and Then Delete\n", " paper_texts = []\n", " for paper in tqdm(arxiv_papers):\n", " paper.download_pdf()\n", " pdf_file = glob.glob(\"*.pdf\")[0]\n", " elements = partition(pdf_file) # Partition PDF Using Unstructured\n", " isd = convert_to_dict(elements) # Convert List of Elements to List of Dictionaries\n", " narrative_texts = [\n", " element[\"text\"] for element in isd if element[\"type\"] == \"NarrativeText\"\n", " ] # Only Keep Narrative Text and Combine Into One String\n", " os.remove(pdf_file) # Delete PDF\n", " paper_texts += narrative_texts\n", " return paper_texts" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Run Scrape + PreProcess Function to Get List of Paper Text To Feed Through Topic Modelling Algorithm" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 10/10 [04:59<00:00, 29.92s/it]\n" ] } ], "source": [ "paper_texts = get_arxiv_paper_texts(query=\"natural language processing\", max_results=10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Run Narrative Texts Through Custom Cleaner Function Using Unstructured" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of Narrative Texts to Run Through Topic Modelling: 1711\n" ] } ], "source": [ "# Stopwords to Remove\n", "stop_words = set(stopwords.words(\"english\"))\n", "\n", "\n", "# Function to Apply Whatever Cleaning Functionality to Each Narrative Text Element\n", "def custom_clean_function(narrative_text: str) -> str:\n", " \"\"\"Apply Mix of Unstructured Cleaning Functions With Some Custom Functionality to Pre-Process Narrative Text\n", "\n", " Args:\n", " narrative_text (str): Narrative Text or Any Other Sentence\n", "\n", " Returns:\n", " cleaned_text (str): Text after going through all the cleaning procedures\n", " \"\"\"\n", " remove_numbers = lambda text: re.sub(\n", " r\"\\d+\", \"\", text\n", " ) # lambda function to remove all punctuation\n", " cleaned_text = remove_numbers(narrative_text) # Apply Custom Lambda\n", " cleaned_text = clean(\n", " cleaned_text,\n", " extra_whitespace=True,\n", " dashes=True,\n", " bullets=True,\n", " trailing_punctuation=True,\n", " lowercase=True,\n", " ) # Apply Basic Clean Function With All the Options\n", " cleaned_text = remove_punctuation(cleaned_text) # Remove all punctuation\n", " cleaned_text = \" \".join(\n", " [word for word in cleaned_text.split() if word not in stop_words]\n", " ) # remove stop words\n", " return cleaned_text\n", "\n", "\n", "# Apply Function to Paper Texts\n", "cleaned_paper_texts = [custom_clean_function(text) for text in paper_texts]\n", "\n", "# Count Narratve Texts\n", "print(\n", " \"Number of Narrative Texts to Run Through Topic Modelling: {}\".format(len(cleaned_paper_texts))\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Setup [BerTopic](https://maartengr.github.io/BERTopic/index.html)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Choose Which Hugging Face Model You Want to Use\n", "sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n", "\n", "# Initialize Model\n", "topic_model = BERTopic(embedding_model=sentence_model, top_n_words=10, nr_topics=10, verbose=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Run Document Text Through Topic Model To Get Major Topics Discussed in Narrative Texts" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a6ebe3cb185049bd8d37742f2451cbe0", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Batches: 0%| | 0/54 [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "2023-04-14 14:27:29,129 - BERTopic - Transformed documents to Embeddings\n", "2023-04-14 14:27:33,621 - BERTopic - Reduced dimensionality\n", "2023-04-14 14:27:33,647 - BERTopic - Clustered reduced embeddings\n", "2023-04-14 14:27:34,255 - BERTopic - Reduced number of topics from 32 to 10\n" ] } ], "source": [ "# Fit Topic Model and Transform List of Paper Narrative Texts Into Topic and Probabilities\n", "topic_model.fit(cleaned_paper_texts)\n", "\n", "# Store Document-Topic Info\n", "doc_topic_info = topic_model.get_document_info(cleaned_paper_texts)\n", "\n", "# Store Topic Info\n", "topic_info = pd.DataFrame(topic_model.get_topics())\n", "topic_info = topic_info.applymap(lambda x: x[0])\n", "topic_info.columns = [\"topic_{}\".format(col + 1) for col in topic_info.columns]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Checkout Keywords for Each Topic" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | topic_0 | \n", "topic_1 | \n", "topic_2 | \n", "topic_3 | \n", "topic_4 | \n", "topic_5 | \n", "topic_6 | \n", "topic_7 | \n", "topic_8 | \n", "topic_9 | \n", "
---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "neural | \n", "language | \n", "state | \n", "function | \n", "cost | \n", "publication | \n", "graph | \n", "llama | \n", "tangkhul | \n", "want | \n", "
1 | \n", "network | \n", "natural | \n", "rnn | \n", "distribution | \n", "function | \n", "april | \n", "computation | \n", "like | \n", "compound | \n", "edu | \n", "
2 | \n", "function | \n", "model | \n", "memory | \n", "output | \n", "sgd | \n", "syst | \n", "node | \n", "south | \n", "root | \n", "dsontagcoursesinferenceslidespseudolikelihoodn... | \n", "
3 | \n", "networks | \n", "word | \n", "vector | \n", "class | \n", "training | \n", "technol | \n", "nodes | \n", "animal | \n", "morphological | \n", "regardlessly | \n", "
4 | \n", "one | \n", "planning | \n", "input | \n", "tanh | \n", "expected | \n", "date | \n", "backward | \n", "america | \n", "verbs | \n", "satisfied | \n", "
5 | \n", "input | \n", "words | \n", "network | \n", "data | \n", "optimization | \n", "vol | \n", "function | \n", "translation | \n", "noun | \n", "november | \n", "
6 | \n", "vector | \n", "based | \n", "recurrent | \n", "yˆ | \n", "algorithm | \n", "intell | \n", "backpropagation | \n", "french | \n", "roots | \n", "tune | \n", "
7 | \n", "language | \n", "processing | \n", "sequence | \n", "loss | \n", "set | \n", "acm | \n", "algorithm | \n", "cute | \n", "adjectives | \n", "return | \n", "
8 | \n", "model | \n", "models | \n", "neural | \n", "activation | \n", "validation | \n", "article | \n", "parameters | \n", "formation | \n", "fully | \n", "|
9 | \n", "training | \n", "data | \n", "lstm | \n", "softmax | \n", "rate | \n", "trans | \n", "output | \n", "domesticated | \n", "language | \n", "results | \n", "