{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Using [Unstructured.io](https://www.unstructured.io/) to process arXiv Papers and Perform Topic Modelling! " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Import General Use Packages" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import arxiv # Interact with arXiv api to scrape papers\n", "from sentence_transformers import SentenceTransformer # Use Hugging Face Embedding for Topic Modelling\n", "from bertopic import BERTopic # Package for Topic Modelling\n", "from tqdm import tqdm #Progress Bar When Iterating\n", "import glob #Identify Files in Directory\n", "import os #Delete Files in Directory\n", "import pandas as pd #Dataframe Manipulation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Import [Unstructured](https://unstructured-io.github.io/unstructured/installing.html) Bricks" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from unstructured.partition.auto import partition #Base Function to Partition PDF\n", "from unstructured.staging.base import convert_to_dict #Convert List Unstructured Elements Into List of Dicts for Easy Parsing\n", "from unstructured.cleaners.core import clean, remove_punctuation, clean_non_ascii_chars #Cleaning Bricks\n", "import re #Create Custom Cleaning Brick\n", "import nltk #Toolkit for more advanced pre-processing\n", "from nltk.corpus import stopwords #list of stopwords to remove\n", "from typing import List #Type Hinting" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Setup NLTK" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to\n", "[nltk_data] /Users/pravinsanthanam/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, 
"execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nltk.download('stopwords')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create Function to Extract PDFs About Machine Learning from arXiv" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [
"def get_arxiv_paper_texts(query: str, max_results: int = 100) -> List[str]:\n",
"    \"\"\"Fetch arXiv Papers Matching a Query and Extract Their Narrative Text\n",
"\n",
"    Each matching paper's PDF is downloaded, partitioned with Unstructured and\n",
"    deleted again once it has been processed (even when processing fails).\n",
"\n",
"    Args:\n",
"        query (str): query for arXiv API\n",
"        max_results (int, optional): Number of Papers to get back. Defaults to 100.\n",
"\n",
"    Returns:\n",
"        paper_texts (list[str]): Flat list of narrative text elements from all papers,\n",
"            one entry per element (not one per paper)\n",
"    \"\"\"\n",
"    #Get List of Arxiv Papers Matching Our Query\n",
"    arxiv_papers = list(\n",
"        arxiv.Search(\n",
"            query = query,\n",
"            max_results = max_results,\n",
"            sort_by = arxiv.SortCriterion.Relevance,\n",
"            sort_order = arxiv.SortOrder.Descending\n",
"        )\n",
"        .results()\n",
"    )\n",
"\n",
"    #Loop Through PDFs, Download and Pre-Process and Then Delete\n",
"    paper_texts = []\n",
"    for paper in tqdm(arxiv_papers):\n",
"        #Use the Path Returned by download_pdf Rather Than Globbing for '*.pdf':\n",
"        #a Glob Can Match (and Then Delete) an Unrelated PDF Already in the Working Directory\n",
"        pdf_file = paper.download_pdf()\n",
"        try:\n",
"            elements = partition(pdf_file) #Partition PDF Using Unstructured\n",
"            isd = convert_to_dict(elements) #Convert List of Elements to List of Dictionaries\n",
"            narrative_texts = [element['text'] for element in isd if element['type'] == 'NarrativeText'] #Only Keep Narrative Text Elements\n",
"        finally:\n",
"            os.remove(pdf_file) #Delete PDF, Also When Partitioning Raised\n",
"        paper_texts.extend(narrative_texts)\n",
"    return paper_texts\n"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Run Scrape + PreProcess Function to Get List of Paper Text To Feed Through Topic Modelling Algorithm" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 10/10 [04:59<00:00, 
29.92s/it]\n" ] } ], "source": [ "paper_texts = get_arxiv_paper_texts(query='natural language processing', max_results=10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Run Narrative Texts Through Custom Cleaner Brick Using Unstructured" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of Narrative Texts to Run Through Topic Modelling: 1711\n" ] } ], "source": [
"#English Stop Words to Drop From Every Text\n",
"stop_words = set(stopwords.words('english'))\n",
"\n",
"#Cleaning Pipeline Applied to Each Narrative Text Element\n",
"def custom_clean_brick(narrative_text: str) -> str:\n",
"    \"\"\"Pre-Process One Narrative Text With Unstructured Cleaning Bricks Plus Custom Steps\n",
"\n",
"    Steps run in this order: strip digits, apply the Unstructured `clean` brick\n",
"    (extra whitespace, dashes, bullets, trailing punctuation, lowercasing),\n",
"    strip punctuation, then drop the words found in `stop_words`.\n",
"\n",
"    Args:\n",
"        narrative_text (str): Narrative Text or Any Other Sentence\n",
"\n",
"    Returns:\n",
"        cleaned_text (str): Remaining words joined by single spaces\n",
"    \"\"\"\n",
"    without_digits = re.sub(r'\\d+', \"\", narrative_text) #Remove Every Run of Digits\n",
"    bricked_text = clean(\n",
"        without_digits,\n",
"        extra_whitespace=True,\n",
"        dashes=True,\n",
"        bullets=True,\n",
"        trailing_punctuation=True,\n",
"        lowercase=True,\n",
"    ) #Basic Clean Brick With These Options Switched On\n",
"    without_punctuation = remove_punctuation(bricked_text) #Remove All Punctuation\n",
"    kept_words = (token for token in without_punctuation.split() if token not in stop_words) #Drop Stop Words\n",
"    return ' '.join(kept_words)\n",
"\n",
"#Clean Every Narrative Text\n",
"cleaned_paper_texts = list(map(custom_clean_brick, paper_texts))\n",
"\n",
"#Count Narrative Texts\n",
"print(\"Number of Narrative Texts to Run Through Topic Modelling: {}\".format(len(cleaned_paper_texts)))"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Setup [BerTopic](https://maartengr.github.io/BERTopic/index.html)" ] }, { 
"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "#Choose Which Hugging Face Model You Want to Use\n", "sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n", "\n", "#Initialize Model\n", "topic_model = BERTopic(embedding_model=sentence_model, top_n_words=10, nr_topics=10, verbose=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Run Document Text Through Topic Model To Get Major Topics Discussed in Narrative Texts" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a6ebe3cb185049bd8d37742f2451cbe0", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Batches: 0%| | 0/54 [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "2023-04-14 14:27:29,129 - BERTopic - Transformed documents to Embeddings\n", "2023-04-14 14:27:33,621 - BERTopic - Reduced dimensionality\n", "2023-04-14 14:27:33,647 - BERTopic - Clustered reduced embeddings\n", "2023-04-14 14:27:34,255 - BERTopic - Reduced number of topics from 32 to 10\n" ] } ], "source": [
"#Fit Topic Model on the Cleaned Narrative Texts\n",
"topic_model.fit(cleaned_paper_texts)\n",
"\n",
"#Store Document-Topic Info\n",
"doc_topic_info = topic_model.get_document_info(cleaned_paper_texts)\n",
"\n",
"#Store Topic Info: One Column per Topic, One Row per Keyword Rank\n",
"#Each entry returned by get_topics() is indexed with [0] to keep only its first item\n",
"#(presumably the keyword of a (word, score) pair -- TODO confirm against the BERTopic docs).\n",
"#The frame is built directly so DataFrame.applymap (deprecated since pandas 2.1) is not needed.\n",
"#Column labels are the topic id plus one (presumably so the outlier topic -1 becomes topic_0 -- TODO confirm).\n",
"topic_info = pd.DataFrame({\n",
"    'topic_{}'.format(topic_id + 1): [word_score[0] for word_score in word_scores]\n",
"    for topic_id, word_scores in topic_model.get_topics().items()\n",
"})"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Checkout Keywords for Each Topic" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | topic_0 | \n", "topic_1 | \n", "topic_2 | \n", "topic_3 | \n", "topic_4 | \n", "topic_5 | \n", "topic_6 | \n", "topic_7 | \n", "topic_8 | \n", "topic_9 | \n", "
---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "neural | \n", "language | \n", "state | \n", "function | \n", "cost | \n", "publication | \n", "graph | \n", "llama | \n", "tangkhul | \n", "want | \n", "
1 | \n", "network | \n", "natural | \n", "rnn | \n", "distribution | \n", "function | \n", "april | \n", "computation | \n", "like | \n", "compound | \n", "edu | \n", "
2 | \n", "function | \n", "model | \n", "memory | \n", "output | \n", "sgd | \n", "syst | \n", "node | \n", "south | \n", "root | \n", "dsontagcoursesinferenceslidespseudolikelihoodn... | \n", "
3 | \n", "networks | \n", "word | \n", "vector | \n", "class | \n", "training | \n", "technol | \n", "nodes | \n", "animal | \n", "morphological | \n", "regardlessly | \n", "
4 | \n", "one | \n", "planning | \n", "input | \n", "tanh | \n", "expected | \n", "date | \n", "backward | \n", "america | \n", "verbs | \n", "satisfied | \n", "
5 | \n", "input | \n", "words | \n", "network | \n", "data | \n", "optimization | \n", "vol | \n", "function | \n", "translation | \n", "noun | \n", "november | \n", "
6 | \n", "vector | \n", "based | \n", "recurrent | \n", "yˆ | \n", "algorithm | \n", "intell | \n", "backpropagation | \n", "french | \n", "roots | \n", "tune | \n", "
7 | \n", "language | \n", "processing | \n", "sequence | \n", "loss | \n", "set | \n", "acm | \n", "algorithm | \n", "cute | \n", "adjectives | \n", "return | \n", "
8 | \n", "model | \n", "models | \n", "neural | \n", "activation | \n", "validation | \n", "article | \n", "parameters | \n", "formation | \n", "fully | \n", "|
9 | \n", "training | \n", "data | \n", "lstm | \n", "softmax | \n", "rate | \n", "trans | \n", "output | \n", "domesticated | \n", "language | \n", "results | \n", "