{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Using [Unstructured.io](https://www.unstructured.io/) to process arXiv Papers and Perform Topic Modelling! " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Import General Use Packages" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import arxiv # Interact with arXiv api to scrape papers\n", "from sentence_transformers import SentenceTransformer # Use Hugging Face Embedding for Topic Modelling\n", "from bertopic import BERTopic # Package for Topic Modelling\n", "from tqdm import tqdm #Progress Bar When Iterating\n", "import glob #Identify Files in Directory\n", "import os #Delete Files in Directory\n", "import pandas as pd #Dataframe Manipulation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Import [Unstructured](https://unstructured-io.github.io/unstructured/installing.html) Bricks" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from unstructured.partition.auto import partition #Base Function to Partition PDF\n", "from unstructured.staging.base import convert_to_dict #Convert List Unstructured Elements Into List of Dicts for Easy Parsing\n", "from unstructured.cleaners.core import clean, remove_punctuation, clean_non_ascii_chars #Cleaning Bricks\n", "import re #Create Custom Cleaning Brick\n", "import nltk #Toolkit for more advanced pre-processing\n", "from nltk.corpus import stopwords #list of stopwords to remove\n", "from typing import List #Type Hinting" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Setup NLTK" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to\n", "[nltk_data] /Users/pravinsanthanam/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nltk.download('stopwords')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create Function to Extract PDFs About Machine Learning from arXiv" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def get_arxiv_paper_texts(query: str, max_results: int = 100) -> List[str]:\n", " \"\"\"Function to Use arXiv API to Fetch Papers Related to Query, Download and Pre-Process\n", "\n", " Args:\n", " query (str): query for arXiv API\n", " max_results (int, optional): Number of Papers to get back. Defaults to 100.\n", "\n", " Returns:\n", " paper_texts (list[str]): Return list of narrative texts for each paper\n", " \"\"\"\n", " #Get List of Arxiv Papers Matching Our Query\n", " arxiv_papers = list(\n", " arxiv.Search(\n", " query = query,\n", " max_results = max_results,\n", " sort_by = arxiv.SortCriterion.Relevance,\n", " sort_order = arxiv.SortOrder.Descending\n", " )\n", " .results()\n", " )\n", "\n", " #Loop Through PDFs, Download and Pre-Process and Then Delete\n", " paper_texts = []\n", " for paper in tqdm(arxiv_papers):\n", " paper.download_pdf()\n", " pdf_file = glob.glob('*.pdf')[0]\n", " elements = partition(pdf_file) #Partition PDF Using Unstructured\n", " isd = convert_to_dict(elements) #Convert List of Elements to List of Dictionaries\n", " narrative_texts = [element['text'] for element in isd if element['type'] == 'NarrativeText'] #Only Keep Narrative Text and Combine Into One String\n", " os.remove(pdf_file) #Delete PDF\n", " paper_texts += narrative_texts\n", " return paper_texts\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Run Scrape + PreProcess Function to Get List of Paper Text To Feed Through Topic Modelling Algorithm" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 10/10 [04:59<00:00, 29.92s/it]\n" ] } ], "source": [ "paper_texts = get_arxiv_paper_texts(query='natural language processing', max_results=10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Run Narrative Texts Through Custom Cleaner Brick Using Unstructured" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of Narrative Texts to Run Through Topic Modelling: 1711\n" ] } ], "source": [ "#Stopwords to Remove\n", "stop_words = set(stopwords.words('english'))\n", "\n", "#Function to Apply Whatever Cleaning Brick Functionality to Each Narrative Text Element\n", "def custom_clean_brick(narrative_text: str) -> str:\n", " \"\"\"Apply Mix of Unstructured Cleaning Bricks With Some Custom Functionality to Pre-Process Narrative Text\n", "\n", " Args:\n", " narrative_text (str): Narrative Text or Any Other Sentence\n", "\n", " Returns:\n", " cleaned_text (str): Text after going through all the cleaning procedures\n", " \"\"\"\n", " remove_numbers = lambda text: re.sub(r'\\d+', \"\", text) #lambda function to remove all punctuation\n", " cleaned_text = remove_numbers(narrative_text) #Apply Custom Lambda\n", " cleaned_text = clean(cleaned_text, extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True, lowercase=True) #Apply Basic Clean Brick With All the Options\n", " cleaned_text = remove_punctuation(cleaned_text) #Remove all punctuation\n", " cleaned_text = ' '.join([word for word in cleaned_text.split() if word not in stop_words]) #remove stop words\n", " return cleaned_text\n", "\n", "#Apply Function to Paper Texts\n", "cleaned_paper_texts = [custom_clean_brick(text) for text in paper_texts]\n", "\n", "#Count Narratve Texts\n", "print(\"Number of Narrative Texts to Run Through Topic Modelling: {}\".format(len(cleaned_paper_texts)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Setup [BerTopic](https://maartengr.github.io/BERTopic/index.html)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "#Choose Which Hugging Face Model You Want to Use\n", "sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n", "\n", "#Initialize Model\n", "topic_model = BERTopic(embedding_model=sentence_model, top_n_words=10, nr_topics=10, verbose=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Run Document Text Through Topic Model To Get Major Topics Discussed in Narrative Texts" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a6ebe3cb185049bd8d37742f2451cbe0", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Batches: 0%| | 0/54 [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "2023-04-14 14:27:29,129 - BERTopic - Transformed documents to Embeddings\n", "2023-04-14 14:27:33,621 - BERTopic - Reduced dimensionality\n", "2023-04-14 14:27:33,647 - BERTopic - Clustered reduced embeddings\n", "2023-04-14 14:27:34,255 - BERTopic - Reduced number of topics from 32 to 10\n" ] } ], "source": [ "#Fit Topic Model and Transform List of Paper Narrative Texts Into Topic and Probabilities\n", "topic_model.fit(cleaned_paper_texts)\n", "\n", "#Store Document-Topic Info\n", "doc_topic_info = topic_model.get_document_info(cleaned_paper_texts)\n", "\n", "#Store Topic Info\n", "topic_info = pd.DataFrame(topic_model.get_topics())\n", "topic_info = topic_info.applymap(lambda x: x[0])\n", "topic_info.columns = ['topic_{}'.format(col+1) for col in topic_info.columns]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Checkout Keywords for Each Topic" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | topic_0\n", " | topic_1\n", " | topic_2\n", " | topic_3\n", " | topic_4\n", " | topic_5\n", " | topic_6\n", " | topic_7\n", " | topic_8\n", " | topic_9\n", " | 
|---|---|---|---|---|---|---|---|---|---|---|
| 0\n", " | neural\n", " | language\n", " | state\n", " | function\n", " | cost\n", " | publication\n", " | graph\n", " | llama\n", " | tangkhul\n", " | want\n", " | 
| 1\n", " | network\n", " | natural\n", " | rnn\n", " | distribution\n", " | function\n", " | april\n", " | computation\n", " | like\n", " | compound\n", " | edu\n", " | 
| 2\n", " | function\n", " | model\n", " | memory\n", " | output\n", " | sgd\n", " | syst\n", " | node\n", " | south\n", " | root\n", " | dsontagcoursesinferenceslidespseudolikelihoodn...\n", " | 
| 3\n", " | networks\n", " | word\n", " | vector\n", " | class\n", " | training\n", " | technol\n", " | nodes\n", " | animal\n", " | morphological\n", " | regardlessly\n", " | 
| 4\n", " | one\n", " | planning\n", " | input\n", " | tanh\n", " | expected\n", " | date\n", " | backward\n", " | america\n", " | verbs\n", " | satisfied\n", " | 
| 5\n", " | input\n", " | words\n", " | network\n", " | data\n", " | optimization\n", " | vol\n", " | function\n", " | translation\n", " | noun\n", " | november\n", " | 
| 6\n", " | vector\n", " | based\n", " | recurrent\n", " | yˆ\n", " | algorithm\n", " | intell\n", " | backpropagation\n", " | french\n", " | roots\n", " | tune\n", " | 
| 7\n", " | language\n", " | processing\n", " | sequence\n", " | loss\n", " | set\n", " | acm\n", " | algorithm\n", " | cute\n", " | adjectives\n", " | return\n", " | 
| 8\n", " | model\n", " | models\n", " | neural\n", " | activation\n", " | validation\n", " | article\n", " | parameters\n", " | formation\n", " | fully\n", " | |
| 9\n", " | training\n", " | data\n", " | lstm\n", " | softmax\n", " | rate\n", " | trans\n", " | output\n", " | domesticated\n", " | language\n", " | results\n", " |