# llama-hub/loader_hub/wordpress/base.py

"""Wordpress reader."""
from typing import List
import json
import requests
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document
from bs4 import BeautifulSoup


class WordpressReader(BaseReader):
    """Wordpress reader. Reads posts from a Wordpress site via its REST API.

    Args:
        url (str): Base URL of the Wordpress site, e.g.
            ``https://example.com``. A trailing slash is tolerated.
        password (str): Password for the site. Stored on the instance; the
            requests issued by this class do not send it.
        username (str): Username for the site. Stored on the instance; the
            requests issued by this class do not send it.
    """

    # Seconds to wait for the server before giving up on a request. Without
    # a timeout, ``requests`` can block forever on an unresponsive host.
    _REQUEST_TIMEOUT = 30

    def __init__(self, url: str, password: str, username: str) -> None:
        """Initialize Wordpress reader."""
        self.url = url
        self.username = username
        self.password = password

    @staticmethod
    def _rendered_text(field):
        """Return the string value of a post field, or None if there is none.

        The posts endpoint wraps ``title`` and ``content`` in an object of
        the form ``{"rendered": "..."}``; a plain string is accepted as well.
        """
        if isinstance(field, dict):
            field = field.get("rendered")
        return field if isinstance(field, str) else None

    def load_data(self) -> List[Document]:
        """Load data from the workspace.

        Every post returned by :meth:`get_all_posts` becomes one document
        whose text is the post content with the HTML markup stripped.

        Returns:
            List[Document]: List of documents. ``extra_info`` carries the
            post ``id``, ``title``, ``url`` and ``updated_at`` values.

        Raises:
            KeyError: If a post lacks ``id``, ``link`` or ``modified``.
        """
        results = []

        for article in self.get_all_posts():
            # A post with empty or missing content yields an empty document
            # instead of handing a dict/None to BeautifulSoup, which raises.
            html = self._rendered_text(article.get("content")) or ""
            body = BeautifulSoup(html, "html.parser").get_text()

            extra_info = {
                "id": article["id"],
                "title": self._rendered_text(article.get("title")),
                "url": article["link"],
                "updated_at": article["modified"],
            }

            results.append(Document(body, extra_info=extra_info))

        return results

    def get_all_posts(self) -> List[dict]:
        """Fetch every post by following the pagination page by page.

        Returns:
            List[dict]: The post objects of all pages, in the order returned
            by the server.
        """
        posts = []
        page = 1

        # get_posts_page reports ``None`` as next page once the last page
        # has been fetched.
        while page is not None:
            result = self.get_posts_page(page)
            posts.extend(result["articles"])
            page = result["next_page"]

        return posts

    def get_posts_page(self, current_page: int = 1) -> dict:
        """Fetch a single page (up to 100 posts) from the posts endpoint.

        Args:
            current_page (int): 1-based number of the page to request.

        Returns:
            dict: ``{"articles": [...], "next_page": int or None}`` where
            ``next_page`` is ``None`` when ``current_page`` is the last page
            according to the ``X-WP-TotalPages`` response header.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            ValueError: If the response body is not a JSON list.
        """
        # Avoid a double slash when the configured URL ends with "/".
        base_url = self.url.rstrip("/")

        response = requests.get(
            f"{base_url}/wp-json/wp/v2/posts",
            params={"per_page": 100, "page": current_page},
            timeout=self._REQUEST_TIMEOUT,
        )
        # Fail here on 4xx/5xx: the JSON error object would otherwise be
        # treated as a list of posts and break much later in load_data.
        response.raise_for_status()

        articles = response.json()
        if not isinstance(articles, list):
            raise ValueError(
                "Unexpected response from Wordpress posts endpoint: "
                f"expected a list, got {type(articles).__name__}"
            )

        # A missing header is treated as a single-page result.
        num_pages = int(response.headers.get("X-WP-TotalPages", 1))
        next_page = current_page + 1 if num_pages > current_page else None

        return {
            "articles": articles,
            "next_page": next_page,
        }