LightRAG/lightrag/mineru_parser.py

514 lines
19 KiB
Python
Raw Normal View History

2025-06-05 17:37:11 +08:00
# type: ignore
2025-06-05 17:02:48 +08:00
"""
MinerU Document Parser Utility
This module provides functionality for parsing PDF, image and office documents using MinerU library,
and converts the parsing results into markdown and JSON formats
"""
from __future__ import annotations
__all__ = ["MineruParser"]
import os
import json
import argparse
from pathlib import Path
2025-06-05 17:37:11 +08:00
from typing import (
Dict,
List,
Optional,
Union,
Tuple,
Any,
TypeVar,
cast,
TYPE_CHECKING,
ClassVar,
)
2025-06-05 17:02:48 +08:00
# MinerU (magic_pdf) imports.
# The previous `if TYPE_CHECKING: ... else: ...` guard imported exactly the same
# names in both branches, so the imports are simply done once here. A missing
# `magic_pdf` installation still raises ImportError when this module is imported.
from magic_pdf.data.data_reader_writer import (
    FileBasedDataWriter,
    FileBasedDataReader,
)
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.read_api import read_local_office, read_local_images

# Loose aliases for the intermediate MinerU result objects. No concrete class
# is imported for them in this module, so they stay typed as `Any`.
InferResult = Any
PipeResult = Any

# Generic type variable; not referenced by the code visible in this module but
# retained so the module namespace is unchanged.
T = TypeVar("T")
2025-06-05 17:02:48 +08:00
class MineruParser:
    """
    MinerU document parsing utility class

    Supports parsing PDF, image and office documents (like Word, PPT, etc.),
    converting the content into structured data and generating markdown and JSON output

    The class keeps no state: ``__slots__`` is empty and every operation is a
    static method, so the methods can be called directly on the class.
    """

    __slots__: ClassVar[Tuple[str, ...]] = ()

    def __init__(self) -> None:
        """Initialize MineruParser (nothing is stored on the instance)"""

    @staticmethod
    def safe_write(
        writer: Any,
        content: Union[str, bytes, Dict[str, Any], List[Any]],
        filename: str,
    ) -> None:
        """
        Safely write content to a file, ensuring the filename is valid

        ``dict``/``list`` content is serialized to indented JSON text first.
        Text is handed to the writer as ``str``; if the writer raises
        ``TypeError`` the same text is retried as UTF-8 encoded bytes.
        Any other content is passed to the writer unchanged.

        Args:
            writer: The writer object to use; it must provide ``write(data, filename)``
            content: The content to write
            filename: The filename to write to
        """
        # Ensure the filename isn't too long
        if len(filename) > 200:  # Most filesystems have limits around 255 characters
            # Truncate the filename while keeping the extension
            base, ext = os.path.splitext(filename)
            filename = base[:190] + ext  # Leave room for the extension and some margin

        # Structured content always goes through the text path as JSON
        if isinstance(content, (dict, list)):
            content = json.dumps(content, ensure_ascii=False, indent=4)

        if not isinstance(content, str):
            # Regular content (assumed to be bytes or compatible)
            writer.write(content, filename)
            return

        try:
            writer.write(content, filename)
        except TypeError:
            # If the writer expects bytes, convert the text to bytes
            writer.write(content.encode("utf-8"), filename)

    @staticmethod
    def _prepare_output_dirs(
        source_path: Path, output_dir: Optional[str]
    ) -> Tuple[Path, Path, str]:
        """
        Create the output directories for one input document

        The layout is ``<output_dir or source parent>/<source stem>/images``.

        Args:
            source_path: Path of the document being parsed
            output_dir: Base output directory, or None/empty to use the source's directory
        Returns:
            Tuple[Path, Path, str]: (markdown directory, image directory, image directory name)
        """
        base_dir = Path(output_dir) if output_dir else source_path.parent
        local_md_dir = base_dir / source_path.stem
        local_image_dir = local_md_dir / "images"
        # Creating the nested image directory also creates the markdown directory
        os.makedirs(local_image_dir, exist_ok=True)
        return local_md_dir, local_image_dir, local_image_dir.name

    @staticmethod
    def _save_model_result(
        infer_result: Any,
        md_writer: Any,
        local_md_dir: Path,
        name_without_suff: str,
    ) -> None:
        """
        Save the raw model inference result as ``<name>_model.json`` (best effort)

        A plain file write is attempted first; if that fails the writer object
        is used with UTF-8 bytes. Failures of either write only print a warning.
        Serialization errors from ``json.dumps`` are not caught here.
        """
        model_inference_result = infer_result.get_infer_res()
        json_str = json.dumps(model_inference_result, ensure_ascii=False, indent=4)
        model_filename = f"{name_without_suff}_model.json"
        try:
            # Try to write to a file manually to avoid FileBasedDataWriter issues
            with open(
                os.path.join(local_md_dir, model_filename), "w", encoding="utf-8"
            ) as f:
                f.write(json_str)
            return
        except Exception as e:
            print(f"Warning: Failed to save model result using file write: {str(e)}")
        try:
            # If direct file write fails, try using the writer with bytes encoding
            md_writer.write(json_str.encode("utf-8"), model_filename)
        except Exception as e2:
            print(f"Warning: Failed to save model result using writer: {str(e2)}")

    @staticmethod
    def _export_results(
        infer_result: Any,
        pipe_result: Any,
        md_writer: Any,
        local_md_dir: Path,
        name_without_suff: str,
        image_dir: str,
    ) -> Tuple[List[Dict[str, Any]], str]:
        """
        Collect the return values and write all output files for one parsed document

        Writes ``<name>.md``, ``<name>_content_list.json``, ``<name>_middle.json``
        and ``<name>_model.json`` from a single pipeline result.

        Returns:
            Tuple[List[Dict[str, Any]], str]: Tuple containing (content list JSON, Markdown text)
        """
        # Get data using API methods
        md_content = pipe_result.get_markdown(image_dir)
        content_list = pipe_result.get_content_list(image_dir)
        # Save files using dump methods (consistent with API)
        pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
        pipe_result.dump_content_list(
            md_writer, f"{name_without_suff}_content_list.json", image_dir
        )
        pipe_result.dump_middle_json(md_writer, f"{name_without_suff}_middle.json")
        MineruParser._save_model_result(
            infer_result, md_writer, local_md_dir, name_without_suff
        )
        return cast(Tuple[List[Dict[str, Any]], str], (content_list, md_content))

    @staticmethod
    def parse_pdf(
        pdf_path: Union[str, Path],
        output_dir: Optional[str] = None,
        use_ocr: bool = False,
    ) -> Tuple[List[Dict[str, Any]], str]:
        """
        Parse PDF document

        The OCR pipeline is used when ``use_ocr`` is true or when the dataset
        classifies itself as ``SupportedPdfParseMethod.OCR``; otherwise the
        text pipeline is used. Visualization PDFs (model/layout/spans) are
        drawn on a best-effort basis: a failure there only prints a warning.

        Args:
            pdf_path: Path to the PDF file
            output_dir: Output directory path
            use_ocr: Whether to force OCR parsing
        Returns:
            Tuple[List[Dict[str, Any]], str]: Tuple containing (content list JSON, Markdown text)
        Raises:
            Exception: Any error raised while parsing is printed and re-raised unchanged
        """
        try:
            # Convert to Path object for easier handling
            pdf_path = Path(pdf_path)
            name_without_suff = pdf_path.stem
            local_md_dir, local_image_dir, image_dir = (
                MineruParser._prepare_output_dirs(pdf_path, output_dir)
            )
            # Initialize writers and reader
            image_writer = FileBasedDataWriter(str(local_image_dir))
            md_writer = FileBasedDataWriter(str(local_md_dir))
            reader = FileBasedDataReader("")
            # Read PDF bytes and create dataset instance
            pdf_bytes = reader.read(str(pdf_path))
            ds = PymuDocDataset(pdf_bytes)
            # Process based on PDF type and user preference
            if use_ocr or ds.classify() == SupportedPdfParseMethod.OCR:
                infer_result = ds.apply(doc_analyze, ocr=True)
                pipe_result = infer_result.pipe_ocr_mode(image_writer)
            else:
                infer_result = ds.apply(doc_analyze, ocr=False)
                pipe_result = infer_result.pipe_txt_mode(image_writer)
            # Draw visualizations
            try:
                infer_result.draw_model(
                    os.path.join(local_md_dir, f"{name_without_suff}_model.pdf")
                )
                pipe_result.draw_layout(
                    os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf")
                )
                pipe_result.draw_span(
                    os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf")
                )
            except Exception as e:
                print(f"Warning: Failed to draw visualizations: {str(e)}")
            return MineruParser._export_results(
                infer_result,
                pipe_result,
                md_writer,
                local_md_dir,
                name_without_suff,
                image_dir,
            )
        except Exception as e:
            print(f"Error in parse_pdf: {str(e)}")
            raise

    @staticmethod
    def parse_office_doc(
        doc_path: Union[str, Path], output_dir: Optional[str] = None
    ) -> Tuple[List[Dict[str, Any]], str]:
        """
        Parse office document (Word, PPT, etc.)

        The document is analyzed once with ``ocr=True`` and piped through the
        text-mode pipeline; all outputs are produced from that single result.

        Args:
            doc_path: Path to the document file
            output_dir: Output directory path
        Returns:
            Tuple[List[Dict[str, Any]], str]: Tuple containing (content list JSON, Markdown text)
        Raises:
            Exception: Any error raised while parsing is printed and re-raised unchanged
        """
        try:
            # Convert to Path object for easier handling
            doc_path = Path(doc_path)
            name_without_suff = doc_path.stem
            local_md_dir, local_image_dir, image_dir = (
                MineruParser._prepare_output_dirs(doc_path, output_dir)
            )
            # Initialize writers
            image_writer = FileBasedDataWriter(str(local_image_dir))
            md_writer = FileBasedDataWriter(str(local_md_dir))
            # Read office document (first dataset returned by the reader)
            ds = read_local_office(str(doc_path))[0]
            # Run the analysis only once; the markdown dump and the returned
            # data are both taken from this result instead of re-executing it
            infer_result = ds.apply(doc_analyze, ocr=True)
            pipe_result = infer_result.pipe_txt_mode(image_writer)
            return MineruParser._export_results(
                infer_result,
                pipe_result,
                md_writer,
                local_md_dir,
                name_without_suff,
                image_dir,
            )
        except Exception as e:
            print(f"Error in parse_office_doc: {str(e)}")
            raise

    @staticmethod
    def parse_image(
        image_path: Union[str, Path], output_dir: Optional[str] = None
    ) -> Tuple[List[Dict[str, Any]], str]:
        """
        Parse image document

        The image is analyzed once with ``ocr=True`` and piped through the
        OCR-mode pipeline; all outputs are produced from that single result.

        Args:
            image_path: Path to the image file
            output_dir: Output directory path
        Returns:
            Tuple[List[Dict[str, Any]], str]: Tuple containing (content list JSON, Markdown text)
        Raises:
            Exception: Any error raised while parsing is printed and re-raised unchanged
        """
        try:
            # Convert to Path object for easier handling
            image_path = Path(image_path)
            name_without_suff = image_path.stem
            local_md_dir, local_image_dir, image_dir = (
                MineruParser._prepare_output_dirs(image_path, output_dir)
            )
            # Initialize writers
            image_writer = FileBasedDataWriter(str(local_image_dir))
            md_writer = FileBasedDataWriter(str(local_md_dir))
            # Read image (first dataset returned by the reader)
            ds = read_local_images(str(image_path))[0]
            # Run the analysis only once; the markdown dump and the returned
            # data are both taken from this result instead of re-executing it
            infer_result = ds.apply(doc_analyze, ocr=True)
            pipe_result = infer_result.pipe_ocr_mode(image_writer)
            return MineruParser._export_results(
                infer_result,
                pipe_result,
                md_writer,
                local_md_dir,
                name_without_suff,
                image_dir,
            )
        except Exception as e:
            print(f"Error in parse_image: {str(e)}")
            raise

    @staticmethod
    def parse_document(
        file_path: Union[str, Path],
        parse_method: str = "auto",
        output_dir: Optional[str] = None,
        save_results: bool = True,
    ) -> Tuple[List[Dict[str, Any]], str]:
        """
        Parse document using MinerU based on file extension

        Args:
            file_path: Path to the file to be parsed
            parse_method: Parsing method, supports "auto", "ocr", "txt", default is "auto".
                Only "ocr" changes behavior (it forces OCR for PDF input); "txt" is
                currently handled the same way as "auto"
            output_dir: Output directory path, if None, use the directory of the input file
            save_results: Whether to save parsing results to files.
                NOTE: currently not consulted; result files are always written
        Returns:
            Tuple[List[Dict[str, Any]], str]: Tuple containing (content list JSON, Markdown text)
        Raises:
            FileNotFoundError: If ``file_path`` does not exist
        """
        # Convert to Path object
        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"File does not exist: {file_path}")

        # Choose appropriate parser based on the (case-insensitive) file extension
        ext = file_path.suffix.lower()
        if ext in (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"):
            return MineruParser.parse_image(file_path, output_dir)
        if ext in (".doc", ".docx", ".ppt", ".pptx"):
            return MineruParser.parse_office_doc(file_path, output_dir)
        if ext != ".pdf":
            # For unsupported file types, default to PDF parsing
            print(
                f"Warning: Unsupported file extension '{ext}', trying generic PDF parser"
            )
        return MineruParser.parse_pdf(
            file_path, output_dir, use_ocr=(parse_method == "ocr")
        )
2025-06-05 17:37:11 +08:00
2025-06-05 17:02:48 +08:00
def main(argv: Optional[List[str]] = None) -> int:
    """
    Main function to run the MinerU parser from command line

    Args:
        argv: Command-line arguments without the program name. When None
            (the default) argparse reads them from ``sys.argv[1:]``
    Returns:
        int: 0 on success, 1 if parsing or statistics reporting raised an exception
    """
    # Local imports keep this entry point self-contained
    import sys
    from collections import Counter

    parser = argparse.ArgumentParser(description="Parse documents using MinerU")
    parser.add_argument("file_path", help="Path to the document to parse")
    parser.add_argument("--output", "-o", help="Output directory path")
    parser.add_argument(
        "--method",
        "-m",
        choices=["auto", "ocr", "txt"],
        default="auto",
        help="Parsing method (auto, ocr, txt)",
    )
    parser.add_argument(
        "--stats", action="store_true", help="Display content statistics"
    )
    args = parser.parse_args(argv)

    try:
        # Parse the document; the markdown text is not needed by the CLI
        content_list, _ = MineruParser.parse_document(
            file_path=args.file_path, parse_method=args.method, output_dir=args.output
        )
        # Display statistics if requested
        if args.stats:
            print("\nDocument Statistics:")
            print(f"Total content blocks: {len(content_list)}")

            # Count different types of content (first-seen order is preserved)
            content_types = Counter(
                item.get("type", "unknown") for item in content_list
            )

            print("\nContent Type Distribution:")
            for content_type, count in content_types.items():
                print(f"- {content_type}: {count}")
    except Exception as e:
        # Top-level CLI boundary: report on stderr and signal failure via exit code
        print(f"Error: {str(e)}", file=sys.stderr)
        return 1
    return 0
2025-06-05 17:37:11 +08:00
if __name__ == "__main__":
    # `exit()` is an interactive helper injected by the `site` module and may be
    # missing (e.g. `python -S`); raising SystemExit is always available.
    raise SystemExit(main())