mirror of
https://github.com/docling-project/docling.git
synced 2025-06-27 05:20:05 +00:00
docs: update readme and add ASR example (#1836)
* updated the README Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added minimal_asr_pipeline Signed-off-by: Peter Staar <taa@zurich.ibm.com> * Updated README and added ASR example Signed-off-by: Peter Staar <taa@zurich.ibm.com> * Updated docs.index.md Signed-off-by: Peter Staar <taa@zurich.ibm.com> * updated CI and mkdocs Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added link to existing audio file Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added link to existing audio file Signed-off-by: Peter Staar <taa@zurich.ibm.com> * reformatting Signed-off-by: Peter Staar <taa@zurich.ibm.com> --------- Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
1557e7ce3e
commit
f3ae3029b8
2
.github/workflows/checks.yml
vendored
2
.github/workflows/checks.yml
vendored
@ -60,7 +60,7 @@ jobs:
|
||||
run: |
|
||||
for file in docs/examples/*.py; do
|
||||
# Skip batch_convert.py
|
||||
if [[ "$(basename "$file")" =~ ^(batch_convert|compare_vlm_models|minimal|minimal_vlm_pipeline|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model).py ]]; then
|
||||
if [[ "$(basename "$file")" =~ ^(batch_convert|compare_vlm_models|minimal|minimal_vlm_pipeline|minimal_asr_pipeline|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model).py ]]; then
|
||||
echo "Skipping $file"
|
||||
continue
|
||||
fi
|
||||
|
@ -28,14 +28,15 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
||||
|
||||
## Features
|
||||
|
||||
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, XLSX, HTML, images, and more
|
||||
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
|
||||
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
|
||||
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
|
||||
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, and lossless JSON
|
||||
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
|
||||
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
||||
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
||||
* 🔍 Extensive OCR support for scanned PDFs and images
|
||||
* 🥚 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
|
||||
* 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
|
||||
* 🎙️ Support for Audio with Automatic Speech Recognition (ASR) models
|
||||
* 💻 Simple and convenient CLI
|
||||
|
||||
### Coming soon
|
||||
|
56
docs/examples/minimal_asr_pipeline.py
vendored
Normal file
56
docs/examples/minimal_asr_pipeline.py
vendored
Normal file
@ -0,0 +1,56 @@
|
||||
from pathlib import Path
|
||||
|
||||
from docling_core.types.doc import DoclingDocument
|
||||
|
||||
from docling.datamodel import asr_model_specs
|
||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import AsrPipelineOptions
|
||||
from docling.document_converter import AudioFormatOption, DocumentConverter
|
||||
from docling.pipeline.asr_pipeline import AsrPipeline
|
||||
|
||||
|
||||
def get_asr_converter():
    """Build a DocumentConverter whose audio inputs run through the ASR pipeline.

    The pipeline options are populated with the ``WHISPER_TURBO`` model spec
    taken from ``asr_model_specs``.
    """
    asr_pipeline_options = AsrPipelineOptions()
    asr_pipeline_options.asr_options = asr_model_specs.WHISPER_TURBO

    # Audio inputs are routed to AsrPipeline with the options prepared above.
    audio_option = AudioFormatOption(
        pipeline_cls=AsrPipeline,
        pipeline_options=asr_pipeline_options,
    )
    return DocumentConverter(format_options={InputFormat.AUDIO: audio_option})
|
||||
|
||||
|
||||
def asr_pipeline_conversion(audio_path: Path) -> DoclingDocument:
    """Convert the audio file at ``audio_path`` and return the resulting document.

    The converter comes from ``get_asr_converter()``. Two assertions guard the
    call: the audio file must exist beforehand, and the conversion status must
    equal ``ConversionStatus.SUCCESS`` afterwards.
    """
    # The input file has to be present before a converter is even created.
    assert audio_path.exists(), f"Test audio file not found: {audio_path}"

    # Create the converter and run the audio file through it in one step.
    result: ConversionResult = get_asr_converter().convert(audio_path)

    # Any status other than SUCCESS is reported as a failure.
    assert result.status == ConversionStatus.SUCCESS, (
        f"Conversion failed with status: {result.status}"
    )
    return result.document
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Sample clip referenced by this example; the path is relative to the
    # current working directory.
    audio_path = Path("tests/data/audio/sample_10s.mp3")
    doc = asr_pipeline_conversion(audio_path=audio_path)

    # Render the converted document as Markdown and print it.
    markdown = doc.export_to_markdown()
    print(markdown)

    # Expected output:
    #
    # [time: 0.0-4.0] Shakespeare on Scenery by Oscar Wilde
    #
    # [time: 5.28-9.96] This is a LibriVox recording. All LibriVox recordings are in the public domain.
|
7
docs/index.md
vendored
7
docs/index.md
vendored
@ -20,14 +20,15 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
||||
|
||||
## Features
|
||||
|
||||
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, XLSX, HTML, images, and more
|
||||
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
|
||||
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
|
||||
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
|
||||
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, and lossless JSON
|
||||
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
|
||||
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
||||
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
||||
* 🔍 Extensive OCR support for scanned PDFs and images
|
||||
* 🥚 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) 🔥
|
||||
* 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
|
||||
* 🎙️ Support for Audio with Automatic Speech Recognition (ASR) models
|
||||
* 💻 Simple and convenient CLI
|
||||
|
||||
### Coming soon
|
||||
|
@ -80,6 +80,7 @@ nav:
|
||||
- "VLM pipeline with SmolDocling": examples/minimal_vlm_pipeline.py
|
||||
- "VLM pipeline with remote model": examples/vlm_pipeline_api_model.py
|
||||
- "VLM comparison": examples/compare_vlm_models.py
|
||||
- "ASR pipeline with Whisper": examples/minimal_asr_pipeline.py
|
||||
- "Figure export": examples/export_figures.py
|
||||
- "Table export": examples/export_tables.py
|
||||
- "Multimodal export": examples/export_multimodal.py
|
||||
|
Loading…
x
Reference in New Issue
Block a user