Add CSV to Markdown table conversion - fixes #1144 (#1176)

* feat: Add CSV to Markdown table converter

- Add new CsvConverter class to convert CSV files to Markdown tables
- Support text/csv and application/csv MIME types
- Preserve table structure with headers and data rows
- Handle edge cases like empty cells and mismatched columns
- Fix Azure Document Intelligence dependency handling
- Register CsvConverter in MarkItDown class

----

Thanks also to @benny123tw who submitted a very similar PR in #1171
This commit is contained in:
Turdıbek 2025-04-13 21:19:00 +05:00 committed by GitHub
parent 3fcd48cdfc
commit 8576f1d915
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 111 additions and 6 deletions

View File

@ -41,6 +41,7 @@ from .converters import (
ZipConverter,
EpubConverter,
DocumentIntelligenceConverter,
CsvConverter,
)
from ._base_converter import DocumentConverter, DocumentConverterResult
@ -194,6 +195,7 @@ class MarkItDown:
self.register_converter(PdfConverter())
self.register_converter(OutlookMsgConverter())
self.register_converter(EpubConverter())
self.register_converter(CsvConverter())
# Register Document Intelligence converter at the top of the stack if endpoint is provided
docintel_endpoint = kwargs.get("docintel_endpoint")

View File

@ -22,6 +22,7 @@ from ._doc_intel_converter import (
DocumentIntelligenceFileType,
)
from ._epub_converter import EpubConverter
from ._csv_converter import CsvConverter
__all__ = [
"PlainTextConverter",
@ -43,4 +44,5 @@ __all__ = [
"DocumentIntelligenceConverter",
"DocumentIntelligenceFileType",
"EpubConverter",
"CsvConverter",
]

View File

@ -0,0 +1,79 @@
import sys
import csv
import io
from typing import BinaryIO, Any
from charset_normalizer import from_bytes
from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
# MIME type prefixes recognized as CSV. CsvConverter.accepts() lower-cases the
# stream's mimetype and matches it against these with str.startswith, so a value
# carrying extra text after the prefix still matches.
ACCEPTED_MIME_TYPE_PREFIXES = [
"text/csv",
"application/csv",
]
# File extensions recognized as CSV. CsvConverter.accepts() compares the
# lower-cased stream extension (leading dot included) against this list.
ACCEPTED_FILE_EXTENSIONS = [".csv"]
class CsvConverter(DocumentConverter):
    """
    Converts CSV files to Markdown tables.

    The first CSV record becomes the table header. Every following record
    becomes a data row that is padded with empty cells, or truncated, so it
    has exactly as many columns as the header.
    """

    def __init__(self):
        super().__init__()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Report whether the stream looks like CSV.

        Returns True when the (lower-cased) extension is listed in
        ACCEPTED_FILE_EXTENSIONS or the (lower-cased) MIME type starts with
        one of ACCEPTED_MIME_TYPE_PREFIXES; False otherwise. The stream
        itself is not read.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        # str.startswith accepts a tuple, so one call checks every prefix.
        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Read the whole stream, parse it as CSV and render a Markdown table.

        The bytes are decoded with ``stream_info.charset`` when it is set;
        otherwise the encoding is detected with charset_normalizer. An input
        with no CSV records yields an empty Markdown string.

        Raises:
            UnicodeDecodeError: the bytes are not valid in the declared charset.
            LookupError: the declared charset name is unknown to Python.
        """
        raw = file_stream.read()
        if stream_info.charset:
            content = raw.decode(stream_info.charset)
        else:
            # best() returns None when no plausible encoding is found; calling
            # str() on that would produce the literal text "None" and a bogus
            # one-cell table, so fall back to a lossy UTF-8 decode instead.
            match = from_bytes(raw).best()
            if match is not None:
                content = str(match)
            else:
                content = raw.decode("utf-8", errors="replace")

        rows = list(csv.reader(io.StringIO(content)))
        if not rows:
            return DocumentConverterResult(markdown="")

        header = rows[0]
        width = len(header)

        markdown_table = [
            self._format_row(header),
            "| " + " | ".join(["---"] * width) + " |",
        ]
        for row in rows[1:]:
            # Pad short rows with empty cells, then cut long rows down to the
            # header width. Builds a new list instead of mutating the parsed row.
            cells = (row + [""] * (width - len(row)))[:width]
            markdown_table.append(self._format_row(cells))

        return DocumentConverterResult(markdown="\n".join(markdown_table))

    @staticmethod
    def _escape_cell(cell: str) -> str:
        """Make one CSV field safe to place inside a Markdown table cell."""
        # Quoted CSV fields may span lines, but a Markdown table row must stay
        # on a single line; keep the break visible as an inline <br>.
        cell = cell.replace("\r\n", "<br>").replace("\r", "<br>").replace("\n", "<br>")
        # An unescaped pipe would be read as a column delimiter.
        return cell.replace("|", "\\|")

    @staticmethod
    def _format_row(cells) -> str:
        """Join already-sized cells into one Markdown table row."""
        return "| " + " | ".join(CsvConverter._escape_cell(c) for c in cells) + " |"

View File

@ -1,8 +1,7 @@
import sys
import re
import os
from typing import BinaryIO, Any, List
from typing import BinaryIO, Any, List, Optional, Union
from enum import Enum
from ._html_converter import HtmlConverter
@ -26,6 +25,28 @@ except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
# Fallback stand-ins, defined only in this `except ImportError` branch: they keep
# the names used in type hints resolvable when the optional package is not
# available. Each one is an empty placeholder class with no behavior of its own.
class AzureKeyCredential:
pass
class TokenCredential:
pass
class DocumentIntelligenceClient:
pass
class AnalyzeDocumentRequest:
pass
class AnalyzeResult:
pass
class DocumentAnalysisFeature:
pass
class DefaultAzureCredential:
pass
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
# This constant is a temporary fix until the bug is resolved.

View File

@ -144,10 +144,11 @@ GENERAL_TEST_VECTORS = [
charset="cp932",
url=None,
must_include=[
"名前,年齢,住所",
"佐藤太郎,30,東京",
"三木英子,25,大阪",
"髙橋淳,35,名古屋",
"| 名前 | 年齢 | 住所 |",
"| --- | --- | --- |",
"| 佐藤太郎 | 30 | 東京 |",
"| 三木英子 | 25 | 大阪 |",
"| 髙橋淳 | 35 | 名古屋 |",
],
must_not_include=[],
),