mirror of
https://github.com/datahub-project/datahub.git
synced 2025-12-27 18:07:57 +00:00
feat(ingest/excel): Add Excel Source
This commit is contained in:
parent
71b05c6020
commit
d5eb2e7d6a
@ -198,7 +198,7 @@ class ExcelFile:
|
||||
row_count,
|
||||
column_count,
|
||||
metadata,
|
||||
sheet.title,
|
||||
sheet.title.strip(),
|
||||
)
|
||||
|
||||
def find_header_row(self, rows: List[List[Any]]) -> int:
|
||||
|
||||
BIN
metadata-ingestion/tests/unit/excel/data/file_1.xlsx
Normal file
BIN
metadata-ingestion/tests/unit/excel/data/file_1.xlsx
Normal file
Binary file not shown.
BIN
metadata-ingestion/tests/unit/excel/data/file_2.xlsx
Normal file
BIN
metadata-ingestion/tests/unit/excel/data/file_2.xlsx
Normal file
Binary file not shown.
BIN
metadata-ingestion/tests/unit/excel/data/file_3.xlsx
Normal file
BIN
metadata-ingestion/tests/unit/excel/data/file_3.xlsx
Normal file
Binary file not shown.
BIN
metadata-ingestion/tests/unit/excel/data/file_4.xlsx
Normal file
BIN
metadata-ingestion/tests/unit/excel/data/file_4.xlsx
Normal file
Binary file not shown.
BIN
metadata-ingestion/tests/unit/excel/data/file_5.xlsx
Normal file
BIN
metadata-ingestion/tests/unit/excel/data/file_5.xlsx
Normal file
Binary file not shown.
BIN
metadata-ingestion/tests/unit/excel/data/file_6.xlsx
Normal file
BIN
metadata-ingestion/tests/unit/excel/data/file_6.xlsx
Normal file
Binary file not shown.
BIN
metadata-ingestion/tests/unit/excel/data/file_7.xlsx
Normal file
BIN
metadata-ingestion/tests/unit/excel/data/file_7.xlsx
Normal file
Binary file not shown.
BIN
metadata-ingestion/tests/unit/excel/data/file_8.xlsx
Normal file
BIN
metadata-ingestion/tests/unit/excel/data/file_8.xlsx
Normal file
Binary file not shown.
BIN
metadata-ingestion/tests/unit/excel/data/file_9.xlsx
Normal file
BIN
metadata-ingestion/tests/unit/excel/data/file_9.xlsx
Normal file
Binary file not shown.
43
metadata-ingestion/tests/unit/excel/test_excel_samples.py
Normal file
43
metadata-ingestion/tests/unit/excel/test_excel_samples.py
Normal file
@ -0,0 +1,43 @@
|
||||
import io
|
||||
|
||||
from datahub.ingestion.source.excel.excel_file import ExcelFile
|
||||
from datahub.ingestion.source.excel.report import ExcelSourceReport
|
||||
|
||||
|
||||
def test_sample_files(pytestconfig):
    """Check the table found in each bundled sample workbook sheet.

    Every case names a workbook under ``tests/unit/excel/data`` and a sheet
    in it, followed by the values expected on the object returned by
    ``ExcelFile.get_table`` for that sheet: ``header_row``, ``footer_row``,
    ``row_count`` and ``column_count``.

    All cases run inside a single test, so each assertion carries a message
    identifying the workbook and sheet that failed.
    """
    # (file name, sheet name, header_row, footer_row, row_count, column_count)
    sample_cases = [
        ("file_1.xlsx", "Monthly Reporting", 1, 5, 4, 17),
        ("file_1.xlsx", "Dec", 1, 4, 3, 14),
        ("file_1.xlsx", "Jan", 1, 5, 4, 14),
        ("file_1.xlsx", "Feb", 1, 5, 4, 14),
        # NOTE(review): the trailing space in this sheet name looks
        # deliberate -- confirm before normalizing it.
        ("file_2.xlsx", "Test Group Reporting ", 1, 19, 18, 46),
        ("file_3.xlsx", "Sheet1", 1, 5, 4, 209),
        ("file_4.xlsx", "in", 1, 3, 2, 252),
        ("file_5.xlsx", "Test1_Test", 4, 8, 4, 24),
        ("file_6.xlsx", "Test2_Test", 2, 6, 4, 24),
        ("file_7.xlsx", "12345678 (Current Month)", 1, 4, 3, 68),
        ("file_8.xlsx", "Test3_Test", 4, 8, 4, 24),
        ("file_9.xlsx", "Business Report", 6, 11, 5, 5),
    ]
    test_resources_dir = pytestconfig.rootpath / "tests/unit/excel"

    for file_name, sheet, header, footer, rows, columns in sample_cases:
        sample_file = test_resources_dir / f"data/{file_name}"
        report = ExcelSourceReport()
        # Label used in every failure message below.
        case = f"{file_name} [{sheet!r}]"

        assert sample_file.exists(), f"missing sample workbook: {sample_file}"

        # The workbook is handed to ExcelFile as an in-memory byte stream.
        bytes_io = io.BytesIO(sample_file.read_bytes())

        xls = ExcelFile(file_name, bytes_io, report)
        result = xls.load_workbook()
        assert result is True, f"{case}: load_workbook() returned {result!r}"

        table = xls.get_table(sheet)

        assert table.header_row == header, f"{case}: header_row"
        assert table.footer_row == footer, f"{case}: footer_row"
        assert table.row_count == rows, f"{case}: row_count"
        assert table.column_count == columns, f"{case}: column_count"
|
||||
Loading…
x
Reference in New Issue
Block a user