mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2025-08-31 12:39:01 +00:00
* Fix #6700: Add support for table properties: file format for datalake & storage * pylint fix * resolve review comments
This commit is contained in:
parent
d140883747
commit
f6d5c7413f
@ -60,7 +60,7 @@ from metadata.readers.dataframe.models import DatalakeTableSchemaWrapper
|
|||||||
from metadata.readers.dataframe.reader_factory import SupportedTypes
|
from metadata.readers.dataframe.reader_factory import SupportedTypes
|
||||||
from metadata.utils import fqn
|
from metadata.utils import fqn
|
||||||
from metadata.utils.constants import COMPLEX_COLUMN_SEPARATOR, DEFAULT_DATABASE
|
from metadata.utils.constants import COMPLEX_COLUMN_SEPARATOR, DEFAULT_DATABASE
|
||||||
from metadata.utils.datalake.datalake_utils import fetch_dataframe
|
from metadata.utils.datalake.datalake_utils import fetch_dataframe, get_file_format_type
|
||||||
from metadata.utils.filters import filter_by_schema, filter_by_table
|
from metadata.utils.filters import filter_by_schema, filter_by_table
|
||||||
from metadata.utils.logger import ingestion_logger
|
from metadata.utils.logger import ingestion_logger
|
||||||
from metadata.utils.s3_utils import list_s3_objects
|
from metadata.utils.s3_utils import list_s3_objects
|
||||||
@ -238,7 +238,7 @@ class DatalakeSource(DatabaseServiceSource):
|
|||||||
|
|
||||||
def get_tables_name_and_type( # pylint: disable=too-many-branches
|
def get_tables_name_and_type( # pylint: disable=too-many-branches
|
||||||
self,
|
self,
|
||||||
) -> Optional[Iterable[Tuple[str, str]]]:
|
) -> Iterable[Tuple[str, TableType]]:
|
||||||
"""
|
"""
|
||||||
Handle table and views.
|
Handle table and views.
|
||||||
|
|
||||||
@ -371,6 +371,7 @@ class DatalakeSource(DatabaseServiceSource):
|
|||||||
bucket_name=schema_name,
|
bucket_name=schema_name,
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
# If no data_frame (due to unsupported type), ignore
|
# If no data_frame (due to unsupported type), ignore
|
||||||
columns = self.get_columns(data_frame[0]) if data_frame else None
|
columns = self.get_columns(data_frame[0]) if data_frame else None
|
||||||
if columns:
|
if columns:
|
||||||
@ -380,6 +381,7 @@ class DatalakeSource(DatabaseServiceSource):
|
|||||||
columns=columns,
|
columns=columns,
|
||||||
tableConstraints=table_constraints if table_constraints else None,
|
tableConstraints=table_constraints if table_constraints else None,
|
||||||
databaseSchema=self.context.database_schema.fullyQualifiedName,
|
databaseSchema=self.context.database_schema.fullyQualifiedName,
|
||||||
|
fileFormat=get_file_format_type(table_name),
|
||||||
)
|
)
|
||||||
yield table_request
|
yield table_request
|
||||||
self.register_record(table_request=table_request)
|
self.register_record(table_request=table_request)
|
||||||
@ -552,7 +554,7 @@ class DatalakeSource(DatabaseServiceSource):
|
|||||||
complex_col_dict.clear()
|
complex_col_dict.clear()
|
||||||
return cols
|
return cols
|
||||||
|
|
||||||
def yield_view_lineage(self) -> Optional[Iterable[AddLineageRequest]]:
|
def yield_view_lineage(self) -> Iterable[AddLineageRequest]:
|
||||||
yield from []
|
yield from []
|
||||||
|
|
||||||
def yield_tag(self, schema_name: str) -> Iterable[OMetaTagAndClassification]:
|
def yield_tag(self, schema_name: str) -> Iterable[OMetaTagAndClassification]:
|
||||||
|
@ -145,6 +145,7 @@ class S3Source(StorageServiceSource):
|
|||||||
service=self.context.objectstore_service.fullyQualifiedName,
|
service=self.context.objectstore_service.fullyQualifiedName,
|
||||||
parent=container_details.parent,
|
parent=container_details.parent,
|
||||||
sourceUrl=container_details.sourceUrl,
|
sourceUrl=container_details.sourceUrl,
|
||||||
|
fileFormats=container_details.file_formats,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _generate_container_details(
|
def _generate_container_details(
|
||||||
|
@ -62,3 +62,16 @@ def fetch_dataframe(
|
|||||||
raise err
|
raise err
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def get_file_format_type(
|
||||||
|
key: str,
|
||||||
|
) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Method to get file format type
|
||||||
|
"""
|
||||||
|
for supported_type in SupportedTypes:
|
||||||
|
if key.endswith(supported_type.value):
|
||||||
|
return supported_type.value
|
||||||
|
|
||||||
|
return None
|
||||||
|
@ -923,6 +923,7 @@ public class TableResource extends EntityResource<Table, TableRepository> {
|
|||||||
.withTablePartition(create.getTablePartition())
|
.withTablePartition(create.getTablePartition())
|
||||||
.withTableType(create.getTableType())
|
.withTableType(create.getTableType())
|
||||||
.withTags(create.getTags())
|
.withTags(create.getTags())
|
||||||
|
.withFileFormat(create.getFileFormat())
|
||||||
.withViewDefinition(create.getViewDefinition())
|
.withViewDefinition(create.getViewDefinition())
|
||||||
.withTableProfilerConfig(create.getTableProfilerConfig())
|
.withTableProfilerConfig(create.getTableProfilerConfig())
|
||||||
.withDatabaseSchema(getEntityReference(Entity.DATABASE_SCHEMA, create.getDatabaseSchema())))
|
.withDatabaseSchema(getEntityReference(Entity.DATABASE_SCHEMA, create.getDatabaseSchema())))
|
||||||
|
@ -88,6 +88,10 @@
|
|||||||
"items" : {
|
"items" : {
|
||||||
"$ref" : "../../type/basic.json#/definitions/fullyQualifiedEntityName"
|
"$ref" : "../../type/basic.json#/definitions/fullyQualifiedEntityName"
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
"fileFormat": {
|
||||||
|
"description": "File format in case of file/datalake tables.",
|
||||||
|
"$ref": "../../entity/data/table.json#/definitions/fileFormat"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"required": ["name", "columns", "databaseSchema"],
|
"required": ["name", "columns", "databaseSchema"],
|
||||||
|
@ -860,6 +860,11 @@
|
|||||||
"sql"
|
"sql"
|
||||||
],
|
],
|
||||||
"additionalProperties": false
|
"additionalProperties": false
|
||||||
|
},
|
||||||
|
"fileFormat": {
|
||||||
|
"description": "File format in case of file/datalake tables.",
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["csv", "tsv", "avro", "parquet", "json", "json.gz", "json.zip"]
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"properties": {
|
"properties": {
|
||||||
@ -1023,6 +1028,10 @@
|
|||||||
"dataProducts" : {
|
"dataProducts" : {
|
||||||
"description": "List of data products this entity is part of.",
|
"description": "List of data products this entity is part of.",
|
||||||
"$ref" : "../../type/entityReferenceList.json"
|
"$ref" : "../../type/entityReferenceList.json"
|
||||||
|
},
|
||||||
|
"fileFormat": {
|
||||||
|
"description": "File format in case of file/datalake tables.",
|
||||||
|
"$ref" : "#/definitions/fileFormat"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"required": [
|
"required": [
|
||||||
|
Loading…
x
Reference in New Issue
Block a user