feat(ingest): upgrade feast (#6186)

Co-authored-by: Harshal Sheth <hsheth2@gmail.com>
cburroughs authored 2022-11-03 14:23:19 -04:00; committed by GitHub
commit 39c84c2f5b (parent c3c2b2715b)
6 changed files with 106 additions and 70 deletions

----------------------------------------------------------------------

@@ -1,4 +1,5 @@
 import os
+import sys
 from typing import Dict, Set

 import setuptools
@@ -95,8 +96,9 @@ kafka_common = {
 kafka_protobuf = {
     "networkx>=2.6.2",
     # Required to generate protobuf python modules from the schema downloaded from the schema registry
-    "grpcio==1.44.0",
-    "grpcio-tools==1.44.0",
+    # NOTE: potential conflict with feast also depending on grpcio
+    "grpcio>=1.44.0,<2",
+    "grpcio-tools>=1.44.0,<2",
 }

 sql_common = {
@@ -254,7 +256,7 @@ plugins: Dict[str, Set[str]] = {
     # https://github.com/elastic/elasticsearch-py/issues/1639#issuecomment-883587433
     "elasticsearch": {"elasticsearch==7.13.4"},
     "feast-legacy": {"docker"},
-    "feast": {"feast==0.18.0", "flask-openid>=1.3.0"},
+    "feast": {"feast~=0.26.0", "flask-openid>=1.3.0"},
     "glue": aws_common,
     # hdbcli is supported officially by SAP, sqlalchemy-hana is built on top but not officially supported
     "hana": sql_common
@@ -397,7 +399,7 @@ base_dev_requirements = {
         "delta-lake",
         "druid",
         "elasticsearch",
-        "feast",
+        "feast" if sys.version_info >= (3, 8) else None,
         "iceberg",
         "ldap",
         "looker",
@@ -425,6 +427,7 @@ base_dev_requirements = {
             "unity-catalog"
             # airflow is added below
         ]
+        if plugin
         for dependency in plugins[plugin]
     ),
 }
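The two hunks above work together: on Python < 3.8 the list entry becomes `None`, and the `if plugin` clause in the set comprehension filters it out before `plugins[plugin]` is looked up. A minimal self-contained sketch of the pattern (plugin table abbreviated and hypothetical):

    import sys
    from typing import Dict, Set

    plugins: Dict[str, Set[str]] = {
        "ldap": {"python-ldap>=2.4"},
        "feast": {"feast~=0.26.0", "flask-openid>=1.3.0"},
    }

    dev_plugins = [
        "ldap",
        # yields None on interpreters that feast 0.26 does not support
        "feast" if sys.version_info >= (3, 8) else None,
    ]

    dev_requirements = {
        dependency
        for plugin in dev_plugins
        if plugin  # drop the None placeholder
        for dependency in plugins[plugin]
    }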
@@ -444,7 +447,6 @@ full_test_dev_requirements = {
         "clickhouse",
         "delta-lake",
         "druid",
-        "feast",
         "feast-legacy",
         "hana",
         "hive",

----------------------------------------------------------------------

@@ -1,19 +1,27 @@
-from dataclasses import dataclass
-from typing import Dict, Iterable, List, Tuple, Union
+import sys
+
+if sys.version_info < (3, 8):
+    raise ImportError("Feast is only supported on Python 3.8+")
+
+from dataclasses import dataclass
+from typing import Dict, Iterable, List, Optional, Tuple, Union

+import feast.types
 from feast import (
     BigQuerySource,
     Entity,
-    Feature,
     FeatureStore,
     FeatureView,
+    Field as FeastField,
     FileSource,
     KafkaSource,
     KinesisSource,
     OnDemandFeatureView,
+    RequestSource,
+    SnowflakeSource,
     ValueType,
 )
-from feast.data_source import DataSource, RequestDataSource
+from feast.data_source import DataSource
 from pydantic import Field

 import datahub.emitter.mce_builder as builder
@@ -46,7 +54,7 @@ from datahub.metadata.schema_classes import (
 )

 # FIXME: ValueType module cannot be used as a type
-_field_type_mapping: Dict[ValueType, str] = {
+_field_type_mapping: Dict[Union[ValueType, feast.types.FeastType], str] = {
     ValueType.UNKNOWN: MLFeatureDataType.UNKNOWN,
     ValueType.BYTES: MLFeatureDataType.BYTE,
     ValueType.STRING: MLFeatureDataType.TEXT,
@@ -65,11 +73,25 @@ _field_type_mapping: Dict[ValueType, str] = {
     ValueType.BOOL_LIST: MLFeatureDataType.SEQUENCE,
     ValueType.UNIX_TIMESTAMP_LIST: MLFeatureDataType.SEQUENCE,
     ValueType.NULL: MLFeatureDataType.UNKNOWN,
+    feast.types.Invalid: MLFeatureDataType.UNKNOWN,
+    feast.types.Bytes: MLFeatureDataType.BYTE,
+    feast.types.String: MLFeatureDataType.TEXT,
+    feast.types.Int32: MLFeatureDataType.ORDINAL,
+    feast.types.Int64: MLFeatureDataType.ORDINAL,
+    feast.types.Float64: MLFeatureDataType.CONTINUOUS,
+    feast.types.Float32: MLFeatureDataType.CONTINUOUS,
+    feast.types.Bool: MLFeatureDataType.BINARY,
+    feast.types.UnixTimestamp: MLFeatureDataType.TIME,
+    feast.types.Array: MLFeatureDataType.SEQUENCE,  # type: ignore
 }


 class FeastRepositorySourceConfig(ConfigModel):
     path: str = Field(description="Path to Feast repository")
+    fs_yaml_file: Optional[str] = Field(
+        default=None,
+        description="Path to the `feature_store.yaml` file used to configure the feature store",
+    )
     environment: str = Field(
         default=DEFAULT_ENV, description="Environment to use when constructing URNs"
     )
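With the new `fs_yaml_file` field, a recipe can point at a repository whose `feature_store.yaml` lives outside the repo root. A minimal construction sketch (paths hypothetical):

    config = FeastRepositorySourceConfig(
        path="/feature_repos/driver_repo",
        # optional; when omitted, Feast falls back to <path>/feature_store.yaml
        fs_yaml_file="/configs/driver_feature_store.yaml",
    )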
@@ -85,7 +108,7 @@ class FeastRepositorySource(Source):
     This plugin extracts:

     - Entities as [`MLPrimaryKey`](https://datahubproject.io/docs/graphql/objects#mlprimarykey)
-    - Features as [`MLFeature`](https://datahubproject.io/docs/graphql/objects#mlfeature)
+    - Fields as [`MLFeature`](https://datahubproject.io/docs/graphql/objects#mlfeature)
     - Feature views and on-demand feature views as [`MLFeatureTable`](https://datahubproject.io/docs/graphql/objects#mlfeaturetable)
     - Batch and stream source details as [`Dataset`](https://datahubproject.io/docs/graphql/objects#dataset)
     - Column types associated with each entity and feature
@@ -97,12 +120,16 @@ class FeastRepositorySource(Source):
     def __init__(self, config: FeastRepositorySourceConfig, ctx: PipelineContext):
         super().__init__(ctx)

         self.source_config = config
         self.report = SourceReport()
-        self.feature_store = FeatureStore(self.source_config.path)
+        self.feature_store = FeatureStore(
+            repo_path=self.source_config.path,
+            fs_yaml_file=self.source_config.fs_yaml_file,
+        )

-    def _get_field_type(self, field_type: ValueType, parent_name: str) -> str:
+    def _get_field_type(
+        self, field_type: Union[ValueType, feast.types.FeastType], parent_name: str
+    ) -> str:
         """
         Maps types encountered in Feast to corresponding schema types.
         """
@@ -128,26 +155,24 @@ class FeastRepositorySource(Source):
         if isinstance(source, FileSource):
             platform = "file"
             name = source.path.replace("://", ".").replace("/", ".")

-        if isinstance(source, BigQuerySource):
+        elif isinstance(source, BigQuerySource):
             platform = "bigquery"
             name = source.table

-        if isinstance(source, KafkaSource):
+        elif isinstance(source, KafkaSource):
             platform = "kafka"
             name = source.kafka_options.topic

-        if isinstance(source, KinesisSource):
+        elif isinstance(source, KinesisSource):
             platform = "kinesis"
             name = (
                 f"{source.kinesis_options.region}:{source.kinesis_options.stream_name}"
             )

-        if isinstance(source, RequestDataSource):
+        elif isinstance(source, RequestSource):
             platform = "request"
             name = source.name

+        elif isinstance(source, SnowflakeSource):
+            platform = "snowflake"
+            name = source.table
+
         return platform, name
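Each branch normalizes a Feast source into a `(platform, name)` pair; worked examples derived from the branches above (constructor arguments abbreviated):

    # FileSource(path="s3://bucket/driver_stats.parquet")
    #     -> ("file", "s3.bucket.driver_stats.parquet")
    # KafkaSource(..., topic="driver_stats")
    #     -> ("kafka", "driver_stats")
    # SnowflakeSource(table="DRIVER_STATS", ...)
    #     -> ("snowflake", "DRIVER_STATS")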
@@ -214,7 +239,7 @@ class FeastRepositorySource(Source):
         self,
         # FIXME: FeatureView and OnDemandFeatureView cannot be used as a type
         feature_view: Union[FeatureView, OnDemandFeatureView],
-        feature: Feature,
+        field: FeastField,
     ) -> MetadataWorkUnit:
         """
         Generate an MLFeature work unit for a Feast feature.
@@ -222,17 +247,16 @@ class FeastRepositorySource(Source):
         feature_view_name = f"{self.feature_store.project}.{feature_view.name}"

         feature_snapshot = MLFeatureSnapshot(
-            urn=builder.make_ml_feature_urn(feature_view_name, feature.name),
+            urn=builder.make_ml_feature_urn(feature_view_name, field.name),
             aspects=[StatusClass(removed=False)],
         )

         feature_sources = []

         if isinstance(feature_view, FeatureView):
             feature_sources = self._get_data_sources(feature_view)
         elif isinstance(feature_view, OnDemandFeatureView):
-            if feature_view.input_request_data_sources is not None:
-                for request_source in feature_view.input_request_data_sources.values():
+            if feature_view.source_request_sources is not None:
+                for request_source in feature_view.source_request_sources.values():
                     source_platform, source_name = self._get_data_source_details(
                         request_source
                     )
@ -245,10 +269,10 @@ class FeastRepositorySource(Source):
)
)
if feature_view.input_feature_view_projections is not None:
if feature_view.source_feature_view_projections is not None:
for (
feature_view_projection
) in feature_view.input_feature_view_projections.values():
) in feature_view.source_feature_view_projections.values():
feature_view_source = self.feature_store.get_feature_view(
feature_view_projection.name
)
@@ -257,15 +281,15 @@ class FeastRepositorySource(Source):
         feature_snapshot.aspects.append(
             MLFeaturePropertiesClass(
-                description=feature.labels.get("description"),
-                dataType=self._get_field_type(feature.dtype, feature.name),
+                description=field.tags.get("description"),
+                dataType=self._get_field_type(field.dtype, field.name),
                 sources=feature_sources,
             )
         )

         mce = MetadataChangeEvent(proposedSnapshot=feature_snapshot)

-        return MetadataWorkUnit(id=feature.name, mce=mce)
+        return MetadataWorkUnit(id=field.name, mce=mce)

     def _get_feature_view_workunit(self, feature_view: FeatureView) -> MetadataWorkUnit:
         """
@@ -359,8 +383,8 @@ class FeastRepositorySource(Source):
             yield work_unit

-        for feature in feature_view.features:
-            work_unit = self._get_feature_workunit(feature_view, feature)
+        for field in feature_view.features:
+            work_unit = self._get_feature_workunit(feature_view, field)
             self.report.report_workunit(work_unit)
             yield work_unit

----------------------------------------------------------------------

@@ -5,6 +5,4 @@ online_store:
   path: data/online_store.db
 offline_store:
   type: file
-flags:
-  alpha_features: true
-  on_demand_transforms: true
+entity_key_serialization_version: 2
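Feast 0.26 promotes on-demand transforms out of the alpha feature flags and expects the newer entity-key serialization format, selected by `entity_key_serialization_version: 2`. A quick load check (repo path hypothetical):

    from feast import FeatureStore

    store = FeatureStore(repo_path="tests/integration/feast/feature_store")
    assert store.config.entity_key_serialization_version == 2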

----------------------------------------------------------------------

@@ -1,63 +1,74 @@
 from datetime import timedelta

+import feast.types
 import pandas as pd
-from feast import Entity, Feature, FeatureView, FileSource, ValueType
-from feast.data_source import RequestDataSource
+from feast import Entity, FeatureView, Field, FileSource, RequestSource, ValueType
 from feast.on_demand_feature_view import on_demand_feature_view

 driver_hourly_stats_source = FileSource(
     name="driver_hourly_stats_source",
     path="data/driver_stats_with_string.parquet",
-    event_timestamp_column="event_timestamp",
+    timestamp_field="event_timestamp",
     created_timestamp_column="created",
 )

-driver_entity = Entity(
-    name="driver_id", value_type=ValueType.INT64, description="Driver ID"
+driver = Entity(
+    # It would be the modern Feast pattern to call this `driver`, but the
+    # golden tests have the name as `driver_id`
+    name="driver_id",
+    join_keys=["driver_id"],
+    value_type=ValueType.INT64,
+    description="Driver ID",
 )

 driver_hourly_stats_view = FeatureView(
     name="driver_hourly_stats",
-    entities=["driver_id"],
+    entities=[driver],
     ttl=timedelta(days=7),
-    features=[
-        Feature(
+    schema=[
+        Field(
             name="conv_rate",
-            dtype=ValueType.FLOAT,
-            labels=dict(description="Conv rate"),
+            dtype=feast.types.Float64,
+            tags=dict(description="Conv rate"),
         ),
-        Feature(
-            name="acc_rate", dtype=ValueType.FLOAT, labels=dict(description="Acc rate")
+        Field(
+            name="acc_rate",
+            dtype=feast.types.Float64,
+            tags=dict(description="Acc rate"),
         ),
-        Feature(
+        Field(
             name="avg_daily_trips",
-            dtype=ValueType.INT64,
-            labels=dict(description="Avg daily trips"),
+            dtype=feast.types.Int64,
+            tags=dict(description="Avg daily trips"),
         ),
-        Feature(
+        Field(
             name="string_feature",
-            dtype=ValueType.STRING,
-            labels=dict(description="String feature"),
+            dtype=feast.types.String,
+            tags=dict(description="String feature"),
         ),
     ],
     online=True,
-    batch_source=driver_hourly_stats_source,
+    source=driver_hourly_stats_source,
     tags={},
 )

-input_request = RequestDataSource(
+input_request = RequestSource(
     name="vals_to_add",
-    schema={"val_to_add": ValueType.INT64, "val_to_add_2": ValueType.INT64},
+    schema=[
+        Field(name="val_to_add", dtype=feast.types.Int64),
+        Field(name="val_to_add_2", dtype=feast.types.Int64),
+    ],
 )

 @on_demand_feature_view(  # type: ignore
-    inputs={
-        "driver_hourly_stats": driver_hourly_stats_view,
-        "vals_to_add": input_request,
-    },
-    features=[
-        Feature(name="conv_rate_plus_val1", dtype=ValueType.DOUBLE),
-        Feature(name="conv_rate_plus_val2", dtype=ValueType.DOUBLE),
+    sources=[
+        driver_hourly_stats_view,
+        input_request,
+    ],
+    schema=[
+        Field(name="conv_rate_plus_val1", dtype=feast.types.Float64),
+        Field(name="conv_rate_plus_val2", dtype=feast.types.Float64),
     ],
 )
 def transformed_conv_rate(inputs: pd.DataFrame) -> pd.DataFrame:
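The transform body falls outside the hunk and is unchanged; for reference, a typical implementation against these sources (a sketch, not necessarily the file's exact body):

    def transformed_conv_rate(inputs: pd.DataFrame) -> pd.DataFrame:
        df = pd.DataFrame()
        # combine the feature view's conv_rate with the request-time values
        df["conv_rate_plus_val1"] = inputs["conv_rate"] + inputs["val_to_add"]
        df["conv_rate_plus_val2"] = inputs["conv_rate"] + inputs["val_to_add_2"]
        return df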

----------------------------------------------------------------------

@@ -1,4 +1,4 @@
-import os
+import sys

 import pytest
 from freezegun import freeze_time
@@ -8,11 +8,12 @@ from tests.test_helpers import mce_helpers

 FROZEN_TIME = "2020-04-14 07:00:00"

+pytestmark = pytest.mark.skipif(
+    sys.version_info < (3, 8), reason="requires python 3.8 or higher"
+)
+

 @freeze_time(FROZEN_TIME)
-@pytest.mark.skipif(
-    os.getenv("AIRFLOW1_TEST") == "true", reason="feast requires Airflow 2.0 or newer"
-)
 def test_feast_repository_ingest(pytestconfig, tmp_path, mock_time):
     test_resources_dir = pytestconfig.rootpath / "tests/integration/feast"
     output_path = tmp_path / "feast_repository_mces.json"