Mirror of https://github.com/open-metadata/OpenMetadata.git (synced 2025-12-26 15:10:05 +00:00)
Doc Migration - Move design & fix security details (#6131)

* Move design & fix security details
* Add install test
* Add python version for M1

Commit 7e0a7dcc5e (parent 6b10adc40f)
````diff
@@ -11,11 +11,11 @@ Once the `Client Id` and `Client Secret` are generated add the `Client Id` in `o
 ```yaml
 authenticationConfiguration:
-  provider: "azure"
+  provider: "google"
   publicKeyUrls:
-    - "https://login.microsoftonline.com/common/discovery/keys"
-  authority: "https://login.microsoftonline.com/{Tenant ID}"
-  clientId: "{Client ID}"
+    - "https://www.googleapis.com/oauth2/v3/certs"
+  authority: "https://accounts.google.com"
+  clientId: "{client id}"
   callbackUrl: "http://localhost:8585/callback"
 ```
````
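As a quick sanity check after editing `authenticationConfiguration`, the URLs under `publicKeyUrls` should serve a standard JWKS document. A minimal sketch (not part of the commit, assuming `requests` is available):

```python
# Fetch the Google JWKS endpoint referenced in publicKeyUrls and verify it
# returns a JSON object with a non-empty "keys" array.
import requests

resp = requests.get("https://www.googleapis.com/oauth2/v3/certs", timeout=10)
resp.raise_for_status()
jwks = resp.json()

assert jwks.get("keys"), "expected a JWKS document with at least one key"
for key in jwks["keys"]:
    print(key.get("kid"), key.get("alg"))
```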
````diff
@@ -44,11 +44,9 @@ airflowConfiguration:
   username: ${AIRFLOW_USERNAME:-admin}
   password: ${AIRFLOW_PASSWORD:-admin}
   metadataApiEndpoint: ${SERVER_HOST_API_URL:-http://localhost:8585/api}
-  authProvider: azure
+  authProvider: google
   authConfig:
-    azure:
-      clientSecret: ${OM_AUTH_AIRFLOW_AZURE_CLIENT_SECRET:-""}
-      authority: ${OM_AUTH_AIRFLOW_AZURE_AUTHORITY_URL:-""}
-      scopes: ${OM_AUTH_AIRFLOW_AZURE_SCOPES:-[]}
-      clientId: ${OM_AUTH_AIRFLOW_AZURE_CLIENT_ID:-""}
+    google:
+      secretKey: ${OM_AUTH_AIRFLOW_GOOGLE_SECRET_KEY_PATH:- ""}
+      audience: ${OM_AUTH_AIRFLOW_GOOGLE_AUDIENCE:-"https://www.googleapis.com/oauth2/v4/token"}
 ```
````
````diff
@@ -20,21 +20,16 @@ AUTHORIZER_ADMIN_PRINCIPALS: [ admin ] # Your `name` from name@domain.com
 AUTHORIZER_INGESTION_PRINCIPALS: [ ingestion-bot ]
 AUTHORIZER_PRINCIPAL_DOMAIN: open-metadata.org
 
-AUTHENTICATION_PROVIDER: azure
+AUTHENTICATION_PROVIDER: google
 AUTHENTICATION_PUBLIC_KEYS:
-  - "https://login.microsoftonline.com/common/discovery/keys"
-AUTHENTICATION_AUTHORITY: "https://login.microsoftonline.com/{Tenant ID}"
+  - "https://www.googleapis.com/oauth2/v3/certs"
+AUTHENTICATION_AUTHORITY: "https://accounts.google.com"
 AUTHENTICATION_CLIENT_ID: Client ID
 AUTHENTICATION_CALLBACK_URL: http://localhost:8585/callback
 
 # Airflow Configuration
-AIRFLOW_AUTH_PROVIDER: azure
-OM_AUTH_AIRFLOW_AZURE_CLIENT_SECRET: Client Secret
-OM_AUTH_AIRFLOW_AZURE_AUTHORITY_URL: "https://login.microsoftonline.com/{Tenant ID}"
-OM_AUTH_AIRFLOW_AZURE_SCOPES:
-  - scope 1
-  - scope 2
-OM_AUTH_AIRFLOW_AZURE_CLIENT_ID: Client Id
+AIRFLOW_AUTH_PROVIDER: google
+OM_AUTH_AIRFLOW_GOOGLE_SECRET_KEY_PATH: /path/to/secret.json
 ```
 
 ## 2. Start Docker
````
````diff
@@ -178,9 +178,7 @@ When setting up the YAML config for the connector, update the `workflowConfig` a
 workflowConfig:
   openMetadataServerConfig:
     hostPort: 'http://localhost:8585/api'
-    authProvider: auth0
+    authProvider: google
     securityConfig:
-      clientId: '{your_client_id}'
-      secretKey: '{your_client_secret}'
-      domain: '{your_domain}'
+      secretKey: '{path-to-json-creds}'
 ```
````
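For context, a connector YAML carrying this `workflowConfig` block is what the ingestion framework executes. A minimal sketch of running one from Python, assuming the `openmetadata-ingestion` package of this era (API names may differ in newer releases) and a placeholder MySQL source:

```python
# Hedged sketch: load a connector config and run it through the ingestion
# framework's Workflow class. All connection values are placeholders.
import yaml
from metadata.ingestion.api.workflow import Workflow

CONFIG = """
source:
  type: mysql
  serviceName: local_mysql
  serviceConnection:
    config:
      type: Mysql
      username: openmetadata_user
      password: openmetadata_password
      hostPort: localhost:3306
  sourceConfig:
    config: {}
sink:
  type: metadata-rest
  config: {}
workflowConfig:
  openMetadataServerConfig:
    hostPort: 'http://localhost:8585/api'
    authProvider: google
    securityConfig:
      secretKey: '{path-to-json-creds}'
"""

workflow = Workflow.create(yaml.safe_load(CONFIG))
workflow.execute()
workflow.raise_from_status()
workflow.stop()
```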
````diff
@@ -22,20 +22,17 @@ global:
       - "<service_application_client_id>"
   principalDomain: "open-metadata.org"
   authentication:
-    provider: "azure"
+    provider: "google"
     publicKeys:
-      - "https://login.microsoftonline.com/common/discovery/keys"
-    authority: "https://login.microsoftonline.com/{Tenant ID}"
-    clientId: "{Client ID}"
+      - "https://www.googleapis.com/oauth2/v3/certs"
+    authority: "https://accounts.google.com"
+    clientId: "{client id}"
     callbackUrl: "http://localhost:8585/callback"
   airflow:
     openmetadata:
-      authProvider: "azure"
-      azure:
-        clientSecret:
-          secretRef: azure-client-secret
-          secretKey: azure-client-secret
-        authority: ""
-        scopes: [ ]
-        clientId: ""
+      authProvider: "google"
+      google:
+        # absolute path of secret file on airflow instance
+        secretKeyPath: ""
+        audience: "https://www.googleapis.com/oauth2/v4/token"
 ```
````
````diff
@@ -11,11 +11,11 @@ Once the `Client Id` and `Client Secret` are generated add the `Client Id` in `o
 ```yaml
 authenticationConfiguration:
-  provider: "azure"
+  provider: "okta"
   publicKeyUrls:
-    - "https://login.microsoftonline.com/common/discovery/keys"
-  authority: "https://login.microsoftonline.com/{Tenant ID}"
-  clientId: "{Client ID}"
+    - "{ISSUER_URL}/v1/keys"
+  authority: "{ISSUER_URL}"
+  clientId: "{CLIENT_ID - SPA APP}"
   callbackUrl: "http://localhost:8585/callback"
 ```
````
````diff
@@ -46,9 +46,10 @@ airflowConfiguration:
   metadataApiEndpoint: ${SERVER_HOST_API_URL:-http://localhost:8585/api}
   authProvider: azure
   authConfig:
-    azure:
-      clientSecret: ${OM_AUTH_AIRFLOW_AZURE_CLIENT_SECRET:-""}
-      authority: ${OM_AUTH_AIRFLOW_AZURE_AUTHORITY_URL:-""}
-      scopes: ${OM_AUTH_AIRFLOW_AZURE_SCOPES:-[]}
-      clientId: ${OM_AUTH_AIRFLOW_AZURE_CLIENT_ID:-""}
+    okta:
+      clientId: ${OM_AUTH_AIRFLOW_OKTA_CLIENT_ID:-""}
+      orgURL: ${OM_AUTH_AIRFLOW_OKTA_ORGANIZATION_URL:-""}
+      privateKey: ${OM_AUTH_AIRFLOW_OKTA_PRIVATE_KEY:-""}
+      email: ${OM_AUTH_AIRFLOW_OKTA_SA_EMAIL:-""}
+      scopes: ${OM_AUTH_AIRFLOW_OKTA_SCOPES:-[]}
 ```
````
````diff
@@ -20,21 +20,20 @@ AUTHORIZER_ADMIN_PRINCIPALS: [ admin ] # Your `name` from name@domain.com
 AUTHORIZER_INGESTION_PRINCIPALS: [ ingestion-bot ]
 AUTHORIZER_PRINCIPAL_DOMAIN: open-metadata.org
 
-AUTHENTICATION_PROVIDER: azure
+AUTHENTICATION_PROVIDER: okta
 AUTHENTICATION_PUBLIC_KEYS:
-  - "https://login.microsoftonline.com/common/discovery/keys"
-AUTHENTICATION_AUTHORITY: "https://login.microsoftonline.com/{Tenant ID}"
-AUTHENTICATION_CLIENT_ID: Client ID
+  - "{ISSUER_URL}/v1/keys"
+AUTHENTICATION_AUTHORITY: "{ISSUER_URL}"
+AUTHENTICATION_CLIENT_ID: "{CLIENT_ID - SPA APP}"
 AUTHENTICATION_CALLBACK_URL: http://localhost:8585/callback
 
 # Airflow Configuration
-AIRFLOW_AUTH_PROVIDER: azure
-OM_AUTH_AIRFLOW_AZURE_CLIENT_SECRET: Client Secret
-OM_AUTH_AIRFLOW_AZURE_AUTHORITY_URL: "https://login.microsoftonline.com/{Tenant ID}"
-OM_AUTH_AIRFLOW_AZURE_SCOPES:
-  - scope 1
-  - scope 2
-OM_AUTH_AIRFLOW_AZURE_CLIENT_ID: Client Id
+AIRFLOW_AUTH_PROVIDER: okta
+OM_AUTH_AIRFLOW_OKTA_CLIENT_ID: ${OM_AUTH_AIRFLOW_OKTA_CLIENT_ID:-""}
+OM_AUTH_AIRFLOW_OKTA_ORGANIZATION_URL: ${OM_AUTH_AIRFLOW_OKTA_ORGANIZATION_URL:-""}
+OM_AUTH_AIRFLOW_OKTA_PRIVATE_KEY: ${OM_AUTH_AIRFLOW_OKTA_PRIVATE_KEY:-""}
+OM_AUTH_AIRFLOW_OKTA_SA_EMAIL: ${OM_AUTH_AIRFLOW_OKTA_SA_EMAIL:-""}
+OM_AUTH_AIRFLOW_OKTA_SCOPES: ${OM_AUTH_AIRFLOW_OKTA_SCOPES:-[]}
 ```
 
 ## 2. Start Docker
````
````diff
@@ -22,20 +22,21 @@ global:
       - "<service_application_client_id>"
   principalDomain: "open-metadata.org"
   authentication:
-    provider: "azure"
+    provider: "okta"
     publicKeys:
-      - "https://login.microsoftonline.com/common/discovery/keys"
-    authority: "https://login.microsoftonline.com/{Tenant ID}"
-    clientId: "{Client ID}"
+      - "{ISSUER_URL}/v1/keys"
+    authority: "{ISSUER_URL}"
+    clientId: "{CLIENT_ID - SPA APP}"
     callbackUrl: "http://localhost:8585/callback"
   airflow:
     openmetadata:
-      authProvider: "azure"
-      azure:
-        clientSecret:
-          secretRef: azure-client-secret
-          secretKey: azure-client-secret
-        authority: ""
-        scopes: [ ]
-        clientId: ""
+      authProvider: "okta"
+      okta:
+        clientId: ""
+        orgUrl: ""
+        privateKey:
+          secretRef: okta-client-private-key-secret
+          secretKey: okta-client-private-key-secret
+        email: ""
+        scopes: [ ]
 ```
````
````diff
@@ -36,7 +36,7 @@ OpenMetadata uses the [Dropwizard](https://www.dropwizard.io/) Java framework to
 
 ## System and Components
 
-<Image src="/images/developers/architecture/system-and-components.png" alt="System and Components" caption="Overview of the OpenMetadata components and high-level interactions."/>
+<Image src="/images/developers/architecture/architecture.png" alt="System and Components" caption="Overview of the OpenMetadata components and high-level interactions."/>
 
 ### Events
 OpenMetadata captures changes to entities as `events` and stores them in the OpenMetadata server database. OpenMetadata also indexes change events in Elasticsearch to make them searchable.
````
@@ -1,215 +0,0 @@

_The following `design.md` page was removed in this commit; its content now lives under `/main-concepts/high-level-design`:_

---
title: Design
slug: /developers/architecture/design
---

# Design
The Solution Design will help us explore and understand the internals of the OpenMetadata services: how they are built and how they interact.

We will start by describing the big picture of the software design of the application. Bit by bit we will get inside specific components, describing their behaviour and showing examples of how to use them.

## System Context
The goal of this first section is to get familiar with the high-level concepts and technologies involved. The learning objectives here are:

- Describe the elements that compose OpenMetadata and their relationships.
- Explain how end-users and external applications can communicate with the system.

Here we have the main actors of the solution:

<Image src="/images/developers/architecture/system-context-diagram.png" alt="OpenMetadata architecture" caption="System Context Diagram"/>

- **API**: This is the main pillar of OpenMetadata. Here we have defined how we can interact with the metadata **Entities**. It powers all the other components of the solution.
- **UI**: Discovery-focused tool that helps users keep track of all the data assets in the organisation. Its goal is enabling and fueling **collaboration**.
- **Ingestion Framework**: Based on the API specifications, this system is the foundation of all the **Connectors**, i.e., the components that define the interaction between OpenMetadata and the external systems containing the metadata we want to integrate.
- **Entity Store**: MySQL storage that contains real-time information on the state of all the **Entities** and their **Relationships**.
- **Search Engine**: Powered by ElasticSearch, it is the indexing system for the UI to help users **discover** the metadata.

## JSON Schemas
If we abstract away from the Storage Layer for a moment, we realize that the OpenMetadata implementation is the integration of three blocks:

1. The core **API**, unifying and centralising the communication with internal and external systems.
2. The **UI** for a team-centric metadata Serving Layer.
3. The **Ingestion Framework** as an interface between OpenMetadata and external sources.

The only thing these components have in common is the **vocabulary** -> all of them are shaping, describing, and moving around metadata **Entities**.

OpenMetadata is based on a **standard definition** for metadata. Therefore, we need to make sure that our implementation of this standard shares this definition in the end-to-end workflow. To this end, the main lexicon is defined as JSON Schemas, a readable and language-agnostic solution.

Then, when packaging the main components, we generate the specific programming classes for all the Entities. What we achieve is three views of the same source:

- Java classes for the API,
- Python classes for the Ingestion Framework, and
- JavaScript classes for the UI,

each of them modelled after a single source of truth. Thanks to this approach we can be sure that no matter at which point we zoom in throughout the whole process, we are always going to find a univocal, well-defined Entity.
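For illustration, here is a hand-written Pydantic sketch of what the generated Python view of an Entity looks like. The real classes are produced automatically from the JSON Schemas at build time, so the fields below are a simplified assumption, not the actual generated code:

```python
# Simplified stand-in for the generated Table/Column models.
from typing import List, Optional
from uuid import UUID
from pydantic import BaseModel


class Column(BaseModel):
    name: str
    dataType: str
    description: Optional[str] = None


class Table(BaseModel):
    id: UUID
    name: str
    fullyQualifiedName: Optional[str] = None
    columns: List[Column] = []


table = Table(
    id="00000000-0000-0000-0000-000000000000",
    name="orders",
    columns=[Column(name="order_id", dataType="BIGINT")],
)
print(table.json())
```

The same schema file would drive the equivalent Java and JavaScript classes, which is what keeps the three views in sync.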
## API Container Diagram
Now we are going to zoom inside the API Container. As the central Software System of the solution, its goal is to **manage calls** (both from internal and external sources, e.g., the Ingestion Framework or any custom integration) and update the **state** of the metadata Entities.

While the data is stored in the MySQL container, the API will be the one fetching it and completing the necessary information, validating the Entities' data and all the relationships.

Having a **Serving Layer** (API) decoupled from the Storage Layer allows users and integrations to ask for what they need in a simple language (REST), without the learning curve of diving into specific data models and design choices.

<Image src="/images/developers/architecture/api-container-diagram.png" alt="OpenMetadata architecture" caption="API Container Diagram"/>

### Entity Resource
When we interact with most of our Entities, we follow the same endpoint structure. For example:

- `GET <url>/api/v1/<collectionName>/<id>` to retrieve an Entity instance by ID, or
- `GET <url>/api/v1/<collectionName>/name/<FQDN>` to query by its fully qualified domain name.

Similarly, we support other CRUD operations, each of them expecting a specific incoming data structure and returning the Entity's class. As the foundations of OpenMetadata are the Entity definitions, we have this data contract with any consumer, where the backend validates the received data as well as the outputs.
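A minimal sketch of that endpoint structure in Python, using the `tables` collection (the server address, ID and fully qualified name are placeholders):

```python
import requests

BASE = "http://localhost:8585/api/v1"

# Retrieve an Entity instance by ID...
by_id = requests.get(f"{BASE}/tables/00000000-0000-0000-0000-000000000000")

# ...or by its fully qualified name, optionally requesting extra fields.
by_name = requests.get(
    f"{BASE}/tables/name/service.database.schema.orders",
    params={"fields": "owner,tags"},
)
print(by_name.status_code)
```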
The endpoint definition and datatype setting are what happens at the **Entity Resource**. Each metadata Entity is packed with a Resource class, which builds the API definition for the given Entity.

This logic is what then surfaces in the API [docs](https://sandbox.open-metadata.org/docs).

### Entity Repository
The goal of the Entity Repository is to perform Read & Write operations to the **backend database** to Create, Retrieve, Update and Delete Entities.

While the Entity Resource handles external communication, the Repository is in charge of managing how the whole process interacts with the Storage Layer, making sure that incoming and outgoing Entities are valid and hold proper and complete information.

This means that here is where we define our **DAO (Data Access Object)**, with all the validation and data storage logic.

As there are processes repeated across all Entities (e.g., listing entities in a collection or getting a specific version of an Entity), the Entity Repository extends an **Interface** that implements some basic functionalities and abstracts Entity-specific logic.

Each Entity then needs to implement its **server-side processes**, such as building the FQDN based on the Entity hierarchy, how the Entity stores and retrieves **Relationship** information with other Entities, or how the Entity reacts to **Change Events**.
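Although the server is written in Java, the split can be sketched schematically; the Python below illustrates the pattern, not the actual implementation:

```python
from abc import ABC, abstractmethod


class EntityRepository(ABC):
    """Shared plumbing: the generic create flow is the same for all Entities."""

    def create(self, entity: dict) -> dict:
        self.prepare(entity)  # validation + server-side attributes
        self.store(entity)    # JSON doc + relationship rows
        return entity

    @abstractmethod
    def prepare(self, entity: dict) -> None:
        ...

    def store(self, entity: dict) -> None:
        ...  # common storage logic shared by all entities


class TableRepository(EntityRepository):
    def prepare(self, entity: dict) -> None:
        # Entity-specific rule: derive the FQDN from the hierarchy.
        entity["fullyQualifiedName"] = ".".join(
            (entity["service"], entity["database"], entity["name"])
        )
```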
## Entity Storage Layer
In the _API Container Diagram_, we showed how the Entity Repository interacts with three different Storage Containers (tables) depending on what type of information is being processed.

To fully understand this decision, we should first talk about the information contained in Entity instances.

An Entity has two types of fields: **attributes** (JSON Schema `properties`) and **relationships** (JSON Schema `href`):

- **Attributes** are the core properties of the Entity: the name and id, the columns for a table, or the algorithm for an ML Model. Those are **intrinsic** pieces of information of an Entity, and their existence and values are what help us differentiate both Entity instances (Table A vs. Table B) and Entity definitions (Dashboard vs. Topic).
- **Relationships** are associations between two Entities. For example, a Table belongs to a Database, a User owns a Dashboard, etc. Relationships are a special type of attribute that is captured using **Entity References**.

### Entity and Relationship Store
Entities are stored as JSON documents in the database. Each entity has an associated table (`<entityName>_entity`) which contains the JSON defining the Entity **attributes** and other metadata fields, such as the id, `updatedAt` or `updatedBy`.

This JSON does not store any Relationship. E.g., a User owning a Dashboard is a piece of information that is materialised in a separate table, `entity_relationship`, as graph nodes, where the edge holds the type of the Relationship (e.g., `contains`, `uses`, `follows`...).

This separation helps us decouple concerns. We can process related entities independently and validate at runtime what information needs to be updated and/or retrieved. For example, if we delete a Dashboard owned by a User, we will then clean up this row in `entity_relationship`, but that won't alter the information of the User.

Another trickier example would be trying to delete a Database that contains Tables. In this case, the process would check that the Database Entity is not empty, and therefore we cannot continue with the removal.
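A toy in-memory model of this two-table layout (the table names mirror the ones above, but the DDL is illustrative, not the server's actual schema):

```python
import json
import sqlite3

db = sqlite3.connect(":memory:")
db.execute("CREATE TABLE table_entity (id TEXT PRIMARY KEY, json TEXT)")
db.execute(
    "CREATE TABLE entity_relationship (fromId TEXT, toId TEXT, relation TEXT)"
)

# Intrinsic attributes go into the entity table as a JSON document...
db.execute(
    "INSERT INTO table_entity VALUES (?, ?)",
    ("t1", json.dumps({"name": "orders", "columns": []})),
)
# ...while ownership is a separate graph edge.
db.execute(
    "INSERT INTO entity_relationship VALUES (?, ?, ?)",
    ("user-42", "t1", "owns"),
)

# Deleting the owned entity only cleans up its edges; the User doc is untouched.
db.execute("DELETE FROM table_entity WHERE id = ?", ("t1",))
db.execute("DELETE FROM entity_relationship WHERE toId = ?", ("t1",))
```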
### Change Events Store
You might have already noticed that all Entity definitions have a `changeDescription` field. It is defined as _"Change that leads to this version of the entity"_. If we inspect the properties of `changeDescription` further, we can see how it stores the differences between the current and last versions of an Entity.

This gives visibility into the latest update of each Entity instance. However, there might be times when this level of tracking is not enough.

One of the greatest features of OpenMetadata is the ability to track all Entity versions. Each operation that leads to a change (`PUT`, `POST`, `PATCH`) will generate a trace that is going to be stored in the table `change_event`.

Using the API to get events data, or directly exploring the different versions of each entity, gives great debugging power to both data consumers and producers.
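For example, a short sketch of pulling an entity's version history through the API (the response shape is assumed here and may vary by release):

```python
import requests

BASE = "http://localhost:8585/api/v1"
table_id = "00000000-0000-0000-0000-000000000000"  # placeholder

history = requests.get(f"{BASE}/tables/{table_id}/versions").json()
# Each entry embeds the entity as of that version, including the
# changeDescription diff against the previous one.
for version in history.get("versions", []):
    print(version)
```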
## API Component Diagram
Now that we have a clear picture of the main pieces and their roles, we will analyze the logical flow of `POST` and `PUT` calls to the API. The main goal of this section is to get familiar with the code organisation and its main steps.

<Note>

To get the most out of this section, it is recommended to follow the source code as well, from the Entity JSON you'd like to use as an example to its implementation of `Resource` and `Repository`.

</Note>

### Create a new Entity - POST
We will start with the simplest scenario: creating a new Entity via a `POST` call. This is a great first point to review, as part of the logic and methods is reused during updates.

<Image src="/images/developers/architecture/post-call-api.png" alt="OpenMetadata architecture" caption="Component Diagram of a POST call to the API"/>

**Create**

As we already know, the recipient of the HTTP call will be the `EntityResource`. In there, we have the `create` function with the `@POST` **annotation** and the description of the API endpoint and expected schemas.

The role of this first component is to receive the call and validate the request body and headers, but the real implementation happens in the `EntityRepository`, which we already described as the **DAO**.

For the `POST` operation, the internal flow is rather simple and is composed of two steps:

1. **Prepare**: validates the Entity data and computes some attributes on the server side.
2. **Store**: saves the Entity JSON and its Relationships to the backend DB.

**Prepare**

This method is used for **validating** an entity to be created during `POST`, `PUT`, and `PATCH` operations and **preparing** the entity with all the required attributes and relationships.

Here we handle, for example, the process of setting up the FQDN of an Entity based on its hierarchy. While all Entities require an FQDN, this is not an attribute we expect to receive in a request.

Moreover, this step checks that the received attributes are correctly populated, e.g., that we have a valid `User` as an `owner` or a valid `Database` for a `Table`.

**Store**

The storing process is divided into two different steps (as we have two tables holding the information):

1. We strip the validated Entity of any href attribute (such as owner or tags) in order to store just a JSON document with the Entity's intrinsic values.
2. We then store the graph representation of the Relationships for the attributes omitted above.

At the end of these calls, we end up with an Entity holding all the required attributes, which have been validated and stored accordingly. We can then return the created Entity to the caller.

### Create or Update an Entity - PUT
Let's now build on top of what we learned during the `POST` discussion, expanding the example to a `PUT` request.

<Image src="/images/developers/architecture/put-call-api.png" alt="OpenMetadata architecture" caption="Component Diagram of a PUT call to the API"/>

The first steps are fairly similar:

1. We have a function in our `Resource` annotated as `@PUT` and handling headers, auth and schemas.
2. The Resource then calls the DAO at the `Repository`, bootstrapping the data-related logic.
3. We validate the Entity and cook some attributes during the `prepare` step.

After processing and validating the Entity request, we then check if the Entity instance has already been stored, querying the backend database by its FQDN. If it has not, then we proceed with the same logic as the `POST` operation -> simple creation. Otherwise, we need to validate the updated fields.

**Set Fields**

We cannot allow all fields to be updated for a given Entity instance. For example, the `id` or `name` stay immutable once the instance is created, and the same thing happens to the `Database` of a `Table`.

The list of fields that can change is defined in each Entity's `Repository`, and we should only allow changes on those attributes that can naturally evolve throughout the **lifecycle** of the object.

At this step, we set the fields on the Entity that are either required by the JSON Schema definition (e.g., the `algorithm` for an `MlModel`) or, in the case of a `GET` operation, that are requested as `GET <url>/api/v1/<collectionName>/<id>?fields=field1,field2...`

**Update**

In the `EntityRepository` there is an abstract implementation of the `EntityUpdater` interface, which is in charge of defining the generic update logic flow common to all the Entities.

The main steps handled in the `update` calls are:

1. Update the Entity **generic** fields, such as the description or the owner.
2. Run Entity **specific** updates, which are implemented by each Entity's `EntityUpdater` extension.
3. **Store** the updated Entity JSON doc to the Entity table in MySQL.

**Entity Specific Updates**

Each Entity has a set of attributes that define it. These attributes are going to have very specific behaviour, so the implementation of the `update` logic falls to each Entity's `Repository`.

For example, we can update the `Columns` of a `Table`, or the Dashboard holding the performance metrics of an `MlModel`. These changes are going to be treated differently, in terms of how the Entity internally performs the update, how the Entity **version** gets affected, and the impact on the **Relationship** data.

For the sake of discussion, we'll follow a couple of `update` scenarios.

**Example 1 - Updating Columns of a Table**

When updating `Columns`, we need to compare the existing set of columns in the original Entity vs. the incoming columns of the `PUT` request.

If we are receiving an existing column, we might need to update its `description` or `tags`. This change will be considered a **minor** change, so the version of the Entity will be bumped by `0.1`, following the software release specification model.

However, what happens if a stored column is not received in the updated instance? That would mean that the column has been deleted. This is a type of change that could possibly break integrations on top of the Table's data, so we can mark this scenario as a **major** update. In this case, the version of the Entity will increase by `1.0`.
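The bump rule itself is easy to state as a tiny helper (a sketch of the rule as described, not the server's code):

```python
def next_version(current: float, breaking: bool) -> float:
    """Minor (backward-compatible) changes bump by 0.1; breaking ones
    jump to the next whole version."""
    if breaking:
        return float(int(current) + 1)  # e.g. 0.3 -> 1.0, 1.7 -> 2.0
    return round(current + 0.1, 1)      # e.g. 0.3 -> 0.4


assert next_version(0.3, breaking=False) == 0.4
assert next_version(0.3, breaking=True) == 1.0
```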
Checking the Change Events or visiting the Entity history will easily show us the evolution of an Entity instance, which is immensely valuable when debugging data issues.

**Example 2 - Updating the Dashboard of an ML Model**

One of the attributes of an MlModel is the `EntityReference` to a `Dashboard` holding the evolution of its performance metrics.

As this attribute is a reference to another existing Entity, this data is not directly stored in the `MlModel` JSON doc, but rather as a Relationship graph, as we have been discussing previously. Therefore, during the `update` step we will need to:

1. Insert the relationship, if the original Entity had no Dashboard informed,
2. Delete the relationship if the Dashboard has been removed, or
3. Update the relationship if we now point to a different Dashboard.

Note how during the `POST` operation we always needed to call the `storeRelationship` function, as it was the first time we were storing the instance's information. During an update, we will only modify the Relationship data if the Entity's specific attributes require it.
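Those three cases reduce to a small reconciliation step; a sketch with hypothetical callables for the graph operations:

```python
def reconcile_dashboard(old_ref, new_ref, insert, delete, update):
    """old_ref/new_ref are Dashboard EntityReference ids (or None)."""
    if old_ref is None and new_ref is not None:
        insert(new_ref)             # 1. no Dashboard was informed before
    elif old_ref is not None and new_ref is None:
        delete(old_ref)             # 2. the Dashboard has been removed
    elif old_ref != new_ref:
        update(old_ref, new_ref)    # 3. now points to a different Dashboard
```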
### Handling Events
During all these discussions and examples we've been showing how the backend API handles HTTP requests and what the Entities' data lifecycle is. Not only have we been focusing on the JSON docs and Relationships, but from time to time we have also talked about Change Events.

Moreover, in the API Container Diagram we drew a Container representing the table holding the Change Event data; yet, we have not found any Component accessing it.

This is because the API server is powered by **Jetty**, which means that luckily we do not need to make those calls ourselves! By defining a `ChangeEventHandler` and registering it during the creation of the server, this postprocessing of the calls happens transparently.

Our `ChangeEventHandler` will check if the Entity has been Created, Updated or Deleted, and will store the appropriate `ChangeEvent` data from our response to the backend DB.
````diff
@@ -16,6 +16,6 @@ OpenMetadata depends on following components to build a metadata platform:
 
 <Image src="/images/developers/architecture/architecture.png" alt="OpenMetadata architecture" caption=" "/>
 
-To understand the OpenMetadata Architecture and how everything fits together please go through [Design page](/developers/architecture/design).
+To understand the OpenMetadata Architecture and how everything fits together please go through [Design page](/main-concepts/high-level-design).
 
 For Schema design and how our API works here is an example of ML [Model entity page](/sdk/python/entities/ml-model)
````
````diff
@@ -12,7 +12,14 @@ The Ingestion Framework is a Python module that wraps the OpenMetadata API and b
 To do so, you can either build and run the [OpenMetadata Server](/developers/contribute/build-code-and-run-tests/openmetadata-server) locally as well, or use the `metadata` CLI to spin up the Docker containers as explained in the [Tooling Status](/developers/contribute/build-code-and-run-tests/ingestion-framework#tooling-status) section of this page.
 
 ## Python Setup
-We recommend using pyenv to properly install and manage different Python versions in your system. Note that OpenMetadata requires Python version +3.8. This [doc](https://python-docs.readthedocs.io/en/latest/dev/virtualenvs.html) might be helpful to set up the environment virtualization.
+We recommend using pyenv to properly install and manage different Python versions in your system. Note that OpenMetadata requires Python version +3.7. This [doc](https://python-docs.readthedocs.io/en/latest/dev/virtualenvs.html) might be helpful to set up the environment virtualization.
+
+<Note>
+
+Using an M1 Mac? To ensure compatibility with the different dependencies, use Python version 3.9.8 or higher. Note that the code
+has not been tested with Python 3.10 due to some libraries not supporting that already.
+
+</Note>
 
 ### Generated Sources
 The backbone of OpenMetadata is the series of JSON schemas defining the Entities and their properties.
````
````diff
@@ -56,7 +63,7 @@ At least once, build the images fully:
 sh docker/run_local_docker.sh
 ```
 
-Which will package the server code. Afterward, if the modifications only impact the ingestion code and you need to refresh the ingestion container, you can run:
+Which will package the server code. Afterward, if the modifications only impact the ingestion code and need to refresh the ingestion container, you can run:
 
 ```shell
 cd docker/local-metadata
````
````diff
@@ -64,4 +71,8 @@ docker compose down -v && docker compose up --build
 ```
 
 ## Running tests
-You can validate the environment by running make coverage from the root directory. Note that from some of the tests, having the OpenMetadata server instance up is required as they interact with the API.
+
+To run the tests, you'll need to install some packages via `make install_test`.
+
+You can validate the environment by running `make coverage` from the root directory. Note that for some tests, having
+the OpenMetadata server instance up is required as they interact with the API.
````
````diff
@@ -863,8 +863,6 @@ site_menu:
   icon: developer_mode
 - category: Developers / Architecture
   url: /developers/architecture
-- category: Developers / Architecture / Design
-  url: /developers/architecture/design
 - category: Developers / Architecture / Understand Code Layout
   url: /developers/architecture/code-layout
 - category: Developers / Open Source Community
````
Five architecture images were also deleted (binary files not shown): 134 KiB, 132 KiB, 172 KiB, 90 KiB, and 114 KiB.