Mirror of https://github.com/datahub-project/datahub.git, synced 2025-07-04 15:50:14 +00:00

Compare commits: 875 commits
@@ -5,6 +5,7 @@
**/.tox/
**/.mypy_cache/
**/.pytest_cache/
**/.ruff_cache/
**/__pycache__/
out
**/*.class
@@ -16,3 +17,6 @@ out
.git/COMMIT_*
.git/index
.gradle

/metadata-ingestion/tests
/metadata-ingestion/examples
.github/ISSUE_TEMPLATE/--bug-report.md (5 changes, vendored)
@@ -3,8 +3,7 @@ name: "\U0001F41EBug report"
about: Create a report to help us improve
title: A short description of the bug
labels: bug
assignees: ''

assignees: ""
---

**Describe the bug**
@@ -12,6 +11,7 @@ A clear and concise description of what the bug is.

**To Reproduce**
Steps to reproduce the behavior:

1. Go to '...'
2. Click on '....'
3. Scroll down to '....'
@@ -24,6 +24,7 @@ A clear and concise description of what you expected to happen.
If applicable, add screenshots to help explain your problem.

**Desktop (please complete the following information):**

- OS: [e.g. iOS]
- Browser [e.g. chrome, safari]
- Version [e.g. 22]
@@ -4,7 +4,6 @@ about: Report issues found in DataHub v1.0 Release Candidates
title: "[v1.0-rc/bug] Description of Bug"
labels: bug, datahub-v1.0-rc
assignees: chriscollins3456, david-leifker, maggiehays

---

**Describe the bug**
@@ -12,6 +11,7 @@ A clear and concise description of what the bug is.

**To Reproduce**
Steps to reproduce the behavior:

1. Go to '...'
2. Click on '....'
3. Scroll down to '....'
@@ -24,6 +24,7 @@ A clear and concise description of what you expected to happen.
If applicable, add screenshots and/or screen recordings to help explain the issue.

**System details (please complete the following information):**

- DataHub Version Tag [e.g. v1.0-rc1]
- OS: [e.g. iOS]
- Browser [e.g. chrome, safari]
.github/actionlint.yaml (8 changes, vendored)
@@ -0,0 +1,8 @@
self-hosted-runner:
  labels:
    - "depot-ubuntu-22.04-small"
    - "depot-ubuntu-22.04-4"
    - "depot-ubuntu-22.04"
    - "depot-ubuntu-24.04-small"
    - "depot-ubuntu-24.04"
    - "depot-ubuntu-24.04-4"
.github/actions/ci-optimization/action.yml (7 changes, vendored)
@@ -41,6 +41,10 @@ outputs:
  smoke-test-change:
    description: "Smoke test change"
    value: ${{ steps.filter.outputs.smoke-test == 'true' }}
  actions-change:
    description: "Actions code has changed"
    value: ${{ steps.filter.outputs.actions == 'true' }}

runs:
  using: "composite"
  steps:
@@ -97,3 +101,6 @@ runs:
            - "docker/elasticsearch-setup/**"
          smoke-test:
            - "smoke-test/**"
          actions:
            - "datahub-actions/**"
            - "docker/datahub-actions/**"
@@ -1,5 +1,5 @@
name: 'Ensure codegen is updated'
description: 'Will check the local filesystem against git, and abort if there are uncommitted changes.'
name: "Ensure codegen is updated"
description: "Will check the local filesystem against git, and abort if there are uncommitted changes."

runs:
  using: "composite"
.github/pull_request_template.md (7 changes, vendored)
@@ -1,8 +1,13 @@
<!--

## Checklist
Thank you for contributing to DataHub!

Before you submit your PR, please go through the checklist below:

- [ ] The PR conforms to DataHub's [Contributing Guideline](https://github.com/datahub-project/datahub/blob/master/docs/CONTRIBUTING.md) (particularly [Commit Message Format](https://github.com/datahub-project/datahub/blob/master/docs/CONTRIBUTING.md#commit-message-format))
- [ ] Links to related issues (if applicable)
- [ ] Tests for the changes have been added/updated (if applicable)
- [ ] Docs related to the changes have been added/updated (if applicable). If a new feature has been added a Usage Guide has been added for the same.
- [ ] For any breaking change/potential downtime/deprecation/big changes an entry has been made in [Updating DataHub](https://github.com/datahub-project/datahub/blob/master/docs/how/updating-datahub.md)

-->
.github/scripts/check_policies.py (48 changes, vendored)
@@ -13,14 +13,32 @@ without_info = []

metadata_privileges = set()
platform_privileges = set()
root_user_platform_policy_privileges = set()
root_user_all_privileges = set()
admin_role_platform_privileges = set()
admin_role_all_privileges = set()
reader_role_all_privileges = set()
editor_role_all_privileges = set()
for policy in all_policies:
    urn = policy["urn"]
    if urn == "urn:li:dataHubPolicy:0":
        root_user_platform_policy_privileges = policy["info"]["privileges"]
        root_user_all_privileges.update(set(root_user_platform_policy_privileges))
    elif urn == "urn:li:dataHubPolicy:1":
        root_user_all_privileges.update(set(policy["info"]["privileges"]))
    elif urn == "urn:li:dataHubPolicy:admin-platform-policy":
        admin_role_platform_privileges = policy["info"]["privileges"]
        admin_role_all_privileges.update(set(admin_role_platform_privileges))
    elif urn == "urn:li:dataHubPolicy:admin-metadata-policy":
        admin_role_all_privileges.update(set(policy["info"]["privileges"]))
    elif urn == "urn:li:dataHubPolicy:editor-platform-policy":
        editor_platform_policy_privileges = policy["info"]["privileges"]
    elif urn == "urn:li:dataHubPolicy:7":
        all_user_platform_policy_privileges = policy["info"]["privileges"]
    elif urn.startswith("urn:li:dataHubPolicy:reader-"):
        reader_role_all_privileges.update(set(policy["info"]["privileges"]))
    elif urn.startswith("urn:li:dataHubPolicy:editor-"):
        editor_role_all_privileges.update(set(policy["info"]["privileges"]))
    try:
        doc_type = policy["info"]["type"]
        privileges = policy["info"]["privileges"]
@@ -49,11 +67,41 @@ print(
"""
)

# Root user has all privileges
diff_policies = set(platform_privileges).difference(
    set(root_user_platform_policy_privileges)
)
assert len(diff_policies) == 0, f"Missing privileges for root user are {diff_policies}"

# admin role and root user have same platform privileges
diff_root_missing_from_admin = set(root_user_platform_policy_privileges).difference(set(admin_role_platform_privileges))
diff_admin_missing_from_root = set(admin_role_platform_privileges).difference(set(root_user_platform_policy_privileges))

assert len(diff_root_missing_from_admin) == 0, f"Admin role missing: {diff_root_missing_from_admin}"
assert len(diff_admin_missing_from_root) == 0, f"Root user missing: {diff_admin_missing_from_root}"

# admin role and root user have same privileges
diff_root_missing_from_admin_all = set(root_user_all_privileges).difference(set(admin_role_all_privileges))
diff_admin_missing_from_root_all = set(admin_role_all_privileges).difference(set(root_user_all_privileges))
## Admin user has EDIT_ENTITY privilege which is super privilege for editing entities
diff_admin_missing_from_root_all_new = set()
for privilege in diff_admin_missing_from_root_all:
    if privilege.startswith("EDIT_"):
        continue
    diff_admin_missing_from_root_all_new.add(privilege)
diff_admin_missing_from_root_all = diff_admin_missing_from_root_all_new

assert len(diff_root_missing_from_admin_all) == 0, f"Admin role missing: {diff_root_missing_from_admin_all}"
assert len(diff_admin_missing_from_root_all) == 0, f"Root user missing: {diff_admin_missing_from_root_all}"

# Editor role has all privileges of Reader
diff_reader_missing_from_editor = set(reader_role_all_privileges).difference(set(editor_role_all_privileges))
assert len(diff_reader_missing_from_editor) == 0, f"Editor role missing: {diff_reader_missing_from_editor}"

# Admin role has all privileges of editor
diff_editor_missing_from_admin = set(editor_role_all_privileges).difference(set(admin_role_all_privileges))
assert len(diff_editor_missing_from_admin) == 0, f"Admin role missing: {diff_editor_missing_from_admin}"

# All users privileges checks
assert "MANAGE_POLICIES" not in all_user_platform_policy_privileges
assert "MANAGE_USERS_AND_GROUPS" not in all_user_platform_policy_privileges
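Every assertion in this script follows the same superset pattern: compute the privileges one role should inherit, subtract what it actually holds, and require the difference to be empty. A minimal, self-contained sketch of that pattern, using made-up privilege names rather than DataHub's real ones:

```python
# Minimal sketch of the superset check used above, with hypothetical privilege names.
editor_privileges = {"VIEW_ENTITY", "EDIT_ENTITY_TAGS"}
admin_privileges = {"VIEW_ENTITY", "EDIT_ENTITY_TAGS", "MANAGE_POLICIES"}

# Everything the editor can do, the admin must also be able to do.
missing_from_admin = editor_privileges.difference(admin_privileges)
assert len(missing_from_admin) == 0, f"Admin role missing: {missing_from_admin}"
```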
.github/scripts/docker_helpers.sh (73 changes, vendored)
@@ -1,4 +1,12 @@
echo "GITHUB_REF: $GITHUB_REF"
#!/bin/bash

REF="${GITHUB_REF:-${GITHUB_REF_FALLBACK:-}}"
if [ -z "$REF" ]; then
  echo "Error: No ref available from GITHUB_REF or fallback"
  exit 1
fi

echo "GITHUB_REF: $REF"
echo "GITHUB_SHA: $GITHUB_SHA"

export MAIN_BRANCH="master"
@@ -12,37 +20,82 @@ export SHORT_SHA=$(get_short_sha)
echo "SHORT_SHA: $SHORT_SHA"

function get_tag {
  echo $(echo ${GITHUB_REF} | sed -e "s,refs/heads/${MAIN_BRANCH},${MAIN_BRANCH_TAG},g" -e 's,refs/tags/,,g' -e 's,refs/pull/\([0-9]*\).*,pr\1,g')
  echo $(echo ${REF} | sed -e "s,refs/heads/${MAIN_BRANCH},${MAIN_BRANCH_TAG},g" -e 's,refs/tags/,,g' -e 's,refs/heads/,,g' -e 's,refs/heads/,,g' -e 's,refs/pull/\([0-9]*\).*,pr\1,g' -e 's,/,-,g')
}

function get_tag_slim {
  echo $(echo ${GITHUB_REF} | sed -e "s,refs/heads/${MAIN_BRANCH},${MAIN_BRANCH_TAG}-slim,g" -e 's,refs/tags/\(.*\),\1-slim,g' -e 's,refs/pull/\([0-9]*\).*,pr\1-slim,g')
  echo $(echo ${REF} | sed -e "s,refs/heads/${MAIN_BRANCH},${MAIN_BRANCH_TAG}-slim,g" -e 's,refs/tags/\(.*\),\1-slim,g' -e 's,refs/heads/\(.*\),\1-slim,g' -e 's,refs/heads/\(.*\),\1-slim,g' -e 's,refs/pull/\([0-9]*\).*,pr\1-slim,g' -e 's,/,-,g')
}

function get_tag_full {
  echo $(echo ${GITHUB_REF} | sed -e "s,refs/heads/${MAIN_BRANCH},${MAIN_BRANCH_TAG}-full,g" -e 's,refs/tags/\(.*\),\1-full,g' -e 's,refs/pull/\([0-9]*\).*,pr\1-full,g')
  echo $(echo ${REF} | sed -e "s,refs/heads/${MAIN_BRANCH},${MAIN_BRANCH_TAG}-full,g" -e 's,refs/tags/\(.*\),\1-full,g' -e 's,refs/heads/\(.*\),\1-full,g' -e 's,refs/heads/\(.*\),\1-full,g' -e 's,refs/pull/\([0-9]*\).*,pr\1-full,g' -e 's,/,-,g')
}

function get_python_docker_release_v {
  echo $(echo ${GITHUB_REF} | sed -e "s,refs/heads/${MAIN_BRANCH},1!0.0.0+docker.${SHORT_SHA},g" -e 's,refs/tags/v\(.*\),1!\1+docker,g' -e 's,refs/pull/\([0-9]*\).*,1!0.0.0+docker.pr\1,g')
function get_python_docker_release_v() {
  echo "$(echo "${REF}" | \
    sed -e "s,refs/heads/${MAIN_BRANCH},1\!0.0.0+docker.${SHORT_SHA},g" \
      -e 's,refs/heads/\(.*\),1!0.0.0+docker.\1,g' \
      -e 's,refs/heads/\(.*\),1!0.0.0+docker.\1,g' \
      -e 's,refs/tags/v\([0-9a-zA-Z.]*\).*,\1+docker,g' \
      -e 's,refs/pull/\([0-9]*\).*,1!0.0.0+docker.pr\1,g' \
      -e 's,/,-,g'
  )"
}
# To run these, set TEST_DOCKER_HELPERS=1 and then copy the function + test cases into a bash shell.
if [ ${TEST_DOCKER_HELPERS:-0} -eq 1 ]; then
  REF="refs/pull/4788/merge" get_python_docker_release_v # '1!0.0.0+docker.pr4788'
  REF="refs/tags/v0.1.2-test" get_python_docker_release_v # '0.1.2'
  REF="refs/tags/v0.1.2.1-test" get_python_docker_release_v # '0.1.2.1'
  REF="refs/tags/v0.1.2rc1-test" get_python_docker_release_v # '0.1.2rc1'
  REF="refs/heads/branch-name" get_python_docker_release_v # '1!0.0.0+docker.branch-name'
  REF="refs/heads/releases/branch-name" get_python_docker_release_v # '1!0.0.0+docker.releases-branch-name'

  GITHUB_REF="refs/tags/v0.1.2rc1" get_tag # '0.1.2rc1'
  GITHUB_REF="refs/tags/v0.1.2rc1" get_tag_slim # '0.1.2rc1-slim'
  GITHUB_REF="refs/tags/v0.1.2rc1" get_tag_full # '0.1.2rc1-full'

  GITHUB_REF="refs/pull/4788/merge" get_tag # 'pr4788'
  GITHUB_REF="refs/pull/4788/merge" get_tag_slim # 'pr4788-slim'
  GITHUB_REF="refs/pull/4788/merge" get_tag_full # 'pr4788-full'

  GITHUB_REF="refs/heads/branch-name" get_tag # 'branch-name'
  GITHUB_REF="refs/heads/branch-name" get_tag_slim # 'branch-name-slim'
  GITHUB_REF="refs/heads/branch-name" get_tag_full # 'branch-name-full'

  GITHUB_REF="refs/heads/releases/branch-name" get_tag # 'releases-branch-name'
  GITHUB_REF="refs/heads/releases/branch-name" get_tag_slim # 'releases-branch-name-slim'
  GITHUB_REF="refs/heads/releases/branch-name" get_tag_full # 'releases-branch-name-full'
fi

function get_unique_tag {
  echo $(echo ${GITHUB_REF} | sed -e "s,refs/heads/${MAIN_BRANCH},${SHORT_SHA},g" -e 's,refs/tags/,,g' -e 's,refs/pull/\([0-9]*\).*,pr\1,g')
  echo $(echo ${REF} | sed -e "s,refs/heads/${MAIN_BRANCH},${SHORT_SHA},g" -e 's,refs/tags/,,g' -e "s,refs/heads/.*,${SHORT_SHA},g" -e 's,refs/pull/\([0-9]*\).*,pr\1,g')
}

function get_unique_tag_slim {
  echo $(echo ${GITHUB_REF} | sed -e "s,refs/heads/${MAIN_BRANCH},${SHORT_SHA}-slim,g" -e 's,refs/tags/\(.*\),\1-slim,g' -e 's,refs/pull/\([0-9]*\).*,pr\1-slim,g')
  echo $(echo ${REF} | sed -e "s,refs/heads/${MAIN_BRANCH},${SHORT_SHA}-slim,g" -e 's,refs/tags/\(.*\),\1-slim,g' -e "s,refs/heads/.*,${SHORT_SHA}-slim,g" -e 's,refs/pull/\([0-9]*\).*,pr\1-slim,g')
}

function get_unique_tag_full {
  echo $(echo ${GITHUB_REF} | sed -e "s,refs/heads/${MAIN_BRANCH},${SHORT_SHA}-full,g" -e 's,refs/tags/\(.*\),\1-full,g' -e 's,refs/pull/\([0-9]*\).*,pr\1-full,g')
  echo $(echo ${REF} | sed -e "s,refs/heads/${MAIN_BRANCH},${SHORT_SHA}-full,g" -e 's,refs/tags/\(.*\),\1-full,g' -e "s,refs/heads/.*,${SHORT_SHA}-full,g" -e 's,refs/pull/\([0-9]*\).*,pr\1-full,g')
}

function get_platforms_based_on_branch {
  if [ "${{ github.event_name }}" == 'push' && "${{ github.ref }}" == "refs/heads/${MAIN_BRANCH}" ]; then
  if [ "${GITHUB_EVENT_NAME}" == "push" ] && [ "${REF}" == "refs/heads/${MAIN_BRANCH}" ]; then
    echo "linux/amd64,linux/arm64"
  else
    echo "linux/amd64"
  fi
}

function echo_tags {
  echo "short_sha=${SHORT_SHA}"
  echo "tag=$(get_tag)"
  echo "slim_tag=$(get_tag_slim)"
  echo "full_tag=$(get_tag_full)"
  echo "unique_tag=$(get_unique_tag)"
  echo "unique_slim_tag=$(get_unique_tag_slim)"
  echo "unique_full_tag=$(get_unique_tag_full)"
  echo "python_release_version=$(get_python_docker_release_v)"
  echo "branch_name=${GITHUB_HEAD_REF:-${REF#refs/heads/}}"
  echo "repository_name=${GITHUB_REPOSITORY#*/}"
}
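For readers who do not want to trace the sed chains by hand, the branch and pull-request handling in the new get_tag boils down to: map the main branch to its fixed tag, collapse pull-request merge refs to pr<N>, strip the refs/heads/ prefix, and replace slashes with dashes. A rough Python re-implementation of just that part follows; it is a sketch, not the script itself: tag refs are omitted, and the "head" default for the main-branch tag is an assumption, since MAIN_BRANCH_TAG is defined outside the excerpt shown above.

```python
import re

def tag_from_ref(ref: str, main_branch_tag: str = "head") -> str:
    # Mirrors the branch/PR portion of get_tag's sed chain:
    # master -> main-branch tag, refs/pull/<N>/... -> pr<N>,
    # strip refs/heads/, then replace "/" with "-".
    ref = ref.replace("refs/heads/master", main_branch_tag)
    ref = re.sub(r"refs/pull/([0-9]*).*", r"pr\1", ref)
    ref = ref.replace("refs/heads/", "")
    return ref.replace("/", "-")

# These two expectations match the test comments embedded in the script.
assert tag_from_ref("refs/pull/4788/merge") == "pr4788"
assert tag_from_ref("refs/heads/releases/branch-name") == "releases-branch-name"
```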
.github/scripts/generate_pre_commit.py (36 changes, vendored)
@@ -19,6 +19,7 @@ class ProjectType(Enum):

    JAVA = auto()
    PYTHON = auto()
    PRETTIER = auto()


@dataclass
@@ -27,6 +28,8 @@ class Project:

    path: str
    type: ProjectType
    taskName: str | None = None  # Used for prettier projects
    filePattern: str | None = None  # Used for prettier projects

    @property
    def gradle_path(self) -> str:
@@ -151,8 +154,12 @@ class HookGenerator:
        for project in self.projects:
            if project.type == ProjectType.PYTHON:
                hooks.append(self._generate_lint_fix_hook(project))
            else:  # ProjectType.JAVA
            elif project.type == ProjectType.JAVA:
                hooks.append(self._generate_spotless_hook(project))
            elif project.type == ProjectType.PRETTIER:
                hooks.append(self._generate_prettier_hook(project))
            else:
                print(f"Warning: Unsupported project type {project.type} for {project.path}")

        config = {"repos": [{"repo": "local", "hooks": hooks}]}

@@ -203,6 +210,17 @@ class HookGenerator:
            "pass_filenames": False,
        }

    def _generate_prettier_hook(self, project: Project) -> dict:
        """Generate a prettier hook for projects."""
        return {
            "id": f"{project.project_id}-{project.taskName}",
            "name": f"{project.taskName}",
            "entry": f"./gradlew {project.gradle_path}:{project.taskName}",
            "language": "system",
            "files": project.filePattern,
            "pass_filenames": False,
        }


class PrecommitDumper(yaml.Dumper):
    """Custom YAML dumper that maintains proper indentation."""
@@ -253,7 +271,21 @@ def main():

    # Find projects
    finder = ProjectFinder(root_dir)
    projects = finder.find_all_projects()
    prettier_projects = [
        Project(
            path="datahub-web-react",
            type=ProjectType.PRETTIER,
            taskName="mdPrettierWriteChanged",
            filePattern="^.*\\.md$",
        ),
        Project(
            path="datahub-web-react",
            type=ProjectType.PRETTIER,
            taskName="githubActionsPrettierWriteChanged",
            filePattern="^\\.github/.*\\.(yml|yaml)$"
        ),
    ]
    projects = [*prettier_projects, *finder.find_all_projects()]

    # Print summary
    print("Found projects:")
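To see what the new PRETTIER project type produces end to end, the dict returned by _generate_prettier_hook can be dumped the same way the generator writes its config. The sketch below is a hypothetical standalone rendering: the "datahub-web-react" project_id and the ":datahub-web-react" gradle path are assumed values for illustration, not output captured from the script.

```python
import yaml

# Hypothetical hook entry for the markdown prettier project defined above,
# assuming project_id == "datahub-web-react" and gradle_path == ":datahub-web-react".
hook = {
    "id": "datahub-web-react-mdPrettierWriteChanged",
    "name": "mdPrettierWriteChanged",
    "entry": "./gradlew :datahub-web-react:mdPrettierWriteChanged",
    "language": "system",
    "files": "^.*\\.md$",
    "pass_filenames": False,
}
# Wrap it in the same top-level structure the generator builds.
print(yaml.safe_dump({"repos": [{"repo": "local", "hooks": [hook]}]}, sort_keys=False))
```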
.github/scripts/pre-commit-override.yaml (12 changes, vendored)
@@ -7,3 +7,15 @@ repos:
      language: system
      files: ^smoke-test/tests/cypress/.*\.tsx$
      pass_filenames: false
    - id: update-capability-summary
      name: update-capability-summary
      entry: ./gradlew :metadata-ingestion:capabilitySummary
      language: system
      files: ^metadata-ingestion/src/datahub/ingestion/source/.*\.py$
      pass_filenames: false
    - id: update-lineage-file
      name: update-lineage-file
      entry: ./gradlew :metadata-ingestion:lineageGen
      language: system
      files: ^(metadata-ingestion-modules/.*|metadata-models/.*)$
      pass_filenames: false
.github/workflows/actions.yml (70 changes, vendored)
@@ -0,0 +1,70 @@
name: DataHub Actions
on:
  push:
    branches:
      - master
      - releases/**
    paths:
      - ".github/workflows/actions.yml"
      - "datahub-actions/**"
      - "metadata-ingestion/**"
      - "metadata-models/**"
  pull_request:
    branches:
      - "**"
    paths:
      - ".github/workflows/actions.yml"
      - "datahub-actions/**"
      - "metadata-ingestion/**"
      - "metadata-models/**"
  release:
    types: [published]
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  build:
    runs-on: ubuntu-latest
    timeout-minutes: 60
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.10"
      - name: Test packages are correct
        run: |
          cd datahub-actions;
          python -c 'import setuptools; where="./src"; assert setuptools.find_packages(where) == setuptools.find_namespace_packages(where), "you seem to be missing or have extra __init__.py files"'
      - name: Gradle build (and test)
        run: |
          ./gradlew :datahub-actions:build

      - uses: actions/upload-artifact@v4
        if: always()
        with:
          name: Test Results (build)
          path: |
            **/build/reports/tests/test/**
            **/build/test-results/test/**
            **/junit.*.xml
      - name: Upload datahub-actions coverage to Codecov
        uses: codecov/codecov-action@v5
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          #handle_no_reports_found: true
          fail_ci_if_error: false
          name: datahub-actions
          verbose: true
          override_branch: ${{ github.head_ref || github.ref_name }}

  event-file:
    runs-on: ubuntu-latest
    steps:
      - name: Upload
        uses: actions/upload-artifact@v4
        with:
          name: Event File
          path: ${{ github.event_path }}
.github/workflows/airflow-plugin.yml (26 changes, vendored)
@@ -3,6 +3,7 @@ on:
  push:
    branches:
      - master
      - releases/**
    paths:
      - ".github/workflows/airflow-plugin.yml"
      - "metadata-ingestion-modules/airflow-plugin/**"
@@ -34,21 +35,20 @@ jobs:
        include:
          # Note: this should be kept in sync with tox.ini.
          - python-version: "3.8"
            extra_pip_requirements: "apache-airflow~=2.3.4"
            extra_pip_extras: test-airflow23
            extra_pip_requirements: "apache-airflow~=2.7.3"
            extra_pip_constraints: "-c https://raw.githubusercontent.com/apache/airflow/constraints-2.7.3/constraints-3.8.txt"
          - python-version: "3.10"
            extra_pip_requirements: "apache-airflow~=2.4.3"
            extra_pip_extras: test-airflow24
            extra_pip_requirements: "apache-airflow~=2.7.3"
            extra_pip_constraints: "-c https://raw.githubusercontent.com/apache/airflow/constraints-2.7.3/constraints-3.10.txt"
          - python-version: "3.10"
            extra_pip_requirements: "apache-airflow~=2.6.3 -c https://raw.githubusercontent.com/apache/airflow/constraints-2.6.3/constraints-3.10.txt"
          - python-version: "3.10"
            extra_pip_requirements: "apache-airflow~=2.7.3 -c https://raw.githubusercontent.com/apache/airflow/constraints-2.7.3/constraints-3.10.txt"
          - python-version: "3.10"
            extra_pip_requirements: "apache-airflow~=2.8.1 -c https://raw.githubusercontent.com/apache/airflow/constraints-2.8.1/constraints-3.10.txt"
            extra_pip_requirements: "apache-airflow~=2.8.1"
            extra_pip_constraints: "-c https://raw.githubusercontent.com/apache/airflow/constraints-2.8.1/constraints-3.10.txt"
          - python-version: "3.11"
            extra_pip_requirements: "apache-airflow~=2.9.3 -c https://raw.githubusercontent.com/apache/airflow/constraints-2.9.3/constraints-3.11.txt"
            extra_pip_requirements: "apache-airflow~=2.9.3"
            extra_pip_constraints: "-c https://raw.githubusercontent.com/apache/airflow/constraints-2.9.3/constraints-3.11.txt"
          - python-version: "3.11"
            extra_pip_requirements: "apache-airflow~=2.10.3 -c https://raw.githubusercontent.com/apache/airflow/constraints-2.10.3/constraints-3.11.txt"
            extra_pip_requirements: "apache-airflow~=2.10.3"
            extra_pip_constraints: "-c https://raw.githubusercontent.com/apache/airflow/constraints-2.10.3/constraints-3.11.txt"
        fail-fast: false
    steps:
      - name: Set up JDK 17
@@ -65,7 +65,7 @@ jobs:
      - name: Install dependencies
        run: ./metadata-ingestion/scripts/install_deps.sh
      - name: Install airflow package and test (extras ${{ matrix.extra_pip_requirements }})
        run: ./gradlew -Pextra_pip_requirements='${{ matrix.extra_pip_requirements }}' -Pextra_pip_extras='${{ matrix.extra_pip_extras }}' :metadata-ingestion-modules:airflow-plugin:build
        run: ./gradlew -Pextra_pip_requirements='${{ matrix.extra_pip_requirements }}' -Pextra_pip_constraints='${{ matrix.extra_pip_constraints }}' -Pextra_pip_extras='${{ matrix.extra_pip_extras }}' :metadata-ingestion-modules:airflow-plugin:build
      - name: pip freeze show list installed
        if: always()
        run: source metadata-ingestion-modules/airflow-plugin/venv/bin/activate && uv pip freeze
@@ -88,11 +88,13 @@ jobs:
          flags: ingestion-airflow
          name: pytest-airflow-${{ matrix.python-version }}-${{ matrix.extra_pip_requirements }}
          verbose: true
          override_branch: ${{ github.head_ref || github.ref_name }}
      - name: Upload test results to Codecov
        if: ${{ !cancelled() }}
        uses: codecov/test-results-action@v1
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          override_branch: ${{ github.head_ref || github.ref_name }}

  event-file:
    runs-on: ubuntu-latest
.github/workflows/build-and-test.yml (62 changes, vendored)
@@ -3,16 +3,16 @@ on:
  push:
    branches:
      - master
      - releases/**
    paths-ignore:
      - "docs/**"
      - "**.md"
  pull_request:
    branches:
      - "**"
    paths-ignore:
      - "docs/**"
      - "**.md"
  workflow_dispatch:
  schedule:
    - cron: "0 0 * * *" # Run at midnight UTC every day
  release:
    types: [published]

@@ -24,10 +24,10 @@ jobs:
  setup:
    runs-on: ubuntu-latest
    outputs:
      frontend_change: ${{ steps.ci-optimize.outputs.frontend-change == 'true' }}
      frontend_change: ${{ steps.ci-optimize.outputs.frontend-change == 'true' || github.event_name != 'pull_request' }}
      ingestion_change: ${{ steps.ci-optimize.outputs.ingestion-change == 'true' }}
      backend_change: ${{ steps.ci-optimize.outputs.backend-change == 'true' }}
      docker_change: ${{ steps.ci-optimize.outputs.docker-change == 'true' }}
      backend_change: ${{ steps.ci-optimize.outputs.backend-change == 'true' || github.event_name != 'pull_request'}}
      docker_change: ${{ steps.ci-optimize.outputs.docker-change == 'true' || github.event_name != 'pull_request' }}
      frontend_only: ${{ steps.ci-optimize.outputs.frontend-only == 'true' }}
      ingestion_only: ${{ steps.ci-optimize.outputs.ingestion-only == 'true' }}
      kafka_setup_change: ${{ steps.ci-optimize.outputs.kafka-setup-change == 'true' }}
@@ -106,17 +106,21 @@ jobs:
            -x :datahub-web-react:build \
            -x :metadata-integration:java:datahub-schematron:cli:test \
            --parallel
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
      - name: Gradle build (and test) for frontend
        if: ${{ matrix.command == 'frontend' && needs.setup.outputs.frontend_change == 'true' }}
        run: |
          ./gradlew :datahub-frontend:build :datahub-web-react:build --parallel
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
      - name: Gradle compile (jdk8) for legacy Spark
        if: ${{ matrix.command == 'except_metadata_ingestion' && needs.setup.outputs.backend_change == 'true' }}
        run: |
          ./gradlew -PjavaClassVersionDefault=8 :metadata-integration:java:spark-lineage:compileJava
      - name: Gather coverage files
        run: |
          echo "BACKEND_FILES=`find ./build/coverage-reports/ -type f | grep -E '(metadata-models|entity-registry|datahuyb-graphql-core|metadata-io|metadata-jobs|metadata-utils|metadata-service|medata-dao-impl|metadata-operation|li-utils|metadata-integration|metadata-events|metadata-auth|ingestion-scheduler|notifications|datahub-upgrade)' | xargs | sed 's/ /,/g'`" >> $GITHUB_ENV
          echo "BACKEND_FILES=`find ./build/coverage-reports/ -type f | grep -E '(metadata-models|entity-registry|datahub-graphql-core|metadata-io|metadata-jobs|metadata-utils|metadata-service|medata-dao-impl|metadata-operation|li-utils|metadata-integration|metadata-events|metadata-auth|ingestion-scheduler|notifications|datahub-upgrade)' | xargs | sed 's/ /,/g'`" >> $GITHUB_ENV
          echo "FRONTEND_FILES=`find ./build/coverage-reports/ -type f | grep -E '(datahub-frontend|datahub-web-react).*\.(xml|json)$' | xargs | sed 's/ /,/g'`" >> $GITHUB_ENV
      - name: Generate tz artifact name
        run: echo "NAME_TZ=$(echo ${{ matrix.timezone }} | tr '/' '-')" >> $GITHUB_ENV
@@ -132,7 +136,7 @@ jobs:
      - name: Ensure codegen is updated
        uses: ./.github/actions/ensure-codegen-updated
      - name: Upload backend coverage to Codecov
        if: ${{ matrix.command == 'except_metadata_ingestion' && needs.setup.outputs.backend_change == 'true' }}
        if: ${{ (matrix.command == 'except_metadata_ingestion' && needs.setup.outputs.backend_change == 'true' && github.event_name != 'release') }}
        uses: codecov/codecov-action@v5
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
@@ -143,8 +147,22 @@ jobs:
          flags: backend
          name: ${{ matrix.command }}
          verbose: true
      - name: Upload backend coverage to Codecov on release
        if: ${{ (matrix.command == 'except_metadata_ingestion' && github.event_name == 'release' ) }}
        uses: codecov/codecov-action@v5
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          files: ${{ env.BACKEND_FILES }}
          disable_search: true
          #handle_no_reports_found: true
          fail_ci_if_error: false
          flags: backend
          name: ${{ matrix.command }}
          verbose: true
          override_branch: ${{ github.head_ref || github.ref_name }}

      - name: Upload frontend coverage to Codecov
        if: ${{ matrix.command == 'frontend' && needs.setup.outputs.frontend_change == 'true' }}
        if: ${{ (matrix.command == 'frontend' && needs.setup.outputs.frontend_change == 'true' && github.event_name != 'release') }}
        uses: codecov/codecov-action@v5
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
@@ -155,13 +173,33 @@ jobs:
          flags: frontend
          name: ${{ matrix.command }}
          verbose: true

      - name: Upload frontend coverage to Codecov on Release
        if: ${{ (matrix.command == 'frontend' && github.event_name == 'release') }}
        uses: codecov/codecov-action@v5
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          files: ${{ env.FRONTEND_FILES }}
          disable_search: true
          #handle_no_reports_found: true
          fail_ci_if_error: false
          flags: frontend
          name: ${{ matrix.command }}
          verbose: true
          override_branch: ${{ github.head_ref || github.ref_name }}
      - name: Upload test results to Codecov
        if: ${{ !cancelled() }}
        if: ${{ !cancelled() && github.event_name != 'release' }}
        uses: codecov/test-results-action@v1
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
      - name: Upload test results to Codecov on release
        if: ${{ !cancelled() && github.event_name == 'release' }}
        uses: codecov/test-results-action@v1
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          override_branch: ${{ github.head_ref || github.ref_name }}

  quickstart-compose-validation:
  docker-codegen-validation:
    runs-on: ubuntu-latest
    needs: setup
    if: ${{ needs.setup.outputs.docker_change == 'true' }}
@@ -173,6 +211,8 @@ jobs:
          python-version: "3.10"
      - name: Quickstart Compose Validation
        run: ./docker/quickstart/generate_and_compare.sh
      - name: Docker Snippet Validation
        run: python python-build/generate_ingestion_docker.py --check

  event-file:
    runs-on: ubuntu-latest
.github/workflows/check-datahub-jars.yml (1 change, vendored)
@@ -4,6 +4,7 @@ on:
  push:
    branches:
      - master
      - releases/**
    paths:
      - "metadata-integration/**"
  pull_request:
.github/workflows/close-stale-issues.yml (2 changes, vendored)
@@ -19,7 +19,7 @@ jobs:
          days-before-issue-close: 30
          stale-issue-label: "stale"
          stale-issue-message:
            "This issue is stale because it has been open for 30 days with no activity. If you believe this is still an issue on the latest DataHub release please leave a comment with the version that you tested it with. If this is a question/discussion please head to https://slack.datahubproject.io.\
            "This issue is stale because it has been open for 30 days with no activity. If you believe this is still an issue on the latest DataHub release please leave a comment with the version that you tested it with. If this is a question/discussion please head to https://datahub.com/slack.\
            \ For feature requests please use https://feature-requests.datahubproject.io"
          close-issue-message: "This issue was closed because it has been inactive for 30 days since being marked as stale."
          days-before-pr-stale: -1
.github/workflows/dagster-plugin.yml (3 changes, vendored)
@@ -3,6 +3,7 @@ on:
  push:
    branches:
      - master
      - releases/**
    paths:
      - ".github/workflows/dagster-plugin.yml"
      - "metadata-ingestion-modules/dagster-plugin/**"
@@ -75,11 +76,13 @@ jobs:
          flags: ingestion-dagster-plugin
          name: pytest-dagster
          verbose: true
          override_branch: ${{ github.head_ref || github.ref_name }}
      - name: Upload test results to Codecov
        if: ${{ !cancelled() }}
        uses: codecov/test-results-action@v1
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          override_branch: ${{ github.head_ref || github.ref_name }}

  event-file:
    runs-on: ubuntu-latest
.github/workflows/docker-ingestion-smoke.yml (3 changes, vendored)
@@ -28,6 +28,9 @@ jobs:
        uses: acryldata/sane-checkout-action@v3
      - name: Compute Tag
        id: tag
        env:
          GITHUB_REF_FALLBACK: ${{ github.ref }}
          GITHUB_EVENT_NAME: ${{ github.event_name }}
        run: |
          source .github/scripts/docker_helpers.sh
          echo "tag=$(get_tag)" >> $GITHUB_OUTPUT
.github/workflows/docker-postgres-setup.yml (61 changes, vendored)
@@ -1,61 +0,0 @@
name: postgres-setup docker
on:
  push:
    branches:
      - master
    paths:
      - "docker/postgres-setup/**"
      - ".github/workflows/docker-postgres-setup.yml"
  pull_request:
    branches:
      - "**"
    paths:
      - "docker/postgres-setup/**"
      - ".github/workflows/docker-postgres-setup.yml"
  release:
    types: [published]

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  setup:
    runs-on: ubuntu-latest
    outputs:
      tag: ${{ steps.tag.outputs.tag }}
      publish: ${{ steps.publish.outputs.publish }}
    steps:
      - name: Checkout
        uses: acryldata/sane-checkout-action@v3
      - name: Compute Tag
        id: tag
        run: |
          source .github/scripts/docker_helpers.sh
          echo "tag=$(get_tag)" >> $GITHUB_OUTPUT
      - name: Check whether publishing enabled
        id: publish
        env:
          ENABLE_PUBLISH: ${{ secrets.ACRYL_DOCKER_PASSWORD }}
        run: |
          echo "Enable publish: ${{ env.ENABLE_PUBLISH != '' }}"
          echo "publish=${{ env.ENABLE_PUBLISH != '' }}" >> $GITHUB_OUTPUT
  push_to_registries:
    name: Build and Push Docker Image to Docker Hub
    runs-on: ubuntu-latest
    needs: setup
    steps:
      - name: Check out the repo
        uses: acryldata/sane-checkout-action@v3
      - name: Build and push
        uses: ./.github/actions/docker-custom-build-and-push
        with:
          images: |
            acryldata/datahub-postgres-setup
          image_tag: ${{ needs.setup.outputs.tag }}
          username: ${{ secrets.ACRYL_DOCKER_USERNAME }}
          password: ${{ secrets.ACRYL_DOCKER_PASSWORD }}
          publish: ${{ needs.setup.outputs.publish == 'true' }}
          context: .
          file: ./docker/postgres-setup/Dockerfile
          platforms: linux/amd64,linux/arm64
1260
.github/workflows/docker-unified.yml
vendored
File diff suppressed because it is too large
3
.github/workflows/documentation.yml
vendored
@ -53,6 +53,9 @@ jobs:
|
||||
key: ${{ runner.os }}-uv-${{ hashFiles('**/requirements.txt') }}
|
||||
- name: Install Python dependencies
|
||||
run: ./metadata-ingestion/scripts/install_deps.sh
|
||||
- name: Run tests
|
||||
run: |
|
||||
./gradlew --info :metadata-ingestion:testScripts
|
||||
- name: Build Docs
|
||||
run: |
|
||||
./gradlew --info docs-website:build
|
||||
|
30
.github/workflows/github-actions-format.yml
vendored
Normal file
@ -0,0 +1,30 @@
|
||||
name: github actions format
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths:
|
||||
- ".github/**/*.{yml,yaml}"
|
||||
pull_request:
|
||||
branches:
|
||||
- "**"
|
||||
paths:
|
||||
- ".github/**/*.{yml,yaml}"
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
github_actions_format_check:
|
||||
name: github_actions_format_check
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: acryldata/sane-checkout-action@v3
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
- name: run prettier --check
|
||||
run: |-
|
||||
./gradlew :datahub-web-react:githubActionsPrettierCheck
|
3
.github/workflows/gx-plugin.yml
vendored
@ -3,6 +3,7 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
- releases/**
|
||||
paths:
|
||||
- ".github/workflows/gx-plugin.yml"
|
||||
- "metadata-ingestion-modules/gx-plugin/**"
|
||||
@ -79,11 +80,13 @@ jobs:
|
||||
flags: ingestion-gx-plugin
|
||||
name: pytest-gx
|
||||
verbose: true
|
||||
override_branch: ${{ github.head_ref || github.ref_name }}
|
||||
- name: Upload test results to Codecov
|
||||
if: ${{ !cancelled() }}
|
||||
uses: codecov/test-results-action@v1
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }}
|
||||
override_branch: ${{ github.head_ref || github.ref_name }}
|
||||
|
||||
event-file:
|
||||
runs-on: ubuntu-latest
|
||||
|
30
.github/workflows/markdown-format.yml
vendored
Normal file
@ -0,0 +1,30 @@
|
||||
name: markdown format
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths:
|
||||
- "**/*.md"
|
||||
pull_request:
|
||||
branches:
|
||||
- "**"
|
||||
paths:
|
||||
- "**/*.md"
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
markdown_format_check:
|
||||
name: markdown_format_check
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: acryldata/sane-checkout-action@v3
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
- name: run prettier --check
|
||||
run: |-
|
||||
./gradlew :datahub-web-react:mdPrettierCheck
|
23
.github/workflows/metadata-ingestion.yml
vendored
@ -3,6 +3,7 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
- releases/**
|
||||
paths:
|
||||
- ".github/workflows/metadata-ingestion.yml"
|
||||
- "metadata-ingestion/**"
|
||||
@ -68,9 +69,25 @@ jobs:
|
||||
run: ./metadata-ingestion/scripts/install_deps.sh
|
||||
- name: Install package
|
||||
run: ./gradlew :metadata-ingestion:installPackageOnly
|
||||
- name: Run lint alongwith testQuick
|
||||
- name: Check lint passes and autogenerated JSON files are up-to-date
|
||||
if: ${{ matrix.command == 'testQuick' }}
|
||||
run: ./gradlew :metadata-ingestion:lint
|
||||
run: |
|
||||
./gradlew :metadata-ingestion:lint
|
||||
- name: Check autogenerated JSON files are up-to-date
|
||||
if: ${{ matrix.command == 'testQuick' }}
|
||||
run: |
|
||||
./gradlew :metadata-ingestion:capabilitySummary :metadata-ingestion:lineageGen
|
||||
for json_file in metadata-ingestion/src/datahub/ingestion/autogenerated/*.json; do
|
||||
filename=$(basename "$json_file")
|
||||
if git diff --quiet "$json_file"; then
|
||||
echo "✅ $filename is unchanged"
|
||||
else
|
||||
echo "❌ $filename has changed. Please commit the updated file."
|
||||
echo "Changed lines:"
|
||||
git diff "$json_file"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
- name: Run metadata-ingestion tests
|
||||
run: ./gradlew :metadata-ingestion:${{ matrix.command }}
|
||||
- name: Debug info
|
||||
@ -99,11 +116,13 @@ jobs:
|
||||
flags: ingestion
|
||||
name: pytest-${{ matrix.python-version }}-${{ matrix.command }}
|
||||
verbose: true
|
||||
override_branch: ${{ github.head_ref || github.ref_name }}
|
||||
- name: Upload test results to Codecov
|
||||
if: ${{ !cancelled() }}
|
||||
uses: codecov/test-results-action@v1
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }}
|
||||
override_branch: ${{ github.head_ref || github.ref_name }}
|
||||
|
||||
event-file:
|
||||
runs-on: ubuntu-latest
|
||||
|
13
.github/workflows/metadata-io.yml
vendored
@ -3,6 +3,7 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
- releases/**
|
||||
paths:
|
||||
- "**/*.gradle"
|
||||
- "li-utils/**"
|
||||
@ -30,9 +31,9 @@ jobs:
|
||||
setup:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
frontend_change: ${{ steps.ci-optimize.outputs.frontend-change == 'true' }}
|
||||
ingestion_change: ${{ steps.ci-optimize.outputs.ingestion-change == 'true' }}
|
||||
backend_change: ${{ steps.ci-optimize.outputs.backend-change == 'true' }}
|
||||
frontend_change: ${{ steps.ci-optimize.outputs.frontend-change == 'true' || github.event_name == 'release' }}
|
||||
ingestion_change: ${{ steps.ci-optimize.outputs.ingestion-change == 'true' || github.event_name == 'release' }}
|
||||
backend_change: ${{ steps.ci-optimize.outputs.backend-change == 'true' || github.event_name == 'release' }}
|
||||
docker_change: ${{ steps.ci-optimize.outputs.docker-change == 'true' }}
|
||||
frontend_only: ${{ steps.ci-optimize.outputs.frontend-only == 'true' }}
|
||||
ingestion_only: ${{ steps.ci-optimize.outputs.ingestion-only == 'true' }}
|
||||
@ -58,10 +59,6 @@ jobs:
|
||||
- name: Disk Check
|
||||
run: df -h . && docker images
|
||||
- uses: acryldata/sane-checkout-action@v3
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
cache: "pip"
|
||||
- name: Set up JDK 17
|
||||
uses: actions/setup-java@v4
|
||||
with:
|
||||
@ -92,11 +89,13 @@ jobs:
|
||||
flags: metadata-io
|
||||
name: metadata-io-test
|
||||
verbose: true
|
||||
override_branch: ${{ github.head_ref || github.ref_name }}
|
||||
- name: Upload test results to Codecov
|
||||
if: ${{ !cancelled() }}
|
||||
uses: codecov/test-results-action@v1
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }}
|
||||
override_branch: ${{ github.head_ref || github.ref_name }}
|
||||
|
||||
event-file:
|
||||
runs-on: ubuntu-latest
|
||||
|
17
.github/workflows/pr-labeler.yml
vendored
@ -19,9 +19,8 @@ jobs:
|
||||
repo-token: "${{ secrets.GITHUB_TOKEN }}"
|
||||
configuration-path: ".github/pr-labeler-config.yml"
|
||||
- uses: actions-ecosystem/action-add-labels@v1.1.3
|
||||
# only add names of Acryl Data team members here
|
||||
if:
|
||||
${{
|
||||
# only add names of DataHub team members here
|
||||
if: ${{
|
||||
!contains(
|
||||
fromJson('[
|
||||
"anshbansal",
|
||||
@ -52,7 +51,14 @@ jobs:
|
||||
"chakru-r",
|
||||
"brock-acryl",
|
||||
"mminichino",
|
||||
"jayacryl"
|
||||
"jayacryl",
|
||||
"v-tarasevich-blitz-brain",
|
||||
"ryota-cloud",
|
||||
"annadoesdesign",
|
||||
"jmacryl",
|
||||
"esteban",
|
||||
"anthonyburdi",
|
||||
"ligfx"
|
||||
]'),
|
||||
github.actor
|
||||
)
|
||||
@ -63,8 +69,7 @@ jobs:
|
||||
community-contribution
|
||||
- uses: actions-ecosystem/action-add-labels@v1.1.3
|
||||
# only add names of champions here. Confirm with DevRel Team
|
||||
if:
|
||||
${{
|
||||
if: ${{
|
||||
contains(
|
||||
fromJson('[
|
||||
"siladitya2",
|
||||
|
3
.github/workflows/prefect-plugin.yml
vendored
@ -3,6 +3,7 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
- releases/**
|
||||
paths:
|
||||
- ".github/workflows/prefect-plugin.yml"
|
||||
- "metadata-ingestion-modules/prefect-plugin/**"
|
||||
@ -71,11 +72,13 @@ jobs:
|
||||
flags: ingestion-prefect-plugin
|
||||
name: pytest-prefect-${{ matrix.python-version }}
|
||||
verbose: true
|
||||
override_branch: ${{ github.head_ref || github.ref_name }}
|
||||
- name: Upload test results to Codecov
|
||||
if: ${{ !cancelled() }}
|
||||
uses: codecov/test-results-action@v1
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }}
|
||||
override_branch: ${{ github.head_ref || github.ref_name }}
|
||||
|
||||
event-file:
|
||||
runs-on: ubuntu-latest
|
||||
|
10
.github/workflows/publish-datahub-jars.yml
vendored
@ -27,8 +27,13 @@ jobs:
|
||||
env:
|
||||
SIGNING_KEY: ${{ secrets.SIGNING_KEY }}
|
||||
run: |
|
||||
echo "Enable publish: ${{ env.SIGNING_KEY != '' }}"
|
||||
if [[ "${{ github.repository }}" == "acryldata/datahub" ]]; then
|
||||
echo "Enable publish for main repository: ${{ env.SIGNING_KEY != '' }}"
|
||||
echo "publish=${{ env.SIGNING_KEY != '' }}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "Skipping publish for repository: ${{ github.repository }}"
|
||||
echo "publish=false" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
setup:
|
||||
if: startsWith(github.ref, 'refs/tags/v')
|
||||
runs-on: ubuntu-latest
|
||||
@ -39,6 +44,9 @@ jobs:
|
||||
uses: acryldata/sane-checkout-action@v3
|
||||
- name: Compute Tag
|
||||
id: tag
|
||||
env:
|
||||
GITHUB_REF_FALLBACK: ${{ github.ref }}
|
||||
GITHUB_EVENT_NAME: ${{ github.event_name }}
|
||||
run: |
|
||||
source .github/scripts/docker_helpers.sh
|
||||
TAG=$(echo ${GITHUB_REF} | sed -e 's,refs/tags/v,,g')
|
||||
|
4
.github/workflows/python-build-pages.yml
vendored
@ -6,16 +6,20 @@ on:
|
||||
paths:
|
||||
- ".github/workflows/python-build-pages.yml"
|
||||
- "metadata-ingestion/**"
|
||||
- "datahub-actions/**"
|
||||
- "metadata-ingestion-modules/**"
|
||||
- "metadata-models/**"
|
||||
- "python-build/**"
|
||||
pull_request:
|
||||
branches:
|
||||
- "**"
|
||||
paths:
|
||||
- ".github/workflows/python-build-pages.yml"
|
||||
- "metadata-ingestion/**"
|
||||
- "datahub-actions/**"
|
||||
- "metadata-ingestion-modules/**"
|
||||
- "metadata-models/**"
|
||||
- "python-build/**"
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
|
63
.github/workflows/react-cloudflare-pages.yml
vendored
Normal file
@ -0,0 +1,63 @@
|
||||
name: Frontend Preview
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths-ignore:
|
||||
- "docs/**"
|
||||
- "**.md"
|
||||
pull_request:
|
||||
branches:
|
||||
- "**"
|
||||
paths-ignore:
|
||||
- "docs/**"
|
||||
- "**.md"
|
||||
release:
|
||||
types: [published]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
setup:
|
||||
runs-on: ubuntu-22.04
|
||||
outputs:
|
||||
frontend_change: ${{ steps.ci-optimize.outputs.frontend-change == 'true' }}
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: acryldata/sane-checkout-action@v3
|
||||
- uses: ./.github/actions/ci-optimization
|
||||
id: ci-optimize
|
||||
|
||||
deploy:
|
||||
runs-on: ubuntu-22.04
|
||||
permissions:
|
||||
contents: read
|
||||
deployments: write
|
||||
timeout-minutes: 30
|
||||
needs: setup
|
||||
if: ${{ github.event.pull_request.head.repo.fork != 'true' }}
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: acryldata/sane-checkout-action@v3
|
||||
- name: Set up JDK 17
|
||||
uses: actions/setup-java@v4
|
||||
with:
|
||||
distribution: "zulu"
|
||||
java-version: 17
|
||||
- uses: gradle/gradle-build-action@v3
|
||||
- name: Gradle build for frontend
|
||||
if: ${{ needs.setup.outputs.frontend_change == 'true' }}
|
||||
run: |
|
||||
./gradlew :datahub-web-react:build -x test -x check --parallel
|
||||
- name: Publish
|
||||
if: ${{ needs.setup.outputs.frontend_change == 'true' }}
|
||||
uses: cloudflare/pages-action@1
|
||||
with:
|
||||
apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }}
|
||||
accountId: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }}
|
||||
projectName: datahub-project-web-react
|
||||
workingDirectory: datahub-web-react
|
||||
directory: dist
|
||||
gitHubToken: ${{ secrets.GITHUB_TOKEN }}
|
1
.github/workflows/spark-smoke-test.yml
vendored
@ -5,6 +5,7 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
- releases/**
|
||||
paths:
|
||||
- "metadata_models/**"
|
||||
- "metadata-integration/java/datahub-client/**"
|
||||
|
10
.github/workflows/test-results.yml
vendored
@ -2,7 +2,15 @@ name: Test Results
|
||||
|
||||
on:
|
||||
workflow_run:
|
||||
workflows: ["build & test", "metadata ingestion", "Airflow Plugin", "Dagster Plugin", "Prefect Plugin", "GX Plugin"]
|
||||
workflows:
|
||||
[
|
||||
"build & test",
|
||||
"metadata ingestion",
|
||||
"Airflow Plugin",
|
||||
"Dagster Plugin",
|
||||
"Prefect Plugin",
|
||||
"GX Plugin",
|
||||
]
|
||||
types:
|
||||
- completed
|
||||
|
||||
|
30
.github/workflows/yaml-format.yml
vendored
Normal file
@ -0,0 +1,30 @@
|
||||
name: yaml format
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths:
|
||||
- "**/*.{yml,yaml}"
|
||||
pull_request:
|
||||
branches:
|
||||
- "**"
|
||||
paths:
|
||||
- "**/*.{yml,yaml}"
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
yaml_format_check:
|
||||
name: yaml_format_check
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: acryldata/sane-checkout-action@v3
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
- name: run prettier --check
|
||||
run: |-
|
||||
./gradlew :datahub-web-react:githubActionsPrettierCheck
|
@ -1,9 +1,30 @@
|
||||
# Auto-generated by .github/scripts/generate_pre_commit.py at 2025-02-11 10:00:11 UTC
|
||||
# Auto-generated by .github/scripts/generate_pre_commit.py at 2025-07-01 10:36:31 UTC
|
||||
# Do not edit this file directly. Run the script to regenerate.
|
||||
# Add additional hooks in .github/scripts/pre-commit-override.yaml
|
||||
repos:
|
||||
- repo: local
|
||||
hooks:
|
||||
- id: datahub-web-react-mdPrettierWriteChanged
|
||||
name: mdPrettierWriteChanged
|
||||
entry: ./gradlew :datahub-web-react:mdPrettierWriteChanged
|
||||
language: system
|
||||
files: ^.*\.md$
|
||||
pass_filenames: false
|
||||
|
||||
- id: datahub-web-react-githubActionsPrettierWriteChanged
|
||||
name: githubActionsPrettierWriteChanged
|
||||
entry: ./gradlew :datahub-web-react:githubActionsPrettierWriteChanged
|
||||
language: system
|
||||
files: ^\.github/.*\.(yml|yaml)$
|
||||
pass_filenames: false
|
||||
|
||||
- id: datahub-actions-lint-fix
|
||||
name: datahub-actions Lint Fix
|
||||
entry: ./gradlew :datahub-actions:lintFix
|
||||
language: system
|
||||
files: ^datahub-actions/.*\.py$
|
||||
pass_filenames: false
|
||||
|
||||
- id: datahub-graphql-core-spotless
|
||||
name: datahub-graphql-core Spotless Apply
|
||||
entry: ./gradlew :datahub-graphql-core:spotlessApply
|
||||
@ -53,6 +74,13 @@ repos:
|
||||
files: ^metadata-dao-impl/kafka-producer/.*\.java$
|
||||
pass_filenames: false
|
||||
|
||||
- id: metadata-events-mxe-avro-spotless
|
||||
name: metadata-events/mxe-avro Spotless Apply
|
||||
entry: ./gradlew :metadata-events:mxe-avro:spotlessApply
|
||||
language: system
|
||||
files: ^metadata-events/mxe-avro/.*\.java$
|
||||
pass_filenames: false
|
||||
|
||||
- id: metadata-events-mxe-registration-spotless
|
||||
name: metadata-events/mxe-registration Spotless Apply
|
||||
entry: ./gradlew :metadata-events:mxe-registration:spotlessApply
|
||||
@ -291,6 +319,13 @@ repos:
|
||||
files: ^metadata-service/configuration/.*\.java$
|
||||
pass_filenames: false
|
||||
|
||||
- id: metadata-service-events-service-spotless
|
||||
name: metadata-service/events-service Spotless Apply
|
||||
entry: ./gradlew :metadata-service:events-service:spotlessApply
|
||||
language: system
|
||||
files: ^metadata-service/events-service/.*\.java$
|
||||
pass_filenames: false
|
||||
|
||||
- id: metadata-service-factories-spotless
|
||||
name: metadata-service/factories Spotless Apply
|
||||
entry: ./gradlew :metadata-service:factories:spotlessApply
|
||||
@ -458,3 +493,17 @@ repos:
|
||||
language: system
|
||||
files: ^smoke-test/tests/cypress/.*\.tsx$
|
||||
pass_filenames: false
|
||||
|
||||
- id: update-capability-summary
|
||||
name: update-capability-summary
|
||||
entry: ./gradlew :metadata-ingestion:capabilitySummary
|
||||
language: system
|
||||
files: ^metadata-ingestion/src/datahub/ingestion/source/.*\.py$
|
||||
pass_filenames: false
|
||||
|
||||
- id: update-lineage-file
|
||||
name: update-lineage-file
|
||||
entry: ./gradlew :metadata-ingestion:lineageGen
|
||||
language: system
|
||||
files: ^(metadata-ingestion-modules/.*|metadata-models/.*)$
|
||||
pass_filenames: false
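Assuming `pre-commit` itself is installed locally (e.g. via pip), these generated hooks can be exercised with the standard pre-commit CLI; the hook id below is taken from the config above:

```shell
# One-time setup in a local clone
pre-commit install

# Run every configured hook against the whole repo, not just staged files
pre-commit run --all-files

# Run a single hook by id, e.g. the capability summary regeneration
pre-commit run update-capability-summary --all-files
```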
|
||||
|
40
CLAUDE.MD
Normal file
@ -0,0 +1,40 @@
# CLAUDE.md

This file provides guidance to Claude Code (claude.ai/code) or any other agent when working with code in this repository.

## Coding conventions

- Keep code maintainable. This is not throw-away code. This goes to production.
- Generate unit tests where appropriate.
- Do not start generating random scripts to run the code you generated unless asked for.
- Do not add comments that are redundant given the function names.

## Core concept docs

- `docs/what/urn.md` defines what a URN is.

## Overall Directory structure

- This is the repository for the DataHub project.
- `README.MD` should give some basic information about the project.
- This is a multi-project Gradle build, so you will find a `build.gradle` in most folders.

### metadata-ingestion module details

- `metadata-ingestion` contains the source and tests for the DataHub OSS CLI.
- `metadata-ingestion/developing.md` contains details about the environment used for testing.
- `.github/workflows/metadata-ingestion.yml` contains the GitHub workflow that is used in CI.
- `metadata-ingestion/build.gradle` contains the gradle tasks defined for this module.
- `pyproject.toml`, `setup.py`, and `setup.cfg` in the folder contain the code-style rules for the repository.
- The `.md` files at the top level of this folder give you important information about the concepts of ingestion.
- You can see examples of how to define various aspect types in `metadata-ingestion/src/datahub/emitter/mcp_builder.py`.
- Source code goes in `metadata-ingestion/src/`.
- Tests go in `metadata-ingestion/tests/` (not in `src/`).
- **Testing conventions for metadata-ingestion** (see the sketch after this list):
  - Unit tests: `metadata-ingestion/tests/unit/`
  - Integration tests: `metadata-ingestion/tests/integration/`
  - Test files should mirror the source directory structure
  - Use pytest, not unittest
  - Use `assert` statements, not `self.assertEqual()` or `self.assertIsNone()`
  - Use regular classes, not `unittest.TestCase`
  - Import `pytest` in test files
  - Test files should be named `test_*.py` and placed in the appropriate test directory, not alongside source files
49
README.md
@ -18,7 +18,7 @@ export const Logo = (props) => {
|
||||
<!--
|
||||
HOSTED_DOCS_ONLY-->
|
||||
<p align="center">
|
||||
<a href="https://datahubproject.io">
|
||||
<a href="https://datahub.com">
|
||||
<img alt="DataHub" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/datahub-logo-color-mark.svg" height="150" />
|
||||
</a>
|
||||
</p>
|
||||
@ -26,7 +26,7 @@ HOSTED_DOCS_ONLY-->
|
||||
|
||||
# DataHub: The Data Discovery Platform for the Modern Data Stack
|
||||
|
||||
### Built with ❤️ by <img src="https://datahubproject.io/img/acryl-logo-light-mark.png" width="20"/> [Acryl Data](https://acryldata.io) and <img src="https://datahubproject.io/img/LI-In-Bug.png" width="20"/> [LinkedIn](https://engineering.linkedin.com)
|
||||
### Built with ❤️ by <img src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/datahub-logo-color-mark.svg" width="20"/> [DataHub](https://datahub.com) and <img src="https://docs.datahub.com/img/LI-In-Bug.png" width="20"/> [LinkedIn](https://engineering.linkedin.com)
|
||||
|
||||
<div>
|
||||
<a target="_blank" href="https://github.com/datahub-project/datahub/blob/master/LICENSE">
|
||||
@ -36,11 +36,11 @@ HOSTED_DOCS_ONLY-->
|
||||
<a target="_blank" href="https://github.com/datahub-project/datahub/pulse">
|
||||
<img alt="GitHub commit activity" src="https://img.shields.io/github/commit-activity/m/datahub-project/datahub?label=commits&labelColor=133554&color=1890ff" /></a>
|
||||
<br />
|
||||
<a target="_blank" href="https://pages.acryl.io/slack?utm_source=github&utm_medium=readme&utm_campaign=github_readme">
|
||||
<a target="_blank" href="https://datahub.com/slack?utm_source=github&utm_medium=readme&utm_campaign=github_readme">
|
||||
<img alt="Slack" src="https://img.shields.io/badge/slack-join_community-red.svg?logo=slack&labelColor=133554&color=1890ff" /></a>
|
||||
<a href="https://www.youtube.com/channel/UC3qFQC5IiwR5fvWEqi_tJ5w">
|
||||
<img alt="YouTube" src="https://img.shields.io/youtube/channel/subscribers/UC3qFQC5IiwR5fvWEqi_tJ5w?style=flat&logo=youtube&label=subscribers&labelColor=133554&color=1890ff"/></a>
|
||||
<a href="https://blog.datahubproject.io/">
|
||||
<a href="https://medium.com/datahub-project/">
|
||||
<img alt="Medium" src="https://img.shields.io/badge/blog-DataHub-red.svg?style=flat&logo=medium&logoColor=white&labelColor=133554&color=1890ff" /></a>
|
||||
<a href="https://x.com/datahubproject">
|
||||
<img alt="X (formerly Twitter) Follow" src="https://img.shields.io/badge/follow-datahubproject-red.svg?style=flat&logo=x&labelColor=133554&color=1890ff" /></a>
|
||||
@ -48,26 +48,26 @@ HOSTED_DOCS_ONLY-->
|
||||
|
||||
---
|
||||
|
||||
### 🏠 Docs: [datahubproject.io](https://datahubproject.io/docs)
|
||||
### 🏠 Docs: [docs.datahub.com](https://docs.datahub.com/)
|
||||
|
||||
[Quickstart](https://datahubproject.io/docs/quickstart) |
|
||||
[Features](https://datahubproject.io/docs/) |
|
||||
[Quickstart](https://docs.datahub.com/docs/quickstart) |
|
||||
[Features](https://docs.datahub.com/docs/features) |
|
||||
[Roadmap](https://feature-requests.datahubproject.io/roadmap) |
|
||||
[Adoption](#adoption) |
|
||||
[Demo](https://demo.datahubproject.io/) |
|
||||
[Town Hall](https://datahubproject.io/docs/townhalls)
|
||||
[Demo](https://demo.datahub.com/) |
|
||||
[Town Hall](https://docs.datahub.com/docs/townhalls)
|
||||
|
||||
---
|
||||
|
||||
> 📣 DataHub Town Hall is the 4th Thursday at 9am US PT of every month - [add it to your calendar!](https://rsvp.datahubproject.io/)
|
||||
> 📣 DataHub Town Hall is the 4th Thursday at 9am US PT of every month - [add it to your calendar!](https://lu.ma/datahubevents/)
|
||||
>
|
||||
> - Town-hall Zoom link: [zoom.datahubproject.io](https://zoom.datahubproject.io)
|
||||
> - [Meeting details](docs/townhalls.md) & [past recordings](docs/townhall-history.md)
|
||||
|
||||
> ✨ DataHub Community Highlights:
|
||||
>
|
||||
> - Read our Monthly Project Updates [here](https://blog.datahubproject.io/tagged/project-updates).
|
||||
> - Bringing The Power Of The DataHub Real-Time Metadata Graph To Everyone At Acryl Data: [Data Engineering Podcast](https://www.dataengineeringpodcast.com/acryl-data-datahub-metadata-graph-episode-230/)
|
||||
> - Read our Monthly Project Updates [here](https://medium.com/datahub-project/tagged/project-updates).
|
||||
> - Bringing The Power Of The DataHub Real-Time Metadata Graph To Everyone At DataHub: [Data Engineering Podcast](https://www.dataengineeringpodcast.com/acryl-data-datahub-metadata-graph-episode-230/)
|
||||
> - Check out our most-read blog post, [DataHub: Popular Metadata Architectures Explained](https://engineering.linkedin.com/blog/2020/datahub-popular-metadata-architectures-explained) @ LinkedIn Engineering Blog.
|
||||
> - Join us on [Slack](docs/slack.md)! Ask questions and keep up with the latest announcements.
|
||||
|
||||
@ -82,18 +82,18 @@ Check out DataHub's [Features](docs/features.md) & [Roadmap](https://feature-req
|
||||
|
||||
## Demo and Screenshots
|
||||
|
||||
There's a [hosted demo environment](https://demo.datahubproject.io/) courtesy of [Acryl Data](https://acryldata.io) where you can explore DataHub without installing it locally.
|
||||
There's a [hosted demo environment](https://demo.datahub.com/) courtesy of DataHub where you can explore DataHub without installing it locally.
|
||||
|
||||
## Quickstart
|
||||
|
||||
Please follow the [DataHub Quickstart Guide](https://datahubproject.io/docs/quickstart) to run DataHub locally using [Docker](https://docker.com).
|
||||
Please follow the [DataHub Quickstart Guide](https://docs.datahub.com/docs/quickstart) to run DataHub locally using [Docker](https://docker.com).
|
||||
|
||||
## Development
|
||||
|
||||
If you're looking to build & modify datahub please take a look at our [Development Guide](https://datahubproject.io/docs/developers).
|
||||
If you're looking to build & modify datahub please take a look at our [Development Guide](https://docs.datahub.com/docs/developers).
|
||||
|
||||
<p align="center">
|
||||
<a href="https://demo.datahubproject.io/">
|
||||
<a href="https://demo.datahub.com/">
|
||||
<img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/entity.png"/>
|
||||
</a>
|
||||
</p>
|
||||
@ -102,11 +102,12 @@ If you're looking to build & modify datahub please take a look at our [Developme
|
||||
|
||||
- [datahub-project/datahub](https://github.com/datahub-project/datahub): This repository contains the complete source code for DataHub's metadata model, metadata services, integration connectors and the web application.
|
||||
- [acryldata/datahub-actions](https://github.com/acryldata/datahub-actions): DataHub Actions is a framework for responding to changes to your DataHub Metadata Graph in real time.
|
||||
- [acryldata/datahub-helm](https://github.com/acryldata/datahub-helm): Repository of helm charts for deploying DataHub on a Kubernetes cluster
|
||||
- [acryldata/meta-world](https://github.com/acryldata/meta-world): A repository to store recipes, custom sources, transformations and other things to make your DataHub experience magical
|
||||
- [dbt-impact-action](https://github.com/acryldata/dbt-impact-action) : This repository contains a github action for commenting on your PRs with a summary of the impact of changes within a dbt project
|
||||
- [datahub-tools](https://github.com/makenotion/datahub-tools) : Additional python tools to interact with the DataHub GraphQL endpoints, built by Notion
|
||||
- [business-glossary-sync-action](https://github.com/acryldata/business-glossary-sync-action) : This repository contains a github action that opens PRs to update your business glossary yaml file.
|
||||
- [acryldata/datahub-helm](https://github.com/acryldata/datahub-helm): Helm charts for deploying DataHub on a Kubernetes cluster
|
||||
- [acryldata/meta-world](https://github.com/acryldata/meta-world): A repository to store recipes, custom sources, transformations and other things to make your DataHub experience magical.
|
||||
- [dbt-impact-action](https://github.com/acryldata/dbt-impact-action): A github action for commenting on your PRs with a summary of the impact of changes within a dbt project.
|
||||
- [datahub-tools](https://github.com/makenotion/datahub-tools): Additional python tools to interact with the DataHub GraphQL endpoints, built by Notion.
|
||||
- [business-glossary-sync-action](https://github.com/acryldata/business-glossary-sync-action): A github action that opens PRs to update your business glossary yaml file.
|
||||
- [mcp-server-datahub](https://github.com/acryldata/mcp-server-datahub): A [Model Context Protocol](https://modelcontextprotocol.io/) server implementation for DataHub.
|
||||
|
||||
## Releases
|
||||
|
||||
@ -118,7 +119,7 @@ We welcome contributions from the community. Please refer to our [Contributing G
|
||||
|
||||
## Community
|
||||
|
||||
Join our [Slack workspace](https://pages.acryl.io/slack?utm_source=github&utm_medium=readme&utm_campaign=github_readme) for discussions and important announcements. You can also find out more about our upcoming [town hall meetings](docs/townhalls.md) and view past recordings.
|
||||
Join our [Slack workspace](https://datahub.com/slack?utm_source=github&utm_medium=readme&utm_campaign=github_readme) for discussions and important announcements. You can also find out more about our upcoming [town hall meetings](docs/townhalls.md) and view past recordings.
|
||||
|
||||
## Security
|
||||
|
||||
@ -173,11 +174,11 @@ Here are the companies that have officially adopted DataHub. Please feel free to
|
||||
|
||||
## Select Articles & Talks
|
||||
|
||||
- [DataHub Blog](https://blog.datahubproject.io/)
|
||||
- [DataHub Blog](https://medium.com/datahub-project/)
|
||||
- [DataHub YouTube Channel](https://www.youtube.com/channel/UC3qFQC5IiwR5fvWEqi_tJ5w)
|
||||
- [Optum: Data Mesh via DataHub](https://opensource.optum.com/blog/2022/03/23/data-mesh-via-datahub)
|
||||
- [Saxo Bank: Enabling Data Discovery in Data Mesh](https://medium.com/datahub-project/enabling-data-discovery-in-a-data-mesh-the-saxo-journey-451b06969c8f)
|
||||
- [Bringing The Power Of The DataHub Real-Time Metadata Graph To Everyone At Acryl Data](https://www.dataengineeringpodcast.com/acryl-data-datahub-metadata-graph-episode-230/)
|
||||
- [Bringing The Power Of The DataHub Real-Time Metadata Graph To Everyone At DataHub](https://www.dataengineeringpodcast.com/acryl-data-datahub-metadata-graph-episode-230/)
|
||||
- [DataHub: Popular Metadata Architectures Explained](https://engineering.linkedin.com/blog/2020/datahub-popular-metadata-architectures-explained)
|
||||
- [Driving DataOps Culture with LinkedIn DataHub](https://www.youtube.com/watch?v=ccsIKK9nVxk) @ [DataOps Unleashed 2021](https://dataopsunleashed.com/#shirshanka-session)
|
||||
- [The evolution of metadata: LinkedIn’s story](https://speakerdeck.com/shirshanka/the-evolution-of-metadata-linkedins-journey-strata-nyc-2019) @ [Strata Data Conference 2019](https://conferences.oreilly.com/strata/strata-ny-2019.html)
|
||||
|
@ -1,6 +1,6 @@
|
||||
# Reporting Security Issues
|
||||
|
||||
If you think you have found a security vulnerability, please send a report to security@datahubproject.io. This address can be used for all of Acryl Data’s open source and commercial products (including but not limited to DataHub and Acryl Data). We can accept only vulnerability reports at this address.
|
||||
If you think you have found a security vulnerability, please send a report to security@datahubproject.io. This address can be used for all of DataHub’s open source and commercial products (including but not limited to DataHub Core and DataHub Cloud). We can accept only vulnerability reports at this address.
|
||||
|
||||
It's not mandatory, but if you'd like to encrypt your message to us; please use our PGP key. The key fingerprint is:
|
||||
|
||||
@ -8,9 +8,9 @@ A50B10A86CC21F4B7BE102E170764C95B4FACEBF
|
||||
|
||||
The key is available from [keyserver.ubuntu.com](https://keyserver.ubuntu.com/pks/lookup?search=A50B10A86CC21F4B7BE102E170764C95B4FACEBF&fingerprint=on&op=index).
|
||||
|
||||
Acryl Data will send you a response indicating the next steps in handling your report. After the initial reply to your report, the security team will keep you informed of the progress towards a fix and full announcement, and may ask for additional information or guidance.
|
||||
DataHub will send you a response indicating the next steps in handling your report. After the initial reply to your report, the security team will keep you informed of the progress towards a fix and full announcement, and may ask for additional information or guidance.
|
||||
|
||||
**Important:** We ask you to not disclose the vulnerability before it have been fixed and announced, unless you received a response from the Acryl Data security team that you can do so.
|
||||
**Important:** We ask you not to disclose the vulnerability before it has been fixed and announced, unless you have received a response from the DataHub security team that you can do so.
|
||||
|
||||
## Security announcements
|
||||
|
||||
|
154
build.gradle
@ -1,3 +1,6 @@
|
||||
import org.apache.tools.ant.filters.ReplaceTokens
|
||||
|
||||
|
||||
buildscript {
|
||||
ext.jdkVersionDefault = 17
|
||||
ext.javaClassVersionDefault = 11
|
||||
@ -32,36 +35,37 @@ buildscript {
|
||||
|
||||
ext.junitJupiterVersion = '5.6.1'
|
||||
// Releases: https://github.com/linkedin/rest.li/blob/master/CHANGELOG.md
|
||||
ext.pegasusVersion = '29.57.0'
|
||||
ext.pegasusVersion = '29.65.7'
|
||||
ext.mavenVersion = '3.6.3'
|
||||
ext.versionGradle = '8.11.1'
|
||||
ext.springVersion = '6.1.14'
|
||||
ext.springBootVersion = '3.2.9'
|
||||
ext.springKafkaVersion = '3.1.6'
|
||||
ext.openTelemetryVersion = '1.45.0'
|
||||
ext.springVersion = '6.2.5'
|
||||
ext.springBootVersion = '3.4.5'
|
||||
ext.springKafkaVersion = '3.3.6'
|
||||
ext.openTelemetryVersion = '1.49.0'
|
||||
ext.neo4jVersion = '5.20.0'
|
||||
ext.neo4jTestVersion = '5.20.0'
|
||||
ext.neo4jApocVersion = '5.20.0'
|
||||
ext.testContainersVersion = '1.17.4'
|
||||
ext.testContainersVersion = '1.21.1'
|
||||
ext.elasticsearchVersion = '2.11.1' // ES 7.10, Opensearch 1.x, 2.x
|
||||
ext.jacksonVersion = '2.15.3'
|
||||
ext.jettyVersion = '12.0.16'
|
||||
ext.jacksonVersion = '2.18.4'
|
||||
ext.jettyVersion = '12.0.21'
|
||||
// see also datahub-frontend/play.gradle
|
||||
ext.playVersion = '2.8.22'
|
||||
ext.playScalaVersion = '2.13'
|
||||
ext.akkaVersion = '2.6.21' // 2.7.0+ has incompatible license
|
||||
ext.log4jVersion = '2.23.1'
|
||||
ext.slf4jVersion = '1.7.36'
|
||||
ext.logbackClassic = '1.4.14'
|
||||
ext.logbackClassic = '1.5.18'
|
||||
ext.hadoop3Version = '3.3.6'
|
||||
ext.kafkaVersion = '5.5.15'
|
||||
ext.kafkaVersion = '8.0.0'
|
||||
ext.hazelcastVersion = '5.3.6'
|
||||
ext.ebeanVersion = '15.5.2'
|
||||
ext.googleJavaFormatVersion = '1.18.1'
|
||||
ext.openLineageVersion = '1.25.0'
|
||||
ext.logbackClassicJava8 = '1.2.12'
|
||||
ext.awsSdk2Version = '2.30.33'
|
||||
|
||||
ext.docker_registry = 'acryldata'
|
||||
ext.docker_registry = project.getProperties().getOrDefault("dockerRegistry", 'acryldata')
|
||||
|
||||
apply from: './repositories.gradle'
|
||||
buildscript.repositories.addAll(project.repositories)
|
||||
@ -81,13 +85,13 @@ plugins {
|
||||
id 'com.gorylenko.gradle-git-properties' version '2.4.1'
|
||||
id 'com.gradleup.shadow' version '8.3.5' apply false
|
||||
id 'com.palantir.docker' version '0.35.0' apply false
|
||||
id 'com.avast.gradle.docker-compose' version '0.17.6'
|
||||
id 'com.avast.gradle.docker-compose' version '0.17.12'
|
||||
id "com.diffplug.spotless" version "6.23.3"
|
||||
// https://blog.ltgt.net/javax-jakarta-mess-and-gradle-solution/
|
||||
// TODO id "org.gradlex.java-ecosystem-capabilities" version "1.0"
|
||||
}
|
||||
|
||||
apply from: "gradle/docker/docker.gradle"
|
||||
apply from: "gradle/docker/docker-utils.gradle"
|
||||
|
||||
project.ext.spec = [
|
||||
'product' : [
|
||||
@ -108,24 +112,26 @@ project.ext.spec = [
|
||||
|
||||
project.ext.externalDependency = [
|
||||
'akkaHttp': "com.typesafe.akka:akka-http-core_$playScalaVersion:10.2.10", // max version due to licensing
|
||||
'akkaParsing': "com.typesafe.akka:akka-parsing_$playScalaVersion:10.2.10", // akka-parsing is part of akka-http, so use akka http version
|
||||
'akkaActor': "com.typesafe.akka:akka-actor_$playScalaVersion:$akkaVersion",
|
||||
'akkaStream': "com.typesafe.akka:akka-stream_$playScalaVersion:$akkaVersion",
|
||||
'akkaActorTyped': "com.typesafe.akka:akka-actor-typed_$playScalaVersion:$akkaVersion",
|
||||
'akkaSlf4j': "com.typesafe.akka:akka-slf4j_$playScalaVersion:$akkaVersion",
|
||||
'akkaJackson': "com.typesafe.akka:akka-serialization-jackson_$playScalaVersion:$akkaVersion",
|
||||
'akkaParsing': "com.typesafe.akka:akka-parsing_$playScalaVersion:$akkaVersion",
|
||||
'akkaProtobuf': "com.typesafe.akka:akka-protobuf-v3_$playScalaVersion:$akkaVersion",
|
||||
'antlr4Runtime': 'org.antlr:antlr4-runtime:4.9.3',
|
||||
'antlr4': 'org.antlr:antlr4:4.9.3',
|
||||
'assertJ': 'org.assertj:assertj-core:3.11.1',
|
||||
'avro': 'org.apache.avro:avro:1.11.4',
|
||||
'avroCompiler': 'org.apache.avro:avro-compiler:1.11.4',
|
||||
'awsGlueSchemaRegistrySerde': 'software.amazon.glue:schema-registry-serde:1.1.17',
|
||||
'awsMskIamAuth': 'software.amazon.msk:aws-msk-iam-auth:2.0.3',
|
||||
'awsS3': 'software.amazon.awssdk:s3:2.26.21',
|
||||
'awsSecretsManagerJdbc': 'com.amazonaws.secretsmanager:aws-secretsmanager-jdbc:1.0.13',
|
||||
'awsPostgresIamAuth': 'software.amazon.jdbc:aws-advanced-jdbc-wrapper:1.0.2',
|
||||
'awsRds':'software.amazon.awssdk:rds:2.18.24',
|
||||
'awsGlueSchemaRegistrySerde': 'software.amazon.glue:schema-registry-serde:1.1.23',
|
||||
'awsMskIamAuth': 'software.amazon.msk:aws-msk-iam-auth:2.3.0',
|
||||
'awsS3': "software.amazon.awssdk:s3:$awsSdk2Version",
|
||||
'awsSecretsManagerJdbc': 'com.amazonaws.secretsmanager:aws-secretsmanager-jdbc:1.0.15',
|
||||
'awsPostgresIamAuth': 'software.amazon.jdbc:aws-advanced-jdbc-wrapper:2.5.4',
|
||||
'awsRds':"software.amazon.awssdk:rds:$awsSdk2Version",
|
||||
'azureIdentityExtensions': 'com.azure:azure-identity-extensions:1.2.2',
|
||||
'azureIdentity': 'com.azure:azure-identity:1.15.4',
|
||||
'cacheApi': 'javax.cache:cache-api:1.1.0',
|
||||
'commonsCli': 'commons-cli:commons-cli:1.5.0',
|
||||
'commonsIo': 'commons-io:commons-io:2.17.0',
|
||||
@ -137,7 +143,8 @@ project.ext.externalDependency = [
|
||||
'datastaxOssCore': 'com.datastax.oss:java-driver-core:4.14.1',
|
||||
'datastaxOssQueryBuilder': 'com.datastax.oss:java-driver-query-builder:4.14.1',
|
||||
'dgraph4j' : 'io.dgraph:dgraph4j:24.1.1',
|
||||
'dgraphNetty': 'io.grpc:grpc-netty-shaded:1.69.0',
|
||||
'dgraphNetty': 'io.grpc:grpc-netty:1.71.0',
|
||||
'dgraphShadedNetty': 'io.grpc:grpc-netty-shaded:1.71.0',
|
||||
'dropwizardMetricsCore': 'io.dropwizard.metrics:metrics-core:4.2.3',
|
||||
'dropwizardMetricsJmx': 'io.dropwizard.metrics:metrics-jmx:4.2.3',
|
||||
'ebean': 'io.ebean:ebean:' + ebeanVersion,
|
||||
@ -146,11 +153,10 @@ project.ext.externalDependency = [
|
||||
'ebeanDdl': 'io.ebean:ebean-ddl-generator:' + ebeanVersion,
|
||||
'ebeanQueryBean': 'io.ebean:querybean-generator:' + ebeanVersion,
|
||||
'elasticSearchRest': 'org.opensearch.client:opensearch-rest-high-level-client:' + elasticsearchVersion,
|
||||
'elasticSearchJava': 'org.opensearch.client:opensearch-java:2.6.0',
|
||||
'findbugsAnnotations': 'com.google.code.findbugs:annotations:3.0.1',
|
||||
'graphqlJava': 'com.graphql-java:graphql-java:21.5',
|
||||
'graphqlJavaScalars': 'com.graphql-java:graphql-java-extended-scalars:21.0',
|
||||
'gson': 'com.google.code.gson:gson:2.8.9',
|
||||
'gson': 'com.google.code.gson:gson:2.12.0',
|
||||
'guice': 'com.google.inject:guice:7.0.0',
|
||||
'guicePlay': 'com.google.inject:guice:5.0.1', // Used for frontend while still on old Play version
|
||||
'guava': 'com.google.guava:guava:32.1.3-jre',
|
||||
@ -163,13 +169,17 @@ project.ext.externalDependency = [
|
||||
'hazelcastSpring':"com.hazelcast:hazelcast-spring:$hazelcastVersion",
|
||||
'hazelcastTest':"com.hazelcast:hazelcast:$hazelcastVersion:tests",
|
||||
'hibernateCore': 'org.hibernate:hibernate-core:5.2.16.Final',
|
||||
'httpClient': 'org.apache.httpcomponents.client5:httpclient5:5.3',
|
||||
'httpClient': 'org.apache.httpcomponents.client5:httpclient5:5.4.3',
|
||||
'iStackCommons': 'com.sun.istack:istack-commons-runtime:4.0.1',
|
||||
'jacksonJDK8': "com.fasterxml.jackson.datatype:jackson-datatype-jdk8:$jacksonVersion",
|
||||
'jacksonDataPropertyFormat': "com.fasterxml.jackson.dataformat:jackson-dataformat-properties:$jacksonVersion",
|
||||
'jacksonCore': "com.fasterxml.jackson.core:jackson-core:$jacksonVersion",
|
||||
'jacksonDataBind': "com.fasterxml.jackson.core:jackson-databind:$jacksonVersion",
|
||||
'jacksonDataFormatYaml': "com.fasterxml.jackson.dataformat:jackson-dataformat-yaml:$jacksonVersion",
|
||||
// The jacksonBom controls the version of other jackson modules, pin the version once
|
||||
// implementation platform(externalDependency.jacksonBom)
|
||||
'jacksonBom': "com.fasterxml.jackson:jackson-bom:$jacksonVersion",
|
||||
'jacksonJDK8': 'com.fasterxml.jackson.datatype:jackson-datatype-jdk8',
|
||||
'jacksonDataPropertyFormat': 'com.fasterxml.jackson.dataformat:jackson-dataformat-properties',
|
||||
'jacksonCore': 'com.fasterxml.jackson.core:jackson-core',
|
||||
'jacksonDataBind': 'com.fasterxml.jackson.core:jackson-databind',
|
||||
'jacksonJsr310': 'com.fasterxml.jackson.datatype:jackson-datatype-jsr310',
|
||||
'jacksonDataFormatYaml': 'com.fasterxml.jackson.dataformat:jackson-dataformat-yaml',
|
||||
'woodstoxCore': 'com.fasterxml.woodstox:woodstox-core:6.4.0',
|
||||
'javatuples': 'org.javatuples:javatuples:1.2',
|
||||
'javaxInject' : 'javax.inject:javax.inject:1',
|
||||
@ -197,7 +207,7 @@ project.ext.externalDependency = [
|
||||
'kafkaAvroSerde': "io.confluent:kafka-streams-avro-serde:$kafkaVersion",
|
||||
'kafkaAvroSerializer': "io.confluent:kafka-avro-serializer:$kafkaVersion",
|
||||
'kafkaClients': "org.apache.kafka:kafka-clients:$kafkaVersion-ccs",
|
||||
'snappy': 'org.xerial.snappy:snappy-java:1.1.10.5',
|
||||
'snappy': 'org.xerial.snappy:snappy-java:1.1.10.7',
|
||||
'logbackClassic': "ch.qos.logback:logback-classic:$logbackClassic",
|
||||
'logbackClassicJava8' : "ch.qos.logback:logback-classic:$logbackClassicJava8",
|
||||
'slf4jApi': "org.slf4j:slf4j-api:$slf4jVersion",
|
||||
@ -222,10 +232,14 @@ project.ext.externalDependency = [
|
||||
'opentelemetryApi': 'io.opentelemetry:opentelemetry-api:' + openTelemetryVersion,
|
||||
'opentelemetrySdk': 'io.opentelemetry:opentelemetry-sdk:' + openTelemetryVersion,
|
||||
'opentelemetrySdkTrace': 'io.opentelemetry:opentelemetry-sdk-trace:' + openTelemetryVersion,
|
||||
'opentelemetrySdkMetrics': 'io.opentelemetry:opentelemetry-sdk-metrics:' + openTelemetryVersion,
|
||||
'opentelemetryAutoConfig': 'io.opentelemetry:opentelemetry-sdk-extension-autoconfigure:' + openTelemetryVersion,
|
||||
'opentelemetryAnnotations': 'io.opentelemetry.instrumentation:opentelemetry-instrumentation-annotations:2.11.0',
|
||||
'opentelemetryExporter': 'io.opentelemetry:opentelemetry-exporter-otlp:' + openTelemetryVersion,
|
||||
'openTelemetryExporterLogging': 'io.opentelemetry:opentelemetry-exporter-logging:' + openTelemetryVersion,
|
||||
'openTelemetryExporterCommon': 'io.opentelemetry:opentelemetry-exporter-otlp-common:' + openTelemetryVersion,
|
||||
'opentelemetryAnnotations': 'io.opentelemetry.instrumentation:opentelemetry-instrumentation-annotations:2.15.0',
|
||||
'opentracingJdbc':'io.opentracing.contrib:opentracing-jdbc:0.2.15',
|
||||
'parquet': 'org.apache.parquet:parquet-avro:1.12.3',
|
||||
'parquet': 'org.apache.parquet:parquet-avro:1.15.2',
|
||||
'parquetHadoop': 'org.apache.parquet:parquet-hadoop:1.13.1',
|
||||
'picocli': 'info.picocli:picocli:4.5.0',
|
||||
'playCache': "com.typesafe.play:play-cache_$playScalaVersion:$playVersion",
|
||||
@ -240,11 +254,11 @@ project.ext.externalDependency = [
|
||||
'playFilters': "com.typesafe.play:filters-helpers_$playScalaVersion:$playVersion",
|
||||
'pac4j': 'org.pac4j:pac4j-oidc:6.0.6',
|
||||
'playPac4j': "org.pac4j:play-pac4j_$playScalaVersion:12.0.0-PLAY2.8",
|
||||
'postgresql': 'org.postgresql:postgresql:42.7.4',
|
||||
'postgresql': 'org.postgresql:postgresql:42.7.7',
|
||||
'protobuf': 'com.google.protobuf:protobuf-java:3.25.5',
|
||||
'grpcProtobuf': 'io.grpc:grpc-protobuf:1.53.0',
|
||||
'rangerCommons': 'org.apache.ranger:ranger-plugins-common:2.3.0',
|
||||
'reflections': 'org.reflections:reflections:0.9.9',
|
||||
'reflections': 'org.reflections:reflections:0.9.12',
|
||||
'resilience4j': 'io.github.resilience4j:resilience4j-retry:1.7.1',
|
||||
'rythmEngine': 'org.rythmengine:rythm-engine:1.3.0',
|
||||
'servletApi': 'jakarta.servlet:jakarta.servlet-api:6.0.0',
|
||||
@ -255,7 +269,7 @@ project.ext.externalDependency = [
|
||||
'springBeans': "org.springframework:spring-beans:$springVersion",
|
||||
'springContext': "org.springframework:spring-context:$springVersion",
|
||||
'springCore': "org.springframework:spring-core:$springVersion",
|
||||
'springDocUI': 'org.springdoc:springdoc-openapi-starter-webmvc-ui:2.3.0',
|
||||
'springDocUI': 'org.springdoc:springdoc-openapi-starter-webmvc-ui:2.8.9',
|
||||
'springJdbc': "org.springframework:spring-jdbc:$springVersion",
|
||||
'springWeb': "org.springframework:spring-web:$springVersion",
|
||||
'springWebMVC': "org.springframework:spring-webmvc:$springVersion",
|
||||
@ -268,10 +282,11 @@ project.ext.externalDependency = [
|
||||
'springBootStarterValidation': "org.springframework.boot:spring-boot-starter-validation:$springBootVersion",
|
||||
'springKafka': "org.springframework.kafka:spring-kafka:$springKafkaVersion",
|
||||
'springActuator': "org.springframework.boot:spring-boot-starter-actuator:$springBootVersion",
|
||||
'springRetry': "org.springframework.retry:spring-retry:2.0.6",
|
||||
'swaggerAnnotations': 'io.swagger.core.v3:swagger-annotations:2.2.15',
|
||||
'springRetry': "org.springframework.retry:spring-retry:2.0.11",
|
||||
'swaggerAnnotations': 'io.swagger.core.v3:swagger-annotations:2.2.30',
|
||||
'swaggerCli': 'io.swagger.codegen.v3:swagger-codegen-cli:3.0.46',
|
||||
'swaggerCore': 'io.swagger.core.v3:swagger-core:2.2.7',
|
||||
'swaggerCore': 'io.swagger.core.v3:swagger-core:2.2.30',
|
||||
'swaggerParser': 'io.swagger.parser.v3:swagger-parser:2.1.27',
|
||||
'springBootAutoconfigureJdk11': 'org.springframework.boot:spring-boot-autoconfigure:2.7.18',
|
||||
'testng': 'org.testng:testng:7.8.0',
|
||||
'testContainers': 'org.testcontainers:testcontainers:' + testContainersVersion,
|
||||
@ -280,7 +295,7 @@ project.ext.externalDependency = [
|
||||
'testContainersElasticsearch': 'org.testcontainers:elasticsearch:' + testContainersVersion,
|
||||
'testContainersCassandra': 'org.testcontainers:cassandra:' + testContainersVersion,
|
||||
'testContainersKafka': 'org.testcontainers:kafka:' + testContainersVersion,
|
||||
'testContainersOpenSearch': 'org.opensearch:opensearch-testcontainers:2.0.0',
|
||||
'testContainersOpenSearch': 'org.opensearch:opensearch-testcontainers:2.1.3',
|
||||
'typesafeConfig':'com.typesafe:config:1.4.1',
|
||||
'wiremock':'com.github.tomakehurst:wiremock:2.10.0',
|
||||
'zookeeper': 'org.apache.zookeeper:zookeeper:3.8.4',
|
||||
@ -381,6 +396,12 @@ configure(subprojects.findAll {! it.name.startsWith('spark-lineage')}) {
|
||||
exclude group: "org.slf4j", module: "slf4j-ext"
|
||||
exclude group: "org.codehaus.jackson", module: "jackson-mapper-asl"
|
||||
exclude group: "javax.mail", module: "mail"
|
||||
exclude group: 'org.glassfish', module: 'javax.json'
|
||||
exclude group: 'org.glassfish', module: 'jakarta.json'
|
||||
|
||||
// Tomcat excluded for jetty
|
||||
exclude group: 'org.apache.tomcat.embed', module: 'tomcat-embed-el'
|
||||
exclude group: 'org.springframework.boot', module: 'spring-boot-starter-tomcat'
|
||||
|
||||
resolutionStrategy.force externalDependency.antlr4Runtime
|
||||
resolutionStrategy.force externalDependency.antlr4
|
||||
@ -395,25 +416,56 @@ configure(subprojects.findAll {! it.name.startsWith('spark-lineage')}) {
|
||||
}
|
||||
}
|
||||
|
||||
subprojects {
|
||||
|
||||
apply plugin: 'maven-publish'
|
||||
apply plugin: 'com.gorylenko.gradle-git-properties'
|
||||
apply plugin: 'com.diffplug.spotless'
|
||||
|
||||
gitProperties {
|
||||
keys = ['git.commit.id','git.commit.id.describe','git.commit.time']
|
||||
// using any tags (not limited to annotated tags) for "git.commit.id.describe" property
|
||||
// see http://ajoberstar.org/grgit/grgit-describe.html for more info about the describe method and available parameters
|
||||
// 'it' is an instance of org.ajoberstar.grgit.Grgit
|
||||
customProperty 'git.commit.id.describe', { it.describe(tags: true) }
|
||||
gitPropertiesResourceDir = rootProject.buildDir
|
||||
failOnNoGitDirectory = false
|
||||
}
|
||||
|
||||
def gitPropertiesGenerated = false
|
||||
|
||||
apply from: 'gradle/versioning/versioning-global.gradle'
|
||||
|
||||
tasks.register("generateGitPropertiesGlobal", com.gorylenko.GenerateGitPropertiesTask) {
|
||||
doFirst {
|
||||
if (!gitPropertiesGenerated) {
|
||||
println "Generating git.properties"
|
||||
gitPropertiesGenerated = true
|
||||
} else {
|
||||
// Skip actual execution if already run
|
||||
onlyIf { false }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
subprojects {
|
||||
|
||||
apply plugin: 'maven-publish'
|
||||
apply plugin: 'com.diffplug.spotless'
|
||||
|
||||
def gitPropertiesTask = tasks.register("copyGitProperties", Copy) {
|
||||
dependsOn rootProject.tasks.named("generateGitPropertiesGlobal")
|
||||
def sourceFile = file("${rootProject.buildDir}/git.properties")
|
||||
from sourceFile
|
||||
into "$project.buildDir/resources/main"
|
||||
}
|
||||
|
||||
plugins.withType(JavaPlugin).configureEach {
|
||||
project.tasks.named(JavaPlugin.CLASSES_TASK_NAME).configure{
|
||||
dependsOn gitPropertiesTask
|
||||
}
|
||||
if (project.name == 'datahub-web-react') {
|
||||
return
|
||||
}
|
||||
/* TODO: evaluate ignoring jar timestamps for increased caching (compares checksum instead)
|
||||
jar {
|
||||
preserveFileTimestamps = false
|
||||
}*/
|
||||
|
||||
dependencies {
|
||||
implementation externalDependency.annotationApi
|
||||
@ -517,3 +569,17 @@ wrapper {
|
||||
gradleVersion = project.versionGradle
|
||||
distributionType = Wrapper.DistributionType.ALL
|
||||
}
|
||||
|
||||
tasks.register('format') {
|
||||
dependsOn(':datahub-web-react:graphqlPrettierWrite')
|
||||
dependsOn(':datahub-web-react:githubActionsPrettierWrite')
|
||||
dependsOn(':datahub-web-react:mdPrettierWrite')
|
||||
dependsOn('spotlessApply')
|
||||
}
|
||||
|
||||
tasks.register('formatChanged') {
|
||||
dependsOn(':datahub-web-react:graphqlPrettierWriteChanged')
|
||||
dependsOn(':datahub-web-react:githubActionsPrettierWriteChanged')
|
||||
dependsOn(':datahub-web-react:mdPrettierWriteChanged')
|
||||
dependsOn('spotlessApply')
|
||||
}
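Assuming the aggregate tasks above are registered in the root `build.gradle`, a typical invocation from the repository root would be:

```shell
# Rewrites GraphQL, GitHub Actions YAML and Markdown via prettier, plus spotlessApply
./gradlew format

# Variant wired to the *WriteChanged prettier tasks (plus spotlessApply)
./gradlew formatChanged
```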
|
||||
|
24
datahub-actions/.gitignore
vendored
Normal file
@ -0,0 +1,24 @@
|
||||
venv*/
|
||||
.coverage
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
pip-wheel-metadata/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
junit.*xml
|
242
datahub-actions/README.md
Normal file
@ -0,0 +1,242 @@
|
||||
# ⚡ DataHub Actions Framework
|
||||
|
||||
Welcome to DataHub Actions! The Actions framework makes responding to realtime changes in your Metadata Graph easy, enabling you to seamlessly integrate [DataHub](https://github.com/datahub-project/datahub) into a broader events-based architecture.
|
||||
|
||||
For a detailed introduction, check out the [original announcement](https://www.youtube.com/watch?v=7iwNxHgqxtg&t=2189s) of the DataHub Actions Framework at the DataHub April 2022 Town Hall. For a more in-depth look at use cases and concepts, check out [DataHub Actions Concepts](../docs/actions/concepts.md).
|
||||
|
||||
## Quickstart
|
||||
|
||||
To get started right away, check out the [DataHub Actions Quickstart](../docs/actions/quickstart.md) Guide.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
The DataHub Actions CLI commands are an extension of the base `datahub` CLI commands. We recommend
|
||||
first installing the `datahub` CLI:
|
||||
|
||||
```shell
|
||||
python3 -m pip install --upgrade pip wheel setuptools
|
||||
python3 -m pip install --upgrade acryl-datahub
|
||||
datahub --version
|
||||
```
|
||||
|
||||
> Note that the Actions Framework requires a version of `acryl-datahub` >= v0.8.34
|
||||
|
||||
## Installation
|
||||
|
||||
Next, simply install the `acryl-datahub-actions` package from PyPi:
|
||||
|
||||
```shell
|
||||
python3 -m pip install --upgrade pip wheel setuptools
|
||||
python3 -m pip install --upgrade acryl-datahub-actions
|
||||
datahub actions version
|
||||
```
|
||||
|
||||
## Configuring an Action
|
||||
|
||||
Actions are configured using a YAML file, much in the same way DataHub ingestion sources are. An action configuration file consists of the following
|
||||
|
||||
1. Action Pipeline Name (Should be unique and static)
|
||||
2. Source Configurations
|
||||
3. Transform + Filter Configurations
|
||||
4. Action Configuration
|
||||
5. Pipeline Options (Optional)
|
||||
6. DataHub API configs (Optional - required for select actions)
|
||||
|
||||
Each component is independently pluggable and configurable.
|
||||
|
||||
```yml
|
||||
# 1. Required: Action Pipeline Name
|
||||
name: <action-pipeline-name>
|
||||
|
||||
# 2. Required: Event Source - Where to source event from.
|
||||
source:
|
||||
type: <source-type>
|
||||
config:
|
||||
# Event Source specific configs (map)
|
||||
|
||||
# 3a. Optional: Filter to run on events (map)
|
||||
filter:
|
||||
event_type: <filtered-event-type>
|
||||
event:
|
||||
# Filter event fields by exact-match
|
||||
<filtered-event-fields>
|
||||
|
||||
# 3b. Optional: Custom Transformers to run on events (array)
|
||||
transform:
|
||||
- type: <transformer-type>
|
||||
config:
|
||||
# Transformer-specific configs (map)
|
||||
|
||||
# 4. Required: Action - What action to take on events.
|
||||
action:
|
||||
type: <action-type>
|
||||
config:
|
||||
# Action-specific configs (map)
|
||||
|
||||
# 5. Optional: Additional pipeline options (error handling, etc)
|
||||
options:
|
||||
retry_count: 0 # The number of times to retry an Action with the same event. (If an exception is thrown). 0 by default.
|
||||
failure_mode: "CONTINUE" # What to do when an event fails to be processed. Either 'CONTINUE' to make progress or 'THROW' to stop the pipeline. Either way, the failed event will be logged to a failed_events.log file.
|
||||
failed_events_dir: "/tmp/datahub/actions" # The directory in which to write a failed_events.log file that tracks events which fail to be processed. Defaults to "/tmp/logs/datahub/actions".
|
||||
|
||||
# 6. Optional: DataHub API configuration
|
||||
datahub:
|
||||
server: "http://localhost:8080" # Location of DataHub API
|
||||
# token: <your-access-token> # Required if Metadata Service Auth enabled
|
||||
```
|
||||
|
||||
### Example: Hello World
|
||||
|
||||
A simple configuration file for a "Hello World" action, which simply prints all events it receives, is:
|
||||
|
||||
```yml
|
||||
# 1. Action Pipeline Name
|
||||
name: "hello_world"
|
||||
# 2. Event Source: Where to source event from.
|
||||
source:
|
||||
type: "kafka"
|
||||
config:
|
||||
connection:
|
||||
bootstrap: ${KAFKA_BOOTSTRAP_SERVER:-localhost:9092}
|
||||
schema_registry_url: ${SCHEMA_REGISTRY_URL:-http://localhost:8081}
|
||||
# 3. Action: What action to take on events.
|
||||
action:
|
||||
type: "hello_world"
|
||||
```
|
||||
|
||||
We can modify this configuration further to filter for specific events, by adding a "filter" block.
|
||||
|
||||
```yml
|
||||
# 1. Action Pipeline Name
|
||||
name: "hello_world"
|
||||
|
||||
# 2. Event Source - Where to source event from.
|
||||
source:
|
||||
type: "kafka"
|
||||
config:
|
||||
connection:
|
||||
bootstrap: ${KAFKA_BOOTSTRAP_SERVER:-localhost:9092}
|
||||
schema_registry_url: ${SCHEMA_REGISTRY_URL:-http://localhost:8081}
|
||||
|
||||
# 3. Filter - Filter events that reach the Action
|
||||
filter:
|
||||
event_type: "EntityChangeEvent_v1"
|
||||
event:
|
||||
category: "TAG"
|
||||
operation: "ADD"
|
||||
modifier: "urn:li:tag:pii"
|
||||
|
||||
# 4. Action - What action to take on events.
|
||||
action:
|
||||
type: "hello_world"
|
||||
```
|
||||
|
||||
## Running an Action
|
||||
|
||||
To run a new Action, just use the `actions` CLI command:
|
||||
|
||||
```
|
||||
datahub actions -c <config.yml>
|
||||
```
|
||||
|
||||
Once the Action is running, you will see:
|
||||
|
||||
```
|
||||
Action Pipeline with name '<action-pipeline-name>' is now running.
|
||||
```
|
||||
|
||||
### Running multiple Actions
|
||||
|
||||
You can run multiple Actions Pipelines within the same command. Simply provide multiple
|
||||
config files by repeating the `-c` command line argument.
|
||||
|
||||
For example,
|
||||
|
||||
```
|
||||
datahub actions -c <config-1.yaml> -c <config-2.yaml>
|
||||
```
|
||||
|
||||
### Running in debug mode
|
||||
|
||||
Simply append the `--debug` flag to the CLI to run your action in debug mode.
|
||||
|
||||
```
|
||||
datahub actions -c <config.yaml> --debug
|
||||
```
|
||||
|
||||
### Stopping an Action
|
||||
|
||||
Just issue a Control-C as usual. You should see the Actions Pipeline shut down gracefully, with a small
|
||||
summary of processing results.
|
||||
|
||||
```
|
||||
Actions Pipeline with name '<action-pipeline-name>' has been stopped.
|
||||
```
|
||||
|
||||
## Supported Events
|
||||
|
||||
Two event types are currently supported. Read more about them below.
|
||||
|
||||
- [Entity Change Event V1](../docs/actions/events/entity-change-event.md)
|
||||
- [Metadata Change Log V1](../docs/actions/events/metadata-change-log-event.md)
|
||||
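The Entity Change Event is the higher-level, semantic event. As a rough sketch only (not the full schema; see the Entity Change Event doc linked above), its body carries fields like the ones below, which are also the fields that the `filter` block shown earlier matches against. The dataset URN and tag values are purely illustrative.

```python
# Illustrative sketch of an Entity Change Event body - not the full schema.
# All values below are made-up examples.
entity_change_event = {
    "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)",
    "entityType": "dataset",
    "category": "TAG",             # e.g. TAG, GLOSSARY_TERM, OWNER, ...
    "operation": "ADD",            # e.g. ADD, REMOVE
    "modifier": "urn:li:tag:pii",  # the attached / detached entity, if any
    "auditStamp": {
        "actor": "urn:li:corpuser:datahub",
        "time": 1649953100653,
    },
}
```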
|
||||
## Supported Event Sources
|
||||
|
||||
Currently, the only event source that is officially supported is `kafka`, which polls for events
|
||||
via a Kafka Consumer.
|
||||
|
||||
- [Kafka Event Source](../docs/actions/sources/kafka-event-source.md)
|
||||
|
||||
## Supported Actions
|
||||
|
||||
By default, DataHub supports a set of standard Action plugins. These can be found inside the folder
|
||||
`src/datahub-actions/plugins`.
|
||||
|
||||
Some pre-included Actions include:
|
||||
|
||||
- [Hello World](../docs/actions/actions/hello_world.md)
|
||||
- [Executor](../docs/actions/actions/executor.md)
|
||||
|
||||
## Development
|
||||
|
||||
### Build and Test
|
||||
|
||||
Note that all Actions commands are also supported via a separate `datahub-actions` CLI entry point. Feel free
|
||||
to use this during development.
|
||||
|
||||
```
|
||||
# Build datahub-actions module
|
||||
./gradlew datahub-actions:build
|
||||
|
||||
# Drop into virtual env
|
||||
cd datahub-actions && source venv/bin/activate
|
||||
|
||||
# Start hello world action
|
||||
datahub-actions actions -c ../examples/hello_world.yaml
|
||||
|
||||
# Start ingestion executor action
|
||||
datahub-actions actions -c ../examples/executor.yaml
|
||||
|
||||
# Start multiple actions
|
||||
datahub-actions actions -c ../examples/executor.yaml -c ../examples/hello_world.yaml
|
||||
```
|
||||
|
||||
### Developing a Transformer
|
||||
|
||||
To develop a new Transformer, check out the [Developing a Transformer](../docs/actions/guides/developing-a-transformer.md) guide.
|
||||
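Before diving into that guide, the sketch below shows roughly what a minimal pass-through Transformer could look like. It assumes the Transformer interface exposes a `create` factory method and a `transform` method that returns the (possibly modified) event, or `None` to drop it; the import path of the base class is an assumption as well, so treat the guide as authoritative.

```python
from typing import Optional

from datahub_actions.event.event_envelope import EventEnvelope
from datahub_actions.pipeline.pipeline_context import PipelineContext

# Assumed import path for the Transformer base class - see the guide above.
from datahub_actions.transform.transformer import Transformer


class NoOpTransformer(Transformer):
    """A pass-through Transformer sketch that forwards every event unchanged."""

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "Transformer":
        return cls()

    def transform(self, event: EventEnvelope) -> Optional[EventEnvelope]:
        # Return the event to forward it downstream; return None to drop it.
        return event
```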
|
||||
### Developing an Action
|
||||
|
||||
To develop a new Action, check out the [Developing an Action](../docs/actions/guides/developing-an-action.md) guide.
|
||||
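For orientation, here is a minimal sketch of a custom Action, modeled on the `Action` base class added in this change (see `datahub-actions/src/datahub_actions/action/action.py` below): it implements the required `create` factory method, the `act` method invoked for every event, and `close` inherited from `Closeable`. The class name and logging behavior are illustrative only; packaging and plugin registration are covered in the guide.

```python
import logging

from datahub_actions.action.action import Action
from datahub_actions.event.event_envelope import EventEnvelope
from datahub_actions.pipeline.pipeline_context import PipelineContext

logger = logging.getLogger(__name__)


class LoggingAction(Action):
    """An illustrative Action that simply logs every event it receives."""

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "Action":
        # config_dict holds the `action.config` block from the pipeline YAML.
        return cls()

    def act(self, event: EventEnvelope) -> None:
        logger.info("Received event: %s", event)

    def close(self) -> None:
        # Release any resources (clients, connections) held by the Action.
        pass
```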
|
||||
## Contributing
|
||||
|
||||
Contributing guidelines follow those of the [main DataHub project](../docs/CONTRIBUTING.md). We are accepting contributions for Actions, Transformers, and general framework improvements (tests, error handling, etc).
|
||||
|
||||
## Resources
|
||||
|
||||
Check out the [original announcement](https://www.youtube.com/watch?v=7iwNxHgqxtg&t=2189s) of the DataHub Actions Framework at the DataHub April 2022 Town Hall.
|
||||
|
||||
## License
|
||||
|
||||
[Apache 2.0](./LICENSE)
|
datahub-actions/build.gradle (new file, 162 lines)
@ -0,0 +1,162 @@
|
||||
/**
|
||||
* Copyright 2021 Acryl Data, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
plugins {
|
||||
id 'scala'
|
||||
id 'org.gradle.playframework'
|
||||
}
|
||||
|
||||
apply from: "../gradle/versioning/versioning.gradle"
|
||||
apply from: "../gradle/coverage/python-coverage.gradle"
|
||||
apply from: '../gradle/docker/docker.gradle'
|
||||
|
||||
ext {
|
||||
python_executable = 'python3'
|
||||
venv_name = 'venv'
|
||||
docker_registry = 'acryldata'
|
||||
docker_repo = 'datahub-actions'
|
||||
docker_target = project.getProperties().getOrDefault("dockerTarget", "slim")
|
||||
|
||||
python_docker_version = project.getProperties().getOrDefault("pythonDockerVersion", "1!0.0.0+docker.${version}")
|
||||
}
|
||||
|
||||
if (!project.hasProperty("extra_pip_requirements")) {
|
||||
ext.extra_pip_requirements = ""
|
||||
}
|
||||
|
||||
def pip_install_command = "VIRTUAL_ENV=${venv_name} ${venv_name}/bin/uv pip install -e ../metadata-ingestion"
|
||||
|
||||
task checkPythonVersion(type: Exec) {
|
||||
commandLine python_executable, '-c',
|
||||
'import sys; assert sys.version_info >= (3, 8), f"Python version {sys.version_info[:2]} not allowed"'
|
||||
}
|
||||
|
||||
task environmentSetup(type: Exec, dependsOn: checkPythonVersion) {
|
||||
def sentinel_file = "${venv_name}/.venv_environment_sentinel"
|
||||
inputs.file file('setup.py')
|
||||
outputs.file(sentinel_file)
|
||||
commandLine 'bash', '-c',
|
||||
"${python_executable} -m venv ${venv_name} && " +
|
||||
"${venv_name}/bin/python -m pip install --upgrade uv && " +
|
||||
"touch ${sentinel_file}"
|
||||
}
|
||||
|
||||
task installPackage(type: Exec, dependsOn: [environmentSetup, ':metadata-ingestion:codegen']) {
|
||||
def sentinel_file = "${venv_name}/.build_install_package_sentinel"
|
||||
inputs.file file('setup.py')
|
||||
outputs.file(sentinel_file)
|
||||
commandLine 'bash', '-c',
|
||||
"source ${venv_name}/bin/activate && set -x && " +
|
||||
"${pip_install_command} -e . ${extra_pip_requirements} && " +
|
||||
"touch ${sentinel_file}"
|
||||
}
|
||||
|
||||
task install(dependsOn: [installPackage])
|
||||
|
||||
task installDev(type: Exec, dependsOn: [install]) {
|
||||
def sentinel_file = "${venv_name}/.build_install_dev_sentinel"
|
||||
inputs.file file('setup.py')
|
||||
outputs.file(sentinel_file)
|
||||
commandLine 'bash', '-c',
|
||||
"source ${venv_name}/bin/activate && set -x && " +
|
||||
"${pip_install_command} -e .[dev] ${extra_pip_requirements} && " +
|
||||
"touch ${sentinel_file}"
|
||||
}
|
||||
|
||||
task lint(type: Exec, dependsOn: installDev) {
|
||||
commandLine 'bash', '-c',
|
||||
"source ${venv_name}/bin/activate && set -x && " +
|
||||
"ruff check src/ tests/ && " +
|
||||
"ruff format --check src/ tests/ && " +
|
||||
"mypy --show-traceback --show-error-codes src/ tests/"
|
||||
}
|
||||
|
||||
task lintFix(type: Exec, dependsOn: installDev) {
|
||||
commandLine 'bash', '-c',
|
||||
"source ${venv_name}/bin/activate && set -x && " +
|
||||
"ruff check --fix src/ tests/ && " +
|
||||
"ruff format src/ tests/ "
|
||||
}
|
||||
|
||||
task installDevTest(type: Exec, dependsOn: [installDev]) {
|
||||
def sentinel_file = "${venv_name}/.build_install_dev_test_sentinel"
|
||||
inputs.file file('setup.py')
|
||||
outputs.dir("${venv_name}")
|
||||
outputs.file(sentinel_file)
|
||||
commandLine 'bash', '-c',
|
||||
"source ${venv_name}/bin/activate && set -x && " +
|
||||
"${pip_install_command} -e .[dev,integration-tests] ${extra_pip_requirements} && " +
|
||||
"touch ${sentinel_file}"
|
||||
}
|
||||
|
||||
task testFull(type: Exec, dependsOn: installDevTest) {
|
||||
inputs.files(project.fileTree(dir: "src/", include: "**/*.py"))
|
||||
inputs.files(project.fileTree(dir: "tests/"))
|
||||
outputs.dir("${venv_name}")
|
||||
commandLine 'bash', '-c',
|
||||
"source ${venv_name}/bin/activate && set -x && " +
|
||||
"pytest -vv ${get_coverage_args('full')} --continue-on-collection-errors --junit-xml=junit.full.xml"
|
||||
}
|
||||
|
||||
task buildWheel(type: Exec, dependsOn: [environmentSetup]) {
|
||||
commandLine 'bash', '-c', "source ${venv_name}/bin/activate && " +
|
||||
'uv pip install build && RELEASE_VERSION="\${RELEASE_VERSION:-0.0.0.dev1}" RELEASE_SKIP_INSTALL=1 RELEASE_SKIP_UPLOAD=1 ./scripts/release.sh'
|
||||
}
|
||||
|
||||
|
||||
task cleanPythonCache(type: Exec) {
|
||||
commandLine 'bash', '-x', '-c',
|
||||
"find src -type f -name '*.py[co]' -delete -o -type d -name __pycache__ -delete -o -type d -empty -delete"
|
||||
}
|
||||
|
||||
docker {
|
||||
dependsOn ':metadata-ingestion:codegen'
|
||||
name "${docker_registry}/${docker_repo}:${versionTag}"
|
||||
dockerfile file("${rootProject.projectDir}/docker/datahub-actions/Dockerfile")
|
||||
files fileTree(rootProject.projectDir) {
|
||||
exclude "datahub-actions/scripts/**"
|
||||
exclude "datahub-actions/build/**"
|
||||
exclude "datahub-actions/venv/**"
|
||||
exclude "datahub-actions/tests/**"
|
||||
exclude "**/*.xml"
|
||||
include ".dockerignore"
|
||||
include "docker/datahub-actions/**"
|
||||
include "docker/snippets/**"
|
||||
include "metadata-ingestion/**"
|
||||
include "datahub-actions/**"
|
||||
include "python-build/**"
|
||||
}.exclude {
|
||||
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
|
||||
}
|
||||
|
||||
additionalTag("Debug", "${docker_registry}/${docker_repo}:debug")
|
||||
|
||||
defaultVariant = "slim"
|
||||
variants = [
|
||||
"slim": [suffix: "-slim", args: [APP_ENV: "slim", RELEASE_VERSION: python_docker_version]],
|
||||
"full": [suffix: "", args: [APP_ENV: "full", RELEASE_VERSION: python_docker_version]]
|
||||
]
|
||||
}
|
||||
|
||||
build.dependsOn install
|
||||
check.dependsOn lint
|
||||
check.dependsOn testFull
|
||||
|
||||
clean {
|
||||
delete venv_name
|
||||
delete 'build'
|
||||
delete 'dist'
|
||||
}
|
||||
clean.dependsOn cleanPythonCache
|
datahub-actions/examples/executor.yaml (new file, 19 lines)
@ -0,0 +1,19 @@
|
||||
name: "ingestion_executor"
|
||||
source:
|
||||
type: "kafka"
|
||||
config:
|
||||
connection:
|
||||
bootstrap: ${KAFKA_BOOTSTRAP_SERVER:-localhost:9092}
|
||||
schema_registry_url: ${SCHEMA_REGISTRY_URL:-http://localhost:8081}
|
||||
topic_routes:
|
||||
mcl: ${METADATA_CHANGE_LOG_VERSIONED_TOPIC_NAME:-MetadataChangeLog_Versioned_v1}
|
||||
filter:
|
||||
event_type: "MetadataChangeLogEvent_v1"
|
||||
event:
|
||||
entityType: "dataHubExecutionRequest"
|
||||
changeType: "UPSERT"
|
||||
action:
|
||||
type: "executor"
|
||||
datahub:
|
||||
server: "${DATAHUB_GMS_PROTOCOL:-http}://${DATAHUB_GMS_HOST:-localhost}:${DATAHUB_GMS_PORT:-8080}"
|
||||
# token: <your-access-token> # Requires 'Manage Secrets' platform privilege.
|
datahub-actions/examples/hello_world.yaml (new file, 12 lines)
@ -0,0 +1,12 @@
|
||||
# hello_world.yaml
|
||||
name: "hello_world"
|
||||
# 1. Event Source: Where to source event from.
|
||||
source:
|
||||
type: "kafka"
|
||||
config:
|
||||
connection:
|
||||
bootstrap: ${KAFKA_BOOTSTRAP_SERVER:-localhost:9092}
|
||||
schema_registry_url: ${SCHEMA_REGISTRY_URL:-http://localhost:8081}
|
||||
# 2. Action: What action to take on events.
|
||||
action:
|
||||
type: "hello_world"
|
datahub-actions/examples/hello_world_datahub_cloud.yaml (new file, 13 lines)
@ -0,0 +1,13 @@
|
||||
# hello_world.yaml
|
||||
name: "hello_world_datahub_cloud"
|
||||
# 1. DataHub Cloud Connection: Configure how to talk to DataHub Cloud
|
||||
datahub:
|
||||
server: "https://<your-organization>.acryl.io"
|
||||
token: "<your-datahub-cloud-token>"
|
||||
# 2. Event Source: Where to source event from.
|
||||
source:
|
||||
type: "datahub-cloud"
|
||||
# 3. Action: What action to take on events.
|
||||
# To learn how to develop a custom Action, see https://docs.datahub.com/docs/actions/guides/developing-an-action
|
||||
action:
|
||||
type: "hello_world"
|
datahub-actions/examples/metadata_change_sync.yaml (new file, 28 lines)
@ -0,0 +1,28 @@
|
||||
name: "metadata_change_sync"
|
||||
source:
|
||||
type: "kafka"
|
||||
config:
|
||||
connection:
|
||||
bootstrap: ${KAFKA_BOOTSTRAP_SERVER:-localhost:9092}
|
||||
schema_registry_url: ${SCHEMA_REGISTRY_URL:-http://localhost:8081}
|
||||
filter:
|
||||
event_type: "MetadataChangeLogEvent_v1"
|
||||
event:
|
||||
changeType: "UPSERT"
|
||||
action:
|
||||
type: "metadata_change_sync"
|
||||
config:
|
||||
gms_server: ${DEST_DATAHUB_GMS_URL}
|
||||
# If you have METADATA_SERVICE_AUTH_ENABLED enabled in GMS, you'll need to configure the auth token here
|
||||
gms_auth_token: ${DEST_DATAHUB_GMS_TOKEN}
|
||||
# you can provide a list of aspects you would like to exclude
|
||||
# By default, we are excluding these aspects:
|
||||
# dataHubAccessTokenInfo, dataHubAccessTokenKey, dataHubSecretKey, dataHubSecretValue, dataHubExecutionRequestInput
|
||||
# dataHubExecutionRequestKey, dataHubExecutionRequestResult
|
||||
aspects_to_exclude: []
|
||||
aspects_to_include: ['schemaMetadata','editableSchemaMetadata','ownership','domain']
|
||||
# you can provide extra headers in the request in key value format
|
||||
extra_headers: {}
|
||||
# you can provide a regex pattern for URNs to include
|
||||
# By default, we are including all URNs
|
||||
urn_regex: ".*"
|
datahub-actions/examples/snowflake_tag_propagation.yaml (new file, 29 lines)
@ -0,0 +1,29 @@
|
||||
name: "snowflake_tag_propagation"
|
||||
source:
|
||||
type: "kafka"
|
||||
config:
|
||||
connection:
|
||||
bootstrap: ${KAFKA_BOOTSTRAP_SERVER:-localhost:9092}
|
||||
schema_registry_url: ${SCHEMA_REGISTRY_URL:-http://localhost:8081}
|
||||
filter:
|
||||
event_type: "EntityChangeEvent_v1"
|
||||
action:
|
||||
type: "snowflake_tag_propagation"
|
||||
config:
|
||||
tag_propagation:
|
||||
tag_prefixes:
|
||||
- classification
|
||||
term_propagation:
|
||||
target_terms:
|
||||
- Classification
|
||||
term_groups:
|
||||
- "Personal Information"
|
||||
snowflake:
|
||||
account_id: ${SNOWFLAKE_ACCOUNT_ID}
|
||||
warehouse: COMPUTE_WH
|
||||
username: ${SNOWFLAKE_USER_NAME}
|
||||
password: ${SNOWFLAKE_PASSWORD}
|
||||
role: ACCOUNTADMIN
|
||||
|
||||
datahub:
|
||||
server: "http://localhost:8080"
|
datahub-actions/pyproject.toml (new file, 60 lines)
@ -0,0 +1,60 @@
|
||||
[build-system]
|
||||
build-backend = "setuptools.build_meta"
|
||||
requires = ["setuptools>65.5.1", "wheel>0.38.1", "pip>=21.0.0"]
|
||||
|
||||
[tool.ruff]
|
||||
line-length = 88
|
||||
target-version = "py38"
|
||||
exclude = [
|
||||
".git",
|
||||
"venv",
|
||||
".tox",
|
||||
"__pycache__",
|
||||
]
|
||||
|
||||
[tool.ruff.format]
|
||||
quote-style = "double"
|
||||
indent-style = "space"
|
||||
skip-magic-trailing-comma = false
|
||||
line-ending = "auto"
|
||||
|
||||
[tool.ruff.lint.isort]
|
||||
combine-as-imports = true
|
||||
known-first-party = ["datahub"]
|
||||
extra-standard-library = ["__future__"]
|
||||
section-order = ["future", "standard-library", "third-party", "first-party", "local-folder"]
|
||||
force-sort-within-sections = false
|
||||
force-wrap-aliases = false
|
||||
split-on-trailing-comma = false
|
||||
order-by-type = true
|
||||
relative-imports-order = "closest-to-furthest"
|
||||
force-single-line = false
|
||||
single-line-exclusions = ["typing"]
|
||||
length-sort = false
|
||||
from-first = false
|
||||
required-imports = []
|
||||
classes = ["typing"]
|
||||
|
||||
[tool.ruff.lint]
|
||||
extend-select = [
|
||||
"B", # flake8-bugbear
|
||||
"C90", # mccabe complexity
|
||||
"E", # pycodestyle errors
|
||||
"F", # pyflakes
|
||||
"G010", # logging.warn -> logging.warning
|
||||
"I", # isort
|
||||
"TID", # flake8-tidy-imports
|
||||
"RUF100", # unused-noqa
|
||||
]
|
||||
ignore = [
|
||||
"E501", # Line length violations (handled by formatter)
|
||||
]
|
||||
|
||||
[tool.ruff.lint.mccabe]
|
||||
max-complexity = 15
|
||||
|
||||
[tool.ruff.lint.flake8-tidy-imports]
|
||||
ban-relative-imports = "all"
|
||||
|
||||
[tool.ruff.lint.per-file-ignores]
|
||||
"__init__.py" = ["F401"]
|
datahub-actions/scripts/release.sh (new executable file, 31 lines)
@ -0,0 +1,31 @@
|
||||
#!/bin/bash
|
||||
# Auto-generated by python-build/generate_release_scripts.py. Do not edit manually.
|
||||
|
||||
set -euxo pipefail
|
||||
|
||||
ROOT=..
|
||||
MODULE=datahub_actions
|
||||
|
||||
if [[ ! ${RELEASE_SKIP_TEST:-} ]] && [[ ! ${RELEASE_SKIP_INSTALL:-} ]]; then
|
||||
${ROOT}/gradlew build # also runs tests
|
||||
elif [[ ! ${RELEASE_SKIP_INSTALL:-} ]]; then
|
||||
${ROOT}/gradlew install
|
||||
fi
|
||||
|
||||
# Check packaging constraint.
|
||||
python -c 'import setuptools; where="./src"; assert setuptools.find_packages(where) == setuptools.find_namespace_packages(where), "you seem to be missing or have extra __init__.py files"'
|
||||
|
||||
# Update the release version.
|
||||
if [[ ! ${RELEASE_VERSION:-} ]]; then
|
||||
echo "RELEASE_VERSION is not set"
|
||||
exit 1
|
||||
fi
|
||||
sed -i.bak "s/__version__ = .*$/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/${MODULE}/_version.py
|
||||
|
||||
# Build and upload the release.
|
||||
rm -rf build dist || true
|
||||
python -m build
|
||||
if [[ ! ${RELEASE_SKIP_UPLOAD:-} ]]; then
|
||||
python -m twine upload 'dist/*'
|
||||
fi
|
||||
mv src/${MODULE}/_version.py.bak src/${MODULE}/_version.py
|
datahub-actions/setup.cfg (new file, 49 lines)
@ -0,0 +1,49 @@
|
||||
[mypy]
|
||||
plugins =
|
||||
pydantic.mypy
|
||||
exclude = ^(venv|build|dist)/
|
||||
ignore_missing_imports = yes
|
||||
strict_optional = yes
|
||||
check_untyped_defs = yes
|
||||
disallow_incomplete_defs = yes
|
||||
disallow_untyped_decorators = yes
|
||||
warn_unused_configs = yes
|
||||
# eventually we'd like to enable these
|
||||
disallow_untyped_defs = no
|
||||
|
||||
# try to be a bit more strict in certain areas of the codebase
|
||||
[mypy-datahub.*]
|
||||
ignore_missing_imports = no
|
||||
[mypy-tests.*]
|
||||
ignore_missing_imports = no
|
||||
|
||||
[tool:pytest]
|
||||
asyncio_mode = auto
|
||||
addopts = --cov=src --cov-report='' --cov-config setup.cfg --strict-markers -s -v
|
||||
markers =
|
||||
integration: marks tests to only run in integration (deselect with '-m "not integration"')
|
||||
|
||||
testpaths =
|
||||
tests/unit
|
||||
tests/integration
|
||||
|
||||
# [coverage:run]
|
||||
# # Because of some quirks in the way setup.cfg, coverage.py, pytest-cov,
|
||||
# # and tox interact, we should not uncomment the following line.
|
||||
# # See https://pytest-cov.readthedocs.io/en/latest/config.html and
|
||||
# # https://coverage.readthedocs.io/en/coverage-5.0/config.html.
|
||||
# # We also have some additional pytest/cov config options in tox.ini.
|
||||
# # source = src
|
||||
|
||||
# [coverage:paths]
|
||||
# # This is necessary for tox-based coverage to be counted properly.
|
||||
# source =
|
||||
# src
|
||||
# */site-packages
|
||||
|
||||
[coverage:report]
|
||||
show_missing = true
|
||||
exclude_lines =
|
||||
pragma: no cover
|
||||
@abstract
|
||||
if TYPE_CHECKING:
|
datahub-actions/setup.py (new file, 251 lines)
@ -0,0 +1,251 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
from typing import Dict, Set
|
||||
|
||||
import setuptools
|
||||
|
||||
package_metadata: dict = {}
|
||||
with open("./src/datahub_actions/_version.py") as fp:
|
||||
exec(fp.read(), package_metadata)
|
||||
|
||||
_version: str = package_metadata["__version__"]
|
||||
_self_pin = (
|
||||
f"=={_version}"
|
||||
if not (_version.endswith(("dev0", "dev1")) or "docker" in _version)
|
||||
else ""
|
||||
)
|
||||
|
||||
|
||||
def get_long_description():
|
||||
root = os.path.dirname(__file__)
|
||||
with open(os.path.join(root, "README.md")) as f:
|
||||
description = f.read()
|
||||
|
||||
return description
|
||||
|
||||
|
||||
lint_requirements = {
|
||||
# This is pinned only to avoid spurious errors in CI.
|
||||
# We should make an effort to keep it up to date.
|
||||
"ruff==0.11.7",
|
||||
"mypy==1.14.1",
|
||||
}
|
||||
|
||||
base_requirements = {
|
||||
f"acryl-datahub[datahub-kafka]{_self_pin}",
|
||||
# Compatibility.
|
||||
"typing_extensions>=3.7.4; python_version < '3.8'",
|
||||
"mypy_extensions>=0.4.3",
|
||||
# Actual dependencies.
|
||||
"typing-inspect",
|
||||
"pydantic>=1.10.21",
|
||||
"ratelimit",
|
||||
# Lower bounds on httpcore and h11 due to CVE-2025-43859.
|
||||
"httpcore>=1.0.9",
|
||||
"azure-identity==1.21.0",
|
||||
"aws-msk-iam-sasl-signer-python==1.0.2",
|
||||
"h11>=0.16",
|
||||
}
|
||||
|
||||
framework_common = {
|
||||
"click>=6.0.0",
|
||||
"click-default-group",
|
||||
"prometheus-client",
|
||||
"PyYAML",
|
||||
"toml>=0.10.0",
|
||||
"entrypoints",
|
||||
"python-dateutil>=2.8.0",
|
||||
"stackprinter",
|
||||
"progressbar2",
|
||||
"tenacity",
|
||||
}
|
||||
|
||||
# Note: for all of these, framework_common will be added.
|
||||
plugins: Dict[str, Set[str]] = {
|
||||
# Source Plugins
|
||||
"kafka": {
|
||||
"confluent-kafka[schemaregistry]",
|
||||
},
|
||||
# Action Plugins
|
||||
"executor": {
|
||||
"acryl-executor==0.2.2",
|
||||
},
|
||||
"slack": {
|
||||
"slack-bolt>=1.15.5",
|
||||
},
|
||||
"teams": {
|
||||
"pymsteams >=0.2.2",
|
||||
},
|
||||
"tag_propagation": set(),
|
||||
"term_propagation": set(),
|
||||
"snowflake_tag_propagation": {
|
||||
f"acryl-datahub[snowflake-slim]{_self_pin}",
|
||||
},
|
||||
"doc_propagation": set(),
|
||||
# Transformer Plugins (None yet)
|
||||
}
|
||||
|
||||
mypy_stubs = {
|
||||
"types-pytz",
|
||||
"types-dataclasses",
|
||||
"sqlalchemy-stubs",
|
||||
"types-setuptools",
|
||||
"types-six",
|
||||
"types-python-dateutil",
|
||||
"types-requests",
|
||||
"types-toml",
|
||||
"types-PyMySQL",
|
||||
"types-PyYAML",
|
||||
"types-freezegun",
|
||||
"types-cachetools",
|
||||
# versions 0.1.13 and 0.1.14 seem to have issues
|
||||
"types-click==0.1.12",
|
||||
}
|
||||
|
||||
base_dev_requirements = {
|
||||
*lint_requirements,
|
||||
*base_requirements,
|
||||
*framework_common,
|
||||
*mypy_stubs,
|
||||
"coverage>=5.1",
|
||||
"pytest>=6.2.2",
|
||||
"pytest-cov>=2.8.1",
|
||||
"pytest-dependency>=0.5.1",
|
||||
"pytest-docker>=0.10.3",
|
||||
"tox",
|
||||
"deepdiff",
|
||||
"requests-mock",
|
||||
"freezegun",
|
||||
"jsonpickle",
|
||||
"build",
|
||||
"twine",
|
||||
*list(
|
||||
dependency
|
||||
for plugin in [
|
||||
"kafka",
|
||||
"executor",
|
||||
"slack",
|
||||
"teams",
|
||||
"tag_propagation",
|
||||
"term_propagation",
|
||||
"snowflake_tag_propagation",
|
||||
"doc_propagation",
|
||||
]
|
||||
for dependency in plugins[plugin]
|
||||
),
|
||||
}
|
||||
|
||||
dev_requirements = {
|
||||
*base_dev_requirements,
|
||||
}
|
||||
|
||||
full_test_dev_requirements = {
|
||||
*list(
|
||||
dependency
|
||||
for plugin in [
|
||||
"kafka",
|
||||
"executor",
|
||||
"slack",
|
||||
"teams",
|
||||
"tag_propagation",
|
||||
"term_propagation",
|
||||
"snowflake_tag_propagation",
|
||||
"doc_propagation",
|
||||
]
|
||||
for dependency in plugins[plugin]
|
||||
),
|
||||
# In our tests, we want to always test against pydantic v2.
|
||||
# However, we maintain compatibility with pydantic v1 for now.
|
||||
"pydantic>2",
|
||||
}
|
||||
|
||||
entry_points = {
|
||||
"console_scripts": ["datahub-actions = datahub_actions.entrypoints:main"],
|
||||
"datahub_actions.action.plugins": [
|
||||
"executor = datahub_actions.plugin.action.execution.executor_action:ExecutorAction",
|
||||
"slack = datahub_actions.plugin.action.slack.slack:SlackNotificationAction",
|
||||
"teams = datahub_actions.plugin.action.teams.teams:TeamsNotificationAction",
|
||||
"metadata_change_sync = datahub_actions.plugin.action.metadata_change_sync.metadata_change_sync:MetadataChangeSyncAction",
|
||||
"tag_propagation = datahub_actions.plugin.action.tag.tag_propagation_action:TagPropagationAction",
|
||||
"term_propagation = datahub_actions.plugin.action.term.term_propagation_action:TermPropagationAction",
|
||||
"snowflake_tag_propagation = datahub_actions.plugin.action.snowflake.tag_propagator:SnowflakeTagPropagatorAction",
|
||||
"doc_propagation = datahub_actions.plugin.action.propagation.docs.propagation_action:DocPropagationAction",
|
||||
],
|
||||
"datahub_actions.transformer.plugins": [],
|
||||
"datahub_actions.source.plugins": [],
|
||||
}
|
||||
|
||||
|
||||
setuptools.setup(
|
||||
# Package metadata.
|
||||
name=package_metadata["__package_name__"],
|
||||
version=package_metadata["__version__"],
|
||||
url="https://docs.datahub.com/",
|
||||
project_urls={
|
||||
"Documentation": "https://docs.datahub.com/docs/actions",
|
||||
"Source": "https://github.com/acryldata/datahub-actions",
|
||||
"Changelog": "https://github.com/acryldata/datahub-actions/releases",
|
||||
},
|
||||
license="Apache License 2.0",
|
||||
description="An action framework to work with DataHub real time changes.",
|
||||
long_description=get_long_description(),
|
||||
long_description_content_type="text/markdown",
|
||||
classifiers=[
|
||||
"Development Status :: 5 - Production/Stable",
|
||||
"Programming Language :: Python",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3 :: Only",
|
||||
"Programming Language :: Python :: 3.7",
|
||||
"Programming Language :: Python :: 3.8",
|
||||
"Programming Language :: Python :: 3.9",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Intended Audience :: Developers",
|
||||
"Intended Audience :: Information Technology",
|
||||
"Intended Audience :: System Administrators",
|
||||
"License :: OSI Approved",
|
||||
"License :: OSI Approved :: Apache Software License",
|
||||
"Operating System :: Unix",
|
||||
"Operating System :: POSIX :: Linux",
|
||||
"Environment :: Console",
|
||||
"Environment :: MacOS X",
|
||||
"Topic :: Software Development",
|
||||
],
|
||||
# Package info.
|
||||
zip_safe=False,
|
||||
python_requires=">=3.8",
|
||||
package_dir={"": "src"},
|
||||
packages=setuptools.find_namespace_packages(where="./src"),
|
||||
package_data={
|
||||
"datahub_actions": ["py.typed"],
|
||||
},
|
||||
entry_points=entry_points,
|
||||
# Dependencies.
|
||||
install_requires=list(base_requirements | framework_common),
|
||||
extras_require={
|
||||
"base": list(framework_common),
|
||||
**{
|
||||
plugin: list(framework_common | dependencies)
|
||||
for (plugin, dependencies) in plugins.items()
|
||||
},
|
||||
"all": list(
|
||||
framework_common.union(
|
||||
*[requirements for plugin, requirements in plugins.items()]
|
||||
)
|
||||
),
|
||||
"dev": list(dev_requirements),
|
||||
"integration-tests": list(full_test_dev_requirements),
|
||||
},
|
||||
)
|
datahub-actions/src/datahub_actions/__init__.py (new file, 15 lines)
@ -0,0 +1,15 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from datahub_actions._version import __package_name__, __version__
|
datahub-actions/src/datahub_actions/_version.py (new file, 13 lines)
@ -0,0 +1,13 @@
|
||||
# Published at https://pypi.org/project/acryl-datahub-actions/.
|
||||
__package_name__ = "acryl-datahub-actions"
|
||||
__version__ = "1!0.0.0.dev0"
|
||||
|
||||
|
||||
def is_dev_mode() -> bool:
|
||||
return __version__.endswith("dev0")
|
||||
|
||||
|
||||
def nice_version_name() -> str:
|
||||
if is_dev_mode():
|
||||
return "unavailable (installed in develop mode)"
|
||||
return __version__
|
datahub-actions/src/datahub_actions/action/__init__.py (new file, 13 lines)
@ -0,0 +1,13 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
datahub-actions/src/datahub_actions/action/action.py (new file, 41 lines)
@ -0,0 +1,41 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from abc import ABCMeta, abstractmethod
|
||||
|
||||
from datahub.ingestion.api.closeable import Closeable
|
||||
from datahub_actions.event.event_envelope import EventEnvelope
|
||||
from datahub_actions.pipeline.pipeline_context import PipelineContext
|
||||
|
||||
|
||||
class Action(Closeable, metaclass=ABCMeta):
|
||||
"""
|
||||
The base class for all DataHub Actions.
|
||||
|
||||
A DataHub action is a component capable of performing a specific action (notification, auditing, synchronization, & more)
|
||||
when important events occur on DataHub.
|
||||
|
||||
Each Action may provide its own semantics, configurations, compatibility and guarantees.
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
@abstractmethod
|
||||
def create(cls, config_dict: dict, ctx: PipelineContext) -> "Action":
|
||||
"""Factory method to create an instance of an Action"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def act(self, event: EventEnvelope) -> None:
|
||||
"""Take Action on DataHub events, provided an instance of a DataHub event."""
|
||||
pass
|
@ -0,0 +1,21 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from datahub.ingestion.api.registry import PluginRegistry
|
||||
from datahub_actions.action.action import Action
|
||||
from datahub_actions.plugin.action.hello_world.hello_world import HelloWorldAction
|
||||
|
||||
action_registry = PluginRegistry[Action]()
|
||||
action_registry.register_from_entrypoint("datahub_actions.action.plugins")
|
||||
action_registry.register("hello_world", HelloWorldAction)
|
datahub-actions/src/datahub_actions/action/action_stats.py (new file, 40 lines)
@ -0,0 +1,40 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
|
||||
|
||||
# Class that stores running statistics for a single Action.
|
||||
# TODO: Invocation time tracking.
|
||||
class ActionStats:
|
||||
# The number of exceptions raised by the Action.
|
||||
exception_count: int = 0
|
||||
|
||||
# The number of events that were actually submitted to the Action
|
||||
success_count: int = 0
|
||||
|
||||
def increment_exception_count(self) -> None:
|
||||
self.exception_count = self.exception_count + 1
|
||||
|
||||
def get_exception_count(self) -> int:
|
||||
return self.exception_count
|
||||
|
||||
def increment_success_count(self) -> None:
|
||||
self.success_count = self.success_count + 1
|
||||
|
||||
def get_success_count(self) -> int:
|
||||
return self.success_count
|
||||
|
||||
def as_string(self) -> str:
|
||||
return json.dumps(self.__dict__, indent=4, sort_keys=True)
|
datahub-actions/src/datahub_actions/api/__init__.py (new file, 13 lines)
@ -0,0 +1,13 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
datahub-actions/src/datahub_actions/api/action_graph.py (new file, 413 lines)
@ -0,0 +1,413 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import logging
|
||||
import urllib.parse
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from datahub.configuration.common import OperationalError
|
||||
from datahub.ingestion.graph.client import DataHubGraph
|
||||
from datahub.metadata.schema_classes import (
|
||||
GlossaryTermAssociationClass,
|
||||
TagAssociationClass,
|
||||
)
|
||||
from datahub.specific.dataset import DatasetPatchBuilder
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class AcrylDataHubGraph:
|
||||
def __init__(self, baseGraph: DataHubGraph):
|
||||
self.graph = baseGraph
|
||||
|
||||
def get_by_query(
|
||||
self,
|
||||
query: str,
|
||||
entity: str,
|
||||
start: int = 0,
|
||||
count: int = 100,
|
||||
filters: Optional[Dict] = None,
|
||||
) -> List[Dict]:
|
||||
url_frag = "/entities?action=search"
|
||||
url = f"{self.graph._gms_server}{url_frag}"
|
||||
payload = {"input": query, "start": start, "count": count, "entity": entity}
|
||||
if filters is not None:
|
||||
payload["filter"] = filters
|
||||
|
||||
headers = {
|
||||
"X-RestLi-Protocol-Version": "2.0.0",
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
|
||||
try:
|
||||
response = self.graph._session.post(
|
||||
url, data=json.dumps(payload), headers=headers
|
||||
)
|
||||
if response.status_code != 200:
|
||||
return []
|
||||
json_resp = response.json()
|
||||
return json_resp.get("value", {}).get("entities")
|
||||
except Exception as e:
|
||||
print(e)
|
||||
return []
|
||||
|
||||
def get_by_graphql_query(self, query: Dict) -> Dict:
|
||||
url_frag = "/api/graphql"
|
||||
url = f"{self.graph._gms_server}{url_frag}"
|
||||
|
||||
headers = {
|
||||
"X-DataHub-Actor": "urn:li:corpuser:admin",
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
try:
|
||||
response = self.graph._session.post(
|
||||
url, data=json.dumps(query), headers=headers
|
||||
)
|
||||
if response.status_code != 200:
|
||||
return {}
|
||||
json_resp = response.json()
|
||||
return json_resp.get("data", {})
|
||||
except Exception as e:
|
||||
print(e)
|
||||
return {}
|
||||
|
||||
def query_constraints_for_dataset(self, dataset_id: str) -> List:
|
||||
resp = self.get_by_graphql_query(
|
||||
{
|
||||
"query": """
|
||||
query dataset($input: String!) {
|
||||
dataset(urn: $input) {
|
||||
constraints {
|
||||
type
|
||||
displayName
|
||||
description
|
||||
params {
|
||||
hasGlossaryTermInNodeParams {
|
||||
nodeName
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
""",
|
||||
"variables": {"input": dataset_id},
|
||||
}
|
||||
)
|
||||
constraints: List = resp.get("dataset", {}).get("constraints", [])
|
||||
return constraints
|
||||
|
||||
def query_execution_result_details(self, execution_id: str) -> Any:
|
||||
resp = self.get_by_graphql_query(
|
||||
{
|
||||
"query": """
|
||||
query executionRequest($urn: String!) {
|
||||
executionRequest(urn: $urn) {
|
||||
input {
|
||||
task
|
||||
arguments {
|
||||
key
|
||||
value
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
""",
|
||||
"variables": {"urn": f"urn:li:dataHubExecutionRequest:{execution_id}"},
|
||||
}
|
||||
)
|
||||
return resp.get("executionRequest", {}).get("input", {})
|
||||
|
||||
def query_ingestion_sources(self) -> List:
|
||||
sources = []
|
||||
start, count = 0, 10
|
||||
while True:
|
||||
resp = self.get_by_graphql_query(
|
||||
{
|
||||
"query": """
|
||||
query listIngestionSources($input: ListIngestionSourcesInput!, $execution_start: Int!, $execution_count: Int!) {
|
||||
listIngestionSources(input: $input) {
|
||||
start
|
||||
count
|
||||
total
|
||||
ingestionSources {
|
||||
urn
|
||||
type
|
||||
name
|
||||
executions(start: $execution_start, count: $execution_count) {
|
||||
start
|
||||
count
|
||||
total
|
||||
executionRequests {
|
||||
urn
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
""",
|
||||
"variables": {
|
||||
"input": {"start": start, "count": count},
|
||||
"execution_start": 0,
|
||||
"execution_count": 10,
|
||||
},
|
||||
}
|
||||
)
|
||||
listIngestionSources = resp.get("listIngestionSources", {})
|
||||
sources.extend(listIngestionSources.get("ingestionSources", []))
|
||||
|
||||
cur_total = listIngestionSources.get("total", 0)
|
||||
if cur_total > count:
|
||||
start += count
|
||||
else:
|
||||
break
|
||||
return sources
|
||||
|
||||
def get_downstreams(
|
||||
self, entity_urn: str, max_downstreams: int = 3000
|
||||
) -> List[str]:
|
||||
start = 0
|
||||
count_per_page = 1000
|
||||
entities = []
|
||||
done = False
|
||||
total_downstreams = 0
|
||||
while not done:
|
||||
# if start > 0:
|
||||
# breakpoint()
|
||||
url_frag = f"/relationships?direction=INCOMING&types=List(DownstreamOf)&urn={urllib.parse.quote(entity_urn)}&count={count_per_page}&start={start}"
|
||||
url = f"{self.graph._gms_server}{url_frag}"
|
||||
response = self.graph._get_generic(url)
|
||||
if response["count"] > 0:
|
||||
relnships = response["relationships"]
|
||||
entities.extend([x["entity"] for x in relnships])
|
||||
start += count_per_page
|
||||
total_downstreams += response["count"]
|
||||
if start >= response["total"] or total_downstreams >= max_downstreams:
|
||||
done = True
|
||||
else:
|
||||
done = True
|
||||
return entities
|
||||
|
||||
def get_upstreams(self, entity_urn: str, max_upstreams: int = 3000) -> List[str]:
|
||||
start = 0
|
||||
count_per_page = 100
|
||||
entities = []
|
||||
done = False
|
||||
total_upstreams = 0
|
||||
while not done:
|
||||
url_frag = f"/relationships?direction=OUTGOING&types=List(DownstreamOf)&urn={urllib.parse.quote(entity_urn)}&count={count_per_page}&start={start}"
|
||||
url = f"{self.graph._gms_server}{url_frag}"
|
||||
response = self.graph._get_generic(url)
|
||||
if response["count"] > 0:
|
||||
relnships = response["relationships"]
|
||||
entities.extend([x["entity"] for x in relnships])
|
||||
start += count_per_page
|
||||
total_upstreams += response["count"]
|
||||
if start >= response["total"] or total_upstreams >= max_upstreams:
|
||||
done = True
|
||||
else:
|
||||
done = True
|
||||
return entities
|
||||
|
||||
def get_relationships(
|
||||
self, entity_urn: str, direction: str, relationship_types: List[str]
|
||||
) -> List[str]:
|
||||
url_frag = (
|
||||
f"/relationships?"
|
||||
f"direction={direction}"
|
||||
f"&types=List({','.join(relationship_types)})"
|
||||
f"&urn={urllib.parse.quote(entity_urn)}"
|
||||
)
|
||||
|
||||
url = f"{self.graph._gms_server}{url_frag}"
|
||||
response = self.graph._get_generic(url)
|
||||
if response["count"] > 0:
|
||||
relnships = response["relationships"]
|
||||
entities = [x["entity"] for x in relnships]
|
||||
return entities
|
||||
return []
|
||||
|
||||
def check_relationship(self, entity_urn, target_urn, relationship_type):
|
||||
url_frag = f"/relationships?direction=INCOMING&types=List({relationship_type})&urn={urllib.parse.quote(entity_urn)}"
|
||||
url = f"{self.graph._gms_server}{url_frag}"
|
||||
response = self.graph._get_generic(url)
|
||||
if response["count"] > 0:
|
||||
relnships = response["relationships"]
|
||||
entities = [x["entity"] for x in relnships]
|
||||
return target_urn in entities
|
||||
return False
|
||||
|
||||
def add_tags_to_dataset(
|
||||
self,
|
||||
entity_urn: str,
|
||||
dataset_tags: List[str],
|
||||
field_tags: Optional[Dict] = None,
|
||||
context: Optional[Dict] = None,
|
||||
) -> None:
|
||||
if field_tags is None:
|
||||
field_tags = {}
|
||||
dataset = DatasetPatchBuilder(entity_urn)
|
||||
for t in dataset_tags:
|
||||
dataset.add_tag(
|
||||
tag=TagAssociationClass(
|
||||
tag=t, context=json.dumps(context) if context else None
|
||||
)
|
||||
)
|
||||
|
||||
for field_path, tags in field_tags.items():
|
||||
field_builder = dataset.for_field(field_path=field_path)
|
||||
for tag in tags:
|
||||
field_builder.add_tag(
|
||||
tag=TagAssociationClass(
|
||||
tag=tag, context=json.dumps(context) if context else None
|
||||
)
|
||||
)
|
||||
|
||||
for mcp in dataset.build():
|
||||
self.graph.emit(mcp)
|
||||
|
||||
def add_terms_to_dataset(
|
||||
self,
|
||||
entity_urn: str,
|
||||
dataset_terms: List[str],
|
||||
field_terms: Optional[Dict] = None,
|
||||
context: Optional[Dict] = None,
|
||||
) -> None:
|
||||
if field_terms is None:
|
||||
field_terms = {}
|
||||
|
||||
dataset = DatasetPatchBuilder(urn=entity_urn)
|
||||
|
||||
for term in dataset_terms:
|
||||
dataset.add_term(
|
||||
GlossaryTermAssociationClass(
|
||||
term, context=json.dumps(context) if context else None
|
||||
)
|
||||
)
|
||||
|
||||
for field_path, terms in field_terms.items():
|
||||
field_builder = dataset.for_field(field_path=field_path)
|
||||
for term in terms:
|
||||
field_builder.add_term(
|
||||
GlossaryTermAssociationClass(
|
||||
term, context=json.dumps(context) if context else None
|
||||
)
|
||||
)
|
||||
|
||||
for mcp in dataset.build():
|
||||
self.graph.emit(mcp)
|
||||
|
||||
def get_corpuser_info(self, urn: str) -> Any:
|
||||
return self.get_untyped_aspect(
|
||||
urn, "corpUserInfo", "com.linkedin.identity.CorpUserInfo"
|
||||
)
|
||||
|
||||
def get_untyped_aspect(
|
||||
self,
|
||||
entity_urn: str,
|
||||
aspect: str,
|
||||
aspect_type_name: str,
|
||||
) -> Any:
|
||||
url = f"{self.graph._gms_server}/aspects/{urllib.parse.quote(entity_urn)}?aspect={aspect}&version=0"
|
||||
response = self.graph._session.get(url)
|
||||
if response.status_code == 404:
|
||||
# not found
|
||||
return None
|
||||
response.raise_for_status()
|
||||
response_json = response.json()
|
||||
aspect_json = response_json.get("aspect", {}).get(aspect_type_name)
|
||||
if aspect_json:
|
||||
return aspect_json
|
||||
else:
|
||||
raise OperationalError(
|
||||
f"Failed to find {aspect_type_name} in response {response_json}"
|
||||
)
|
||||
|
||||
def _get_entity_by_name(
|
||||
self,
|
||||
name: str,
|
||||
entity_type: str,
|
||||
indexed_fields: Optional[List[str]] = None,
|
||||
) -> Optional[str]:
|
||||
"""Retrieve an entity urn based on its name and type. Returns None if there is no match found"""
|
||||
if indexed_fields is None:
|
||||
indexed_fields = ["name", "displayName"]
|
||||
|
||||
filters = []
|
||||
if len(indexed_fields) > 1:
|
||||
for indexed_field in indexed_fields:
|
||||
filter_criteria = [
|
||||
{
|
||||
"field": indexed_field,
|
||||
"value": name,
|
||||
"condition": "EQUAL",
|
||||
}
|
||||
]
|
||||
filters.append({"and": filter_criteria})
|
||||
search_body = {
|
||||
"input": "*",
|
||||
"entity": entity_type,
|
||||
"start": 0,
|
||||
"count": 10,
|
||||
"orFilters": [filters],
|
||||
}
|
||||
else:
|
||||
search_body = {
|
||||
"input": "*",
|
||||
"entity": entity_type,
|
||||
"start": 0,
|
||||
"count": 10,
|
||||
"filter": {
|
||||
"or": [
|
||||
{
|
||||
"and": [
|
||||
{
|
||||
"field": indexed_fields[0],
|
||||
"value": name,
|
||||
"condition": "EQUAL",
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
}
|
||||
results: Dict = self.graph._post_generic(
|
||||
self.graph._search_endpoint, search_body
|
||||
)
|
||||
num_entities = results.get("value", {}).get("numEntities", 0)
|
||||
if num_entities > 1:
|
||||
logger.warning(
|
||||
f"Got {num_entities} results for {entity_type} {name}. Will return the first match."
|
||||
)
|
||||
entities_yielded: int = 0
|
||||
entities = []
|
||||
for x in results["value"]["entities"]:
|
||||
entities_yielded += 1
|
||||
logger.debug(f"yielding {x['entity']}")
|
||||
entities.append(x["entity"])
|
||||
return entities[0] if entities_yielded else None
|
||||
|
||||
def get_glossary_term_urn_by_name(self, term_name: str) -> Optional[str]:
|
||||
"""Retrieve a glossary term urn based on its name. Returns None if there is no match found"""
|
||||
|
||||
return self._get_entity_by_name(
|
||||
term_name, "glossaryTerm", indexed_fields=["name"]
|
||||
)
|
||||
|
||||
def get_glossary_node_urn_by_name(self, node_name: str) -> Optional[str]:
|
||||
"""Retrieve a glossary node urn based on its name. Returns None if there is no match found"""
|
||||
|
||||
return self._get_entity_by_name(node_name, "glossaryNode")
|
datahub-actions/src/datahub_actions/cli/__init__.py (new file, 13 lines)
@ -0,0 +1,13 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
datahub-actions/src/datahub_actions/cli/actions.py (new file, 191 lines)
@ -0,0 +1,191 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import logging
|
||||
import pathlib
|
||||
import signal
|
||||
import sys
|
||||
import time
|
||||
from typing import Any, List
|
||||
|
||||
import click
|
||||
from click_default_group import DefaultGroup
|
||||
from expandvars import UnboundVariable
|
||||
|
||||
import datahub_actions._version as actions_version
|
||||
from datahub.configuration.config_loader import load_config_file
|
||||
from datahub_actions.pipeline.pipeline import Pipeline
|
||||
from datahub_actions.pipeline.pipeline_manager import PipelineManager
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Instantiate a singleton instance of the Pipeline Manager.
|
||||
pipeline_manager = PipelineManager()
|
||||
|
||||
|
||||
def pipeline_config_to_pipeline(pipeline_config: dict) -> Pipeline:
|
||||
logger.debug(
|
||||
f"Attempting to create Actions Pipeline using config {pipeline_config.get('name')}"
|
||||
)
|
||||
try:
|
||||
return Pipeline.create(pipeline_config)
|
||||
except Exception as e:
|
||||
raise Exception(
|
||||
f"Failed to instantiate Actions Pipeline using config {pipeline_config.get('name')}: {e}"
|
||||
) from e
|
||||
|
||||
|
||||
@click.group(cls=DefaultGroup, default="run")
|
||||
def actions() -> None:
|
||||
"""Execute one or more Actions Pipelines"""
|
||||
pass
|
||||
|
||||
|
||||
def load_raw_config_file(config_file: pathlib.Path) -> dict:
|
||||
"""
|
||||
Load a config file as raw YAML/JSON without variable expansion.
|
||||
|
||||
Args:
|
||||
config_file: Path to the configuration file
|
||||
|
||||
Returns:
|
||||
dict: Raw configuration dictionary
|
||||
|
||||
Raises:
|
||||
Exception: If the file cannot be loaded or is invalid YAML/JSON
|
||||
"""
|
||||
try:
|
||||
with open(config_file, "r") as f:
|
||||
import yaml
|
||||
|
||||
return yaml.safe_load(f)
|
||||
except Exception as e:
|
||||
raise Exception(
|
||||
f"Failed to load raw configuration file {config_file}: {e}"
|
||||
) from e
|
||||
|
||||
|
||||
def is_pipeline_enabled(config: dict) -> bool:
|
||||
"""
|
||||
Check if a pipeline configuration is enabled.
|
||||
|
||||
Args:
|
||||
config: Raw configuration dictionary
|
||||
|
||||
Returns:
|
||||
bool: True if pipeline is enabled, False otherwise
|
||||
"""
|
||||
enabled = config.get("enabled", True)
|
||||
return not (enabled == "false" or enabled is False)
|
||||
|
||||
|
||||
@actions.command(
|
||||
name="run",
|
||||
context_settings=dict(
|
||||
ignore_unknown_options=True,
|
||||
allow_extra_args=True,
|
||||
),
|
||||
)
|
||||
@click.option("-c", "--config", required=True, type=str, multiple=True)
|
||||
@click.option("--debug/--no-debug", default=False)
|
||||
@click.pass_context
|
||||
def run(ctx: Any, config: List[str], debug: bool) -> None:
|
||||
"""Execute one or more Actions Pipelines"""
|
||||
|
||||
logger.info("DataHub Actions version: %s", actions_version.nice_version_name())
|
||||
|
||||
if debug:
|
||||
logging.getLogger().setLevel(logging.DEBUG)
|
||||
else:
|
||||
logging.getLogger().setLevel(logging.INFO)
|
||||
|
||||
pipelines: List[Pipeline] = []
|
||||
logger.debug("Creating Actions Pipelines...")
|
||||
|
||||
# Phase 1: Initial validation of configs
|
||||
valid_configs = []
|
||||
for pipeline_config in config:
|
||||
pipeline_config_file = pathlib.Path(pipeline_config)
|
||||
try:
|
||||
# First just load the raw config to check if it's enabled
|
||||
raw_config = load_raw_config_file(pipeline_config_file)
|
||||
|
||||
if not is_pipeline_enabled(raw_config):
|
||||
logger.warning(
|
||||
f"Skipping pipeline {raw_config.get('name') or pipeline_config} as it is not enabled"
|
||||
)
|
||||
continue
|
||||
|
||||
valid_configs.append(pipeline_config_file)
|
||||
|
||||
except Exception as e:
|
||||
if len(config) == 1:
|
||||
raise Exception(
|
||||
f"Failed to load raw configuration file {pipeline_config_file}"
|
||||
) from e
|
||||
logger.warning(
|
||||
f"Failed to load pipeline configuration! Skipping action config file {pipeline_config_file}...: {e}"
|
||||
)
|
||||
|
||||
# Phase 2: Full config loading and pipeline creation
|
||||
for pipeline_config_file in valid_configs:
|
||||
try:
|
||||
# Now load the full config with variable expansion
|
||||
pipeline_config_dict = load_config_file(pipeline_config_file)
|
||||
pipelines.append(pipeline_config_to_pipeline(pipeline_config_dict))
|
||||
except UnboundVariable as e:
|
||||
if len(valid_configs) == 1:
|
||||
raise Exception(
|
||||
"Failed to load action configuration. Unbound variable(s) provided in config YAML."
|
||||
) from e
|
||||
logger.warning(
|
||||
f"Failed to resolve variables in config file {pipeline_config_file}...: {e}"
|
||||
)
|
||||
continue
|
||||
|
||||
# Exit early if no valid pipelines were created
|
||||
if not pipelines:
|
||||
logger.error(
|
||||
f"No valid pipelines were started from {len(config)} config(s). "
|
||||
"Check that at least one pipeline is enabled and all required environment variables are set."
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
logger.debug("Starting Actions Pipelines")
|
||||
|
||||
# Start each pipeline
|
||||
for p in pipelines:
|
||||
pipeline_manager.start_pipeline(p.name, p)
|
||||
logger.info(f"Action Pipeline with name '{p.name}' is now running.")
|
||||
|
||||
# Now, run forever only if we have valid pipelines
|
||||
while True:
|
||||
time.sleep(5)
|
||||
|
||||
|
||||
@actions.command()
|
||||
def version() -> None:
|
||||
"""Print version number and exit."""
|
||||
click.echo(f"DataHub Actions version: {actions_version.nice_version_name()}")
|
||||
click.echo(f"Python version: {sys.version}")
|
||||
|
||||
|
||||
# Handle shutdown signal. (ctrl-c)
|
||||
def handle_shutdown(signum: int, frame: Any) -> None:
|
||||
logger.info("Stopping all running Action Pipelines...")
|
||||
pipeline_manager.stop_all()
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
signal.signal(signal.SIGINT, handle_shutdown)
|
datahub-actions/src/datahub_actions/entrypoints.py (new file, 140 lines)
@@ -0,0 +1,140 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import logging
|
||||
import platform
|
||||
import sys
|
||||
|
||||
import click
|
||||
import stackprinter
|
||||
from prometheus_client import start_http_server
|
||||
|
||||
import datahub_actions._version as actions_version
|
||||
from datahub.cli.env_utils import get_boolean_env_variable
|
||||
from datahub_actions.cli.actions import actions
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Configure logger.
|
||||
BASE_LOGGING_FORMAT = (
|
||||
"[%(asctime)s] %(levelname)-8s {%(name)s:%(lineno)d} - %(message)s"
|
||||
)
|
||||
logging.basicConfig(format=BASE_LOGGING_FORMAT)
|
||||
|
||||
MAX_CONTENT_WIDTH = 120
|
||||
|
||||
|
||||
@click.group(
|
||||
context_settings=dict(
|
||||
# Avoid truncation of help text.
|
||||
# See https://github.com/pallets/click/issues/486.
|
||||
max_content_width=MAX_CONTENT_WIDTH,
|
||||
)
|
||||
)
|
||||
@click.option(
|
||||
"--enable-monitoring",
|
||||
type=bool,
|
||||
is_flag=True,
|
||||
default=False,
|
||||
help="Enable prometheus monitoring endpoint. You can set the portnumber with --monitoring-port.",
|
||||
)
|
||||
@click.option(
|
||||
"--monitoring-port",
|
||||
type=int,
|
||||
default=8000,
|
||||
help="""Prometheus monitoring endpoint will be available on :<PORT>/metrics.
|
||||
To enable monitoring, use the --enable-monitoring flag.
|
||||
""",
|
||||
)
|
||||
@click.option("--debug/--no-debug", default=False)
|
||||
@click.version_option(
|
||||
version=actions_version.nice_version_name(),
|
||||
prog_name=actions_version.__package_name__,
|
||||
)
|
||||
@click.option(
|
||||
"-dl",
|
||||
"--detect-memory-leaks",
|
||||
type=bool,
|
||||
is_flag=True,
|
||||
default=False,
|
||||
help="Run memory leak detection.",
|
||||
)
|
||||
@click.pass_context
|
||||
def datahub_actions(
|
||||
ctx: click.Context,
|
||||
enable_monitoring: bool,
|
||||
monitoring_port: int,
|
||||
debug: bool,
|
||||
detect_memory_leaks: bool,
|
||||
) -> None:
|
||||
# Insulate 'datahub_actions' and all child loggers from inadvertent changes to the
|
||||
# root logger by the external site packages that we import.
|
||||
# (Eg: https://github.com/reata/sqllineage/commit/2df027c77ea0a8ea4909e471dcd1ecbf4b8aeb2f#diff-30685ea717322cd1e79c33ed8d37903eea388e1750aa00833c33c0c5b89448b3R11
|
||||
# changes the root logger's handler level to WARNING, causing any message below
|
||||
# WARNING level to be dropped after this module is imported, irrespective
|
||||
# of the logger's logging level! The lookml source was affected by this).
|
||||
|
||||
# 1. Create the 'datahub_actions' parent logger.
|
||||
datahub_logger = logging.getLogger("datahub_actions")
|
||||
# 2. Setup the stream handler with formatter.
|
||||
stream_handler = logging.StreamHandler()
|
||||
formatter = logging.Formatter(BASE_LOGGING_FORMAT)
|
||||
stream_handler.setFormatter(formatter)
|
||||
datahub_logger.addHandler(stream_handler)
|
||||
# 3. Turn off propagation to the root handler.
|
||||
datahub_logger.propagate = False
|
||||
# 4. Adjust log-levels.
|
||||
if debug or get_boolean_env_variable("DATAHUB_DEBUG", False):
|
||||
logging.getLogger().setLevel(logging.INFO)
|
||||
datahub_logger.setLevel(logging.DEBUG)
|
||||
else:
|
||||
logging.getLogger().setLevel(logging.WARNING)
|
||||
datahub_logger.setLevel(logging.INFO)
|
||||
if enable_monitoring:
|
||||
start_http_server(monitoring_port)
|
||||
# Setup the context for the memory_leak_detector decorator.
|
||||
ctx.ensure_object(dict)
|
||||
ctx.obj["detect_memory_leaks"] = detect_memory_leaks
|
||||
|
||||
|
||||
def main(**kwargs):
|
||||
# This wrapper prevents click from suppressing errors.
|
||||
try:
|
||||
sys.exit(datahub_actions(standalone_mode=False, **kwargs))
|
||||
except click.exceptions.Abort:
|
||||
# Click already automatically prints an abort message, so we can just exit.
|
||||
sys.exit(1)
|
||||
except click.ClickException as error:
|
||||
error.show()
|
||||
sys.exit(1)
|
||||
except Exception as exc:
|
||||
logger.error(
|
||||
stackprinter.format(
|
||||
exc,
|
||||
line_wrap=MAX_CONTENT_WIDTH,
|
||||
truncate_vals=10 * MAX_CONTENT_WIDTH,
|
||||
suppressed_paths=[r"lib/python.*/site-packages/click/"],
|
||||
show_vals=False,
|
||||
)
|
||||
)
|
||||
logger.info(
|
||||
f"DataHub Actions version: {actions_version.__version__} at {actions_version.__file__}"
|
||||
)
|
||||
logger.info(
|
||||
f"Python version: {sys.version} at {sys.executable} on {platform.platform()}"
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
datahub_actions.add_command(actions)
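# Illustrative sketch (not part of the original change-set): the same logger-isolation
# pattern applied to a hypothetical plugin package, so that a third-party import that
# reconfigures the root logger cannot silence this package's INFO-level logs.
import logging

plugin_logger = logging.getLogger("my_company.datahub_plugins")  # hypothetical logger name
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(BASE_LOGGING_FORMAT))
plugin_logger.addHandler(handler)
plugin_logger.propagate = False  # root-logger changes no longer affect this subtree
plugin_logger.setLevel(logging.INFO)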
|
datahub-actions/src/datahub_actions/event/__init__.py (new file, 13 lines)
@@ -0,0 +1,13 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
datahub-actions/src/datahub_actions/event/event.py (new file, 34 lines)
@@ -0,0 +1,34 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from abc import ABCMeta, abstractmethod
|
||||
|
||||
|
||||
class Event(metaclass=ABCMeta):
|
||||
"""
|
||||
A DataHub Event.
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
@abstractmethod
|
||||
def from_json(cls, json_str: str) -> "Event":
|
||||
"""
|
||||
Convert from json format into the event object.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def as_json(self) -> str:
|
||||
"""
|
||||
Convert the event into its JSON representation.
|
||||
"""
|
datahub-actions/src/datahub_actions/event/event_envelope.py (new file, 60 lines)
@@ -0,0 +1,60 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict
|
||||
|
||||
from datahub_actions.event.event import Event
|
||||
from datahub_actions.event.event_registry import event_registry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# An envelope wrapping a change event together with its type and arbitrary metadata.
|
||||
@dataclass
|
||||
class EventEnvelope:
|
||||
# The type of the event. This corresponds to the shape of the payload.
|
||||
event_type: str
|
||||
|
||||
# The event itself
|
||||
event: Event
|
||||
|
||||
# Arbitrary metadata about the event
|
||||
meta: Dict[str, Any]
|
||||
|
||||
# Convert an enveloped event to JSON representation
|
||||
def as_json(self) -> str:
|
||||
# Be careful about converting the meta bag, since anything can be put inside it at runtime.
|
||||
meta_json = None
|
||||
try:
|
||||
if self.meta is not None:
|
||||
meta_json = json.dumps(self.meta)
|
||||
except Exception:
|
||||
logger.warning(
|
||||
f"Failed to serialize meta field of EventEnvelope to json {self.meta}. Ignoring it during serialization."
|
||||
)
|
||||
result = f'{{ "event_type": "{self.event_type}", "event": {self.event.as_json()}, "meta": {meta_json if meta_json is not None else "null"} }}'
|
||||
return result
|
||||
|
||||
# Convert a json event envelope back into the object.
|
||||
@classmethod
|
||||
def from_json(cls, json_str: str) -> "EventEnvelope":
|
||||
json_obj = json.loads(json_str)
|
||||
event_type = json_obj["event_type"]
|
||||
event_class = event_registry.get(event_type)
|
||||
event = event_class.from_json(json.dumps(json_obj["event"]))
|
||||
meta = json_obj["meta"] if "meta" in json_obj else {}
|
||||
return EventEnvelope(event_type=event_type, event=event, meta=meta)
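# Illustrative sketch (not part of the original change-set): serializing an envelope
# around a hypothetical Event subclass (see the PingEvent sketch above). "PingEvent_v1"
# is an illustrative type name and is not registered, so from_json is not exercised here.
envelope = EventEnvelope(event_type="PingEvent_v1", event=PingEvent("hi"), meta={"source": "sketch"})
print(envelope.as_json())  # emits a JSON string combining event_type, the event payload, and meta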
|
datahub-actions/src/datahub_actions/event/event_registry.py (new file, 93 lines)
@@ -0,0 +1,93 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
|
||||
from datahub.ingestion.api.registry import PluginRegistry
|
||||
from datahub.metadata.schema_classes import (
|
||||
EntityChangeEventClass,
|
||||
MetadataChangeLogClass,
|
||||
)
|
||||
from datahub_actions.event.event import Event
|
||||
|
||||
# TODO: Figure out where to put these.
|
||||
# TODO: Perform runtime validation based on the event types found in the registry.
|
||||
|
||||
|
||||
# A DataHub Event representing a Metadata Change Log Event.
|
||||
# See MetadataChangeLogEvent class object for full field set.
|
||||
class MetadataChangeLogEvent(MetadataChangeLogClass, Event):
|
||||
@classmethod
|
||||
def from_class(cls, clazz: MetadataChangeLogClass) -> "MetadataChangeLogEvent":
|
||||
instance = cls._construct({})
|
||||
instance._restore_defaults()
|
||||
# Shallow map inner dictionaries.
|
||||
instance._inner_dict = clazz._inner_dict
|
||||
return instance
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, json_str: str) -> "Event":
|
||||
json_obj = json.loads(json_str)
|
||||
return cls.from_class(cls.from_obj(json_obj))
|
||||
|
||||
def as_json(self) -> str:
|
||||
return json.dumps(self.to_obj())
|
||||
|
||||
|
||||
# A DataHub Event representing an Entity Change Event.
|
||||
# See EntityChangeEventClass class object for full field set.
|
||||
class EntityChangeEvent(EntityChangeEventClass, Event):
|
||||
@classmethod
|
||||
def from_class(cls, clazz: EntityChangeEventClass) -> "EntityChangeEvent":
|
||||
instance = cls._construct({})
|
||||
instance._restore_defaults()
|
||||
# Shallow map inner dictionaries.
|
||||
instance._inner_dict = clazz._inner_dict
|
||||
return instance
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, json_str: str) -> "EntityChangeEvent":
|
||||
json_obj = json.loads(json_str)
|
||||
|
||||
# Remove parameters from json_obj and add it to _inner_dict later. This hack exists because EntityChangeEventClass does not support "AnyRecord".
|
||||
parameters = json_obj.pop("parameters", None)
|
||||
|
||||
event = cls.from_class(cls.from_obj(json_obj))
|
||||
|
||||
# Hack: Since parameters is an "AnyRecord" (arbitrary json) we have to insert into the underlying map directly
|
||||
# to avoid validation at object creation time. This means the reader is responsible for understanding the serialized JSON format, which
|
||||
# is simply PDL serialized to JSON.
|
||||
if parameters:
|
||||
event._inner_dict["__parameters_json"] = parameters
|
||||
|
||||
return event
|
||||
|
||||
def as_json(self) -> str:
|
||||
json_obj = self.to_obj()
|
||||
# Re-insert parameters. This hack exists because EntityChangeEventClass does not support "AnyRecord".
|
||||
if "__parameters_json" in self._inner_dict:
|
||||
json_obj["parameters"] = self._inner_dict["__parameters_json"]
|
||||
return json.dumps(json_obj)
|
||||
|
||||
|
||||
# Standard Event Types for easy reference.
|
||||
ENTITY_CHANGE_EVENT_V1_TYPE = "EntityChangeEvent_v1"
|
||||
METADATA_CHANGE_LOG_EVENT_V1_TYPE = "MetadataChangeLogEvent_v1"
|
||||
|
||||
# Lightweight Event Registry
|
||||
event_registry = PluginRegistry[Event]()
|
||||
|
||||
# Register standard event library. Each type can be considered a separate "stream" / "topic"
|
||||
event_registry.register(METADATA_CHANGE_LOG_EVENT_V1_TYPE, MetadataChangeLogEvent)
|
||||
event_registry.register(ENTITY_CHANGE_EVENT_V1_TYPE, EntityChangeEvent)
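# Illustrative sketch (not part of the original change-set): resolving an event class
# from the registry by its well-known type name, as EventEnvelope.from_json does
# internally (assuming PluginRegistry.get returns the registered class).
assert event_registry.get(ENTITY_CHANGE_EVENT_V1_TYPE) is EntityChangeEvent
assert event_registry.get(METADATA_CHANGE_LOG_EVENT_V1_TYPE) is MetadataChangeLogEvent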
|
datahub-actions/src/datahub_actions/pipeline/__init__.py (new file, 13 lines)
@@ -0,0 +1,13 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
datahub-actions/src/datahub_actions/pipeline/pipeline.py (new file, 323 lines)
@@ -0,0 +1,323 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import logging
|
||||
import os
|
||||
from typing import List, Optional
|
||||
|
||||
from datahub_actions.action.action import Action
|
||||
from datahub_actions.event.event_envelope import EventEnvelope
|
||||
from datahub_actions.pipeline.pipeline_config import FailureMode, PipelineConfig
|
||||
from datahub_actions.pipeline.pipeline_stats import PipelineStats
|
||||
from datahub_actions.pipeline.pipeline_util import (
|
||||
create_action,
|
||||
create_action_context,
|
||||
create_event_source,
|
||||
create_filter_transformer,
|
||||
create_transformer,
|
||||
normalize_directory_name,
|
||||
)
|
||||
from datahub_actions.source.event_source import EventSource
|
||||
from datahub_actions.transform.transformer import Transformer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Defaults for the location where failed events will be written.
|
||||
DEFAULT_RETRY_COUNT = 0 # Do not retry unless instructed.
|
||||
DEFAULT_FAILED_EVENTS_DIR = "/tmp/logs/datahub/actions"
|
||||
DEFAULT_FAILED_EVENTS_FILE_NAME = "failed_events.log" # Not currently configurable.
|
||||
DEFAULT_FAILURE_MODE = FailureMode.CONTINUE
|
||||
|
||||
|
||||
class PipelineException(Exception):
|
||||
"""
|
||||
An exception thrown when a Pipeline encounters an unrecoverable situation.
|
||||
Mainly a placeholder for now.
|
||||
"""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class Pipeline:
|
||||
"""
|
||||
A Pipeline is responsible for coordinating execution of a single DataHub Action.
|
||||
|
||||
This responsibility includes:
|
||||
|
||||
- sourcing events from an Event Source
|
||||
- executing a configurable chain of Transformers
|
||||
- invoking an Action with the final Event
|
||||
- acknowledging the processing of an Event with the Event Source
|
||||
|
||||
Additionally, a Pipeline supports the following notable capabilities:
|
||||
|
||||
- Configurable retries of event processing in cases of component failure
|
||||
- Configurable dead letter queue
|
||||
- Capturing basic statistics about each Pipeline component
|
||||
- At-will start and stop of an individual pipeline
|
||||
|
||||
"""
|
||||
|
||||
name: str
|
||||
source: EventSource
|
||||
transforms: List[Transformer] = []
|
||||
action: Action
|
||||
|
||||
# Whether the Pipeline has been requested to shut down
|
||||
_shutdown: bool = False
|
||||
|
||||
# Pipeline statistics
|
||||
_stats: PipelineStats = PipelineStats()
|
||||
|
||||
# Options
|
||||
_retry_count: int = DEFAULT_RETRY_COUNT # Number of times a single event should be retried in case of processing error.
|
||||
_failure_mode: FailureMode = DEFAULT_FAILURE_MODE
|
||||
_failed_events_dir: str = DEFAULT_FAILED_EVENTS_DIR # The top-level path where failed events will be logged.
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
name: str,
|
||||
source: EventSource,
|
||||
transforms: List[Transformer],
|
||||
action: Action,
|
||||
retry_count: Optional[int],
|
||||
failure_mode: Optional[FailureMode],
|
||||
failed_events_dir: Optional[str],
|
||||
) -> None:
|
||||
self.name = name
|
||||
self.source = source
|
||||
self.transforms = transforms
|
||||
self.action = action
|
||||
|
||||
if retry_count is not None:
|
||||
self._retry_count = retry_count
|
||||
if failure_mode is not None:
|
||||
self._failure_mode = failure_mode
|
||||
if failed_events_dir is not None:
|
||||
self._failed_events_dir = failed_events_dir
|
||||
self._init_failed_events_dir()
|
||||
|
||||
@classmethod
|
||||
def create(cls, config_dict: dict) -> "Pipeline":
|
||||
# Bind config
|
||||
config = PipelineConfig.parse_obj(config_dict)
|
||||
|
||||
if not config.enabled:
|
||||
raise Exception(
|
||||
"Pipeline is disabled, but create method was called unexpectedly."
|
||||
)
|
||||
|
||||
# Create Context
|
||||
ctx = create_action_context(config.name, config.datahub)
|
||||
|
||||
# Create Event Source
|
||||
event_source = create_event_source(config.source, ctx)
|
||||
|
||||
# Create Transforms
|
||||
transforms = []
|
||||
if config.filter is not None:
|
||||
transforms.append(create_filter_transformer(config.filter, ctx))
|
||||
|
||||
if config.transform is not None:
|
||||
for transform_config in config.transform:
|
||||
transforms.append(create_transformer(transform_config, ctx))
|
||||
|
||||
# Create Action
|
||||
action = create_action(config.action, ctx)
|
||||
|
||||
# Finally, create Pipeline.
|
||||
return cls(
|
||||
config.name,
|
||||
event_source,
|
||||
transforms,
|
||||
action,
|
||||
config.options.retry_count if config.options else None,
|
||||
config.options.failure_mode if config.options else None,
|
||||
config.options.failed_events_dir if config.options else None,
|
||||
)
|
||||
|
||||
async def start(self) -> None:
|
||||
"""
|
||||
Start the action pipeline asynchronously. This method is non-blocking.
|
||||
"""
|
||||
self.run()
|
||||
|
||||
def run(self) -> None:
|
||||
"""
|
||||
Run the action pipeline synchronously. This method is blocking.
|
||||
Raises an instance of PipelineException if an unrecoverable pipeline failure occurs.
|
||||
"""
|
||||
self._stats.mark_start()
|
||||
|
||||
# First, source the events.
|
||||
enveloped_events = self.source.events()
|
||||
for enveloped_event in enveloped_events:
|
||||
# Then, process the event.
|
||||
retval = self._process_event(enveloped_event)
|
||||
|
||||
# For legacy users w/o selective ack support, convert
|
||||
# None to True, i.e. always commit.
|
||||
if retval is None:
|
||||
retval = True
|
||||
|
||||
# Finally, ack the event.
|
||||
self._ack_event(enveloped_event, retval)
|
||||
|
||||
def stop(self) -> None:
|
||||
"""
|
||||
Stops a running action pipeline.
|
||||
"""
|
||||
logger.debug(f"Preparing to stop Actions Pipeline with name {self.name}")
|
||||
self._shutdown = True
|
||||
self._failed_events_fd.close()
|
||||
self.source.close()
|
||||
self.action.close()
|
||||
|
||||
def stats(self) -> PipelineStats:
|
||||
"""
|
||||
Returns basic statistics about the Pipeline run.
|
||||
"""
|
||||
return self._stats
|
||||
|
||||
def _process_event(self, enveloped_event: EventEnvelope) -> Optional[bool]:
|
||||
# Attempt to process the incoming event, with retry.
|
||||
curr_attempt = 1
|
||||
max_attempts = self._retry_count + 1
|
||||
retval = None
|
||||
while curr_attempt <= max_attempts:
|
||||
try:
|
||||
# First, transform the event.
|
||||
transformed_event = self._execute_transformers(enveloped_event)
|
||||
|
||||
# Then, invoke the action if the event is non-null.
|
||||
if transformed_event is not None:
|
||||
retval = self._execute_action(transformed_event)
|
||||
|
||||
# Short circuit - processing has succeeded.
|
||||
return retval
|
||||
except Exception:
|
||||
logger.exception(
|
||||
f"Caught exception while attempting to process event. Attempt {curr_attempt}/{max_attempts} event type: {enveloped_event.event_type}, pipeline name: {self.name}"
|
||||
)
|
||||
curr_attempt = curr_attempt + 1
|
||||
|
||||
logger.error(
|
||||
f"Failed to process event after {self._retry_count} retries. event type: {enveloped_event.event_type}, pipeline name: {self.name}. Handling failure..."
|
||||
)
|
||||
|
||||
# Increment failed event count.
|
||||
self._stats.increment_failed_event_count()
|
||||
|
||||
# Finally, handle the failure
|
||||
self._handle_failure(enveloped_event)
|
||||
|
||||
return retval
|
||||
|
||||
def _execute_transformers(
|
||||
self, enveloped_event: EventEnvelope
|
||||
) -> Optional[EventEnvelope]:
|
||||
curr_event = enveloped_event
|
||||
# Iterate through all transformers, sequentially apply them to the result of the previous.
|
||||
for transformer in self.transforms:
|
||||
# Increment stats
|
||||
self._stats.increment_transformer_processed_count(transformer)
|
||||
|
||||
# Transform the event
|
||||
transformed_event = self._execute_transformer(curr_event, transformer)
|
||||
|
||||
# Process result
|
||||
if transformed_event is None:
|
||||
# If the transformer has filtered the event, short circuit.
|
||||
self._stats.increment_transformer_filtered_count(transformer)
|
||||
return None
|
||||
# Otherwise, set the result to the transformed event.
|
||||
curr_event = transformed_event
|
||||
|
||||
# Return the final transformed event.
|
||||
return curr_event
|
||||
|
||||
def _execute_transformer(
|
||||
self, enveloped_event: EventEnvelope, transformer: Transformer
|
||||
) -> Optional[EventEnvelope]:
|
||||
try:
|
||||
return transformer.transform(enveloped_event)
|
||||
except Exception as e:
|
||||
self._stats.increment_transformer_exception_count(transformer)
|
||||
raise PipelineException(
|
||||
f"Caught exception while executing Transformer with name {type(transformer).__name__}"
|
||||
) from e
|
||||
|
||||
def _execute_action(self, enveloped_event: EventEnvelope) -> Optional[bool]:
|
||||
try:
|
||||
retval = self.action.act(enveloped_event)
|
||||
self._stats.increment_action_success_count()
|
||||
return retval
|
||||
except Exception as e:
|
||||
self._stats.increment_action_exception_count()
|
||||
raise PipelineException(
|
||||
f"Caught exception while executing Action with type {type(self.action).__name__}"
|
||||
) from e
|
||||
|
||||
def _ack_event(self, enveloped_event: EventEnvelope, processed: bool) -> None:
|
||||
try:
|
||||
self.source.ack(enveloped_event, processed)
|
||||
self._stats.increment_success_count()
|
||||
except Exception:
|
||||
self._stats.increment_failed_ack_count()
|
||||
logger.exception(
|
||||
f"Caught exception while attempting to ack successfully processed event. event type: {enveloped_event.event_type}, pipeline name: {self.name}",
|
||||
)
|
||||
logger.debug(f"Failed to ack event: {enveloped_event}")
|
||||
|
||||
def _handle_failure(self, enveloped_event: EventEnvelope) -> None:
|
||||
# First, always save the failed event to a file. Useful for investigation.
|
||||
self._append_failed_event_to_file(enveloped_event)
|
||||
if self._failure_mode == FailureMode.THROW:
|
||||
raise PipelineException("Failed to process event after maximum retries.")
|
||||
elif self._failure_mode == FailureMode.CONTINUE:
|
||||
# Simply return, nothing left to do.
|
||||
pass
|
||||
|
||||
def _append_failed_event_to_file(self, enveloped_event: EventEnvelope) -> None:
|
||||
# First, convert the event to JSON.
|
||||
try:
|
||||
json = enveloped_event.as_json()
|
||||
# Then append to failed events file.
|
||||
self._failed_events_fd.write(json + "\n")
|
||||
self._failed_events_fd.flush()
|
||||
except Exception as e:
|
||||
# This is a serious issue: if we do not handle it, we may lose an event altogether.
|
||||
# Raise an exception to ensure this issue is reported to the operator.
|
||||
raise PipelineException(
|
||||
f"Failed to log failed event to file! {enveloped_event}"
|
||||
) from e
|
||||
|
||||
def _init_failed_events_dir(self) -> None:
|
||||
# Create a directory for failed events from this Actions pipeline.
|
||||
failed_events_dir = os.path.join(
|
||||
self._failed_events_dir, normalize_directory_name(self.name)
|
||||
)
|
||||
try:
|
||||
os.makedirs(failed_events_dir, exist_ok=True)
|
||||
|
||||
failed_events_file_name = os.path.join(
|
||||
failed_events_dir, DEFAULT_FAILED_EVENTS_FILE_NAME
|
||||
)
|
||||
self._failed_events_fd = open(failed_events_file_name, "a")
|
||||
except Exception as e:
|
||||
logger.debug(e)
|
||||
raise PipelineException(
|
||||
f"Caught exception while attempting to create failed events log file at path {failed_events_dir}. Please check your file system permissions."
|
||||
) from e
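# Illustrative sketch (not part of the original change-set): creating a Pipeline from a
# plain config dict and running it. The "kafka" source and "hello_world" action type
# names are assumptions about what is registered, and the connection details are placeholders.
pipeline = Pipeline.create(
    {
        "name": "hello_world_pipeline",
        "source": {
            "type": "kafka",
            "config": {"connection": {"bootstrap": "localhost:9092"}},
        },
        "action": {"type": "hello_world"},
    }
)
pipeline.run()  # blocks; call stop() from another thread (e.g. via PipelineManager) to exit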
|
@@ -0,0 +1,74 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from datahub.configuration import ConfigModel
|
||||
from datahub.configuration.common import ConfigEnum
|
||||
from datahub.ingestion.graph.client import DatahubClientConfig
|
||||
|
||||
|
||||
class FailureMode(ConfigEnum):
|
||||
# Log the failed event to the failed events log. Then throw a pipeline exception to stop the pipeline.
|
||||
THROW = "THROW"
|
||||
# Log the failed event to the failed events log. Then continue processing the event stream.
|
||||
CONTINUE = "CONTINUE"
|
||||
|
||||
|
||||
class SourceConfig(ConfigModel):
|
||||
type: str
|
||||
config: Optional[Dict[str, Any]] = None
|
||||
|
||||
|
||||
class TransformConfig(ConfigModel):
|
||||
type: str
|
||||
config: Optional[Dict[str, Any]] = None
|
||||
|
||||
|
||||
class FilterConfig(ConfigModel):
|
||||
event_type: Union[str, List[str]]
|
||||
event: Optional[Dict[str, Any]] = None
|
||||
|
||||
|
||||
class ActionConfig(ConfigModel):
|
||||
type: str
|
||||
config: Optional[dict]
|
||||
|
||||
|
||||
class PipelineOptions(BaseModel):
|
||||
retry_count: Optional[int] = None
|
||||
failure_mode: Optional[FailureMode] = None
|
||||
failed_events_dir: Optional[str] = (
|
||||
None # The path where failed events should be logged.
|
||||
)
|
||||
|
||||
|
||||
class PipelineConfig(ConfigModel):
|
||||
"""
|
||||
Configuration required to create a new Actions Pipeline.
|
||||
|
||||
This exactly matches the structure of the YAML file used
|
||||
to configure a Pipeline.
|
||||
"""
|
||||
|
||||
name: str
|
||||
enabled: bool = True
|
||||
source: SourceConfig
|
||||
filter: Optional[FilterConfig] = None
|
||||
transform: Optional[List[TransformConfig]] = None
|
||||
action: ActionConfig
|
||||
datahub: Optional[DatahubClientConfig] = None
|
||||
options: Optional[PipelineOptions] = None
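# Illustrative sketch (not part of the original change-set): the dict equivalent of the
# YAML structure that PipelineConfig binds; the source and action type names are placeholders.
config = PipelineConfig.parse_obj(
    {
        "name": "example",
        "source": {"type": "kafka"},
        "filter": {"event_type": "EntityChangeEvent_v1"},
        "action": {"type": "hello_world"},
        "options": {"retry_count": 3, "failure_mode": "CONTINUE"},
    }
)
assert config.enabled is True and config.options.failure_mode == FailureMode.CONTINUE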
|
@@ -0,0 +1,31 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
from datahub_actions.api.action_graph import AcrylDataHubGraph
|
||||
|
||||
|
||||
@dataclass
|
||||
class PipelineContext:
|
||||
"""
|
||||
Context which is provided to each component in a Pipeline.
|
||||
"""
|
||||
|
||||
# The name of the running pipeline.
|
||||
pipeline_name: str
|
||||
|
||||
# An instance of a DataHub client.
|
||||
graph: Optional[AcrylDataHubGraph]
|
datahub-actions/src/datahub_actions/pipeline/pipeline_manager.py (new file, 105 lines)
@@ -0,0 +1,105 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import logging
|
||||
import traceback
|
||||
from dataclasses import dataclass
|
||||
from threading import Thread
|
||||
from typing import Dict
|
||||
|
||||
from datahub_actions.pipeline.pipeline import Pipeline, PipelineException
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class PipelineSpec:
|
||||
# The pipeline name
|
||||
name: str
|
||||
|
||||
# The pipeline
|
||||
pipeline: Pipeline
|
||||
|
||||
# The thread which is executing the pipeline.
|
||||
thread: Thread
|
||||
|
||||
|
||||
# Run a pipeline in blocking fashion
|
||||
# TODO: Exit process on failure of single pipeline.
|
||||
def run_pipeline(pipeline: Pipeline) -> None:
|
||||
try:
|
||||
pipeline.run()
|
||||
except PipelineException:
|
||||
logger.error(
|
||||
f"Caught exception while running pipeline with name {pipeline.name}: {traceback.format_exc(limit=3)}"
|
||||
)
|
||||
pipeline.stop()
|
||||
logger.debug(f"Thread for pipeline with name {pipeline.name} has stopped.")
|
||||
|
||||
|
||||
# A manager of multiple Action Pipelines.
|
||||
# This class manages 1 thread per pipeline registered.
|
||||
class PipelineManager:
|
||||
# A catalog of all the currently executing Action Pipelines.
|
||||
pipeline_registry: Dict[str, PipelineSpec] = {}
|
||||
|
||||
def __init__(self) -> None:
|
||||
pass
|
||||
|
||||
# Start a new Action Pipeline.
|
||||
def start_pipeline(self, name: str, pipeline: Pipeline) -> None:
|
||||
logger.debug(f"Attempting to start pipeline with name {name}...")
|
||||
if name not in self.pipeline_registry:
|
||||
thread = Thread(target=run_pipeline, args=([pipeline]))
|
||||
thread.start()
|
||||
spec = PipelineSpec(name, pipeline, thread)
|
||||
self.pipeline_registry[name] = spec
|
||||
logger.debug(f"Started pipeline with name {name}.")
|
||||
else:
|
||||
raise Exception(f"Pipeline with name {name} is already running.")
|
||||
|
||||
# Stop a running Action Pipeline.
|
||||
def stop_pipeline(self, name: str) -> None:
|
||||
logger.debug(f"Attempting to stop pipeline with name {name}...")
|
||||
if name in self.pipeline_registry:
|
||||
# First, stop the pipeline.
|
||||
try:
|
||||
pipeline_spec = self.pipeline_registry[name]
|
||||
pipeline_spec.pipeline.stop()
|
||||
pipeline_spec.thread.join() # Wait for the pipeline thread to terminate.
|
||||
logger.info(f"Actions Pipeline with name '{name}' has been stopped.")
|
||||
pipeline_spec.pipeline.stats().pretty_print_summary(
|
||||
name
|
||||
) # Print the pipeline's statistics.
|
||||
del self.pipeline_registry[name]
|
||||
except Exception as e:
|
||||
# Failed to stop a pipeline - this is a critical issue, we should avoid starting another action of the same type
|
||||
# until this pipeline is confirmed killed.
|
||||
logger.error(
|
||||
f"Caught exception while attempting to stop pipeline with name {name}: {traceback.format_exc(limit=3)}"
|
||||
)
|
||||
raise Exception(
|
||||
f"Caught exception while attempting to stop pipeline with name {name}."
|
||||
) from e
|
||||
else:
|
||||
raise Exception(f"No pipeline with name {name} found.")
|
||||
|
||||
# Stop all running pipelines.
|
||||
def stop_all(self) -> None:
|
||||
logger.debug("Attempting to stop all running pipelines...")
|
||||
# Stop each running pipeline.
|
||||
names = list(self.pipeline_registry.keys()).copy()
|
||||
for name in names:
|
||||
self.stop_pipeline(name)
|
||||
logger.debug("Successfully stop all running pipelines.")
|
datahub-actions/src/datahub_actions/pipeline/pipeline_stats.py (new file, 131 lines)
@@ -0,0 +1,131 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import datetime
|
||||
import json
|
||||
from time import time
|
||||
from typing import Dict
|
||||
|
||||
import click
|
||||
|
||||
from datahub_actions.action.action_stats import ActionStats
|
||||
from datahub_actions.pipeline.pipeline_util import get_transformer_name
|
||||
from datahub_actions.transform.transformer import Transformer
|
||||
from datahub_actions.transform.transformer_stats import TransformerStats
|
||||
|
||||
|
||||
# Class that stores running statistics for a single Actions Pipeline.
|
||||
class PipelineStats:
|
||||
# Timestamp in milliseconds when the pipeline was launched.
|
||||
started_at: int
|
||||
|
||||
# Number of events that failed processing even after retry.
|
||||
failed_event_count: int = 0
|
||||
|
||||
# Number of events that failed when "ack" was invoked.
|
||||
failed_ack_count: int = 0
|
||||
|
||||
# Top-level number of succeeded processing executions.
|
||||
success_count: int = 0
|
||||
|
||||
# Transformer Stats
|
||||
transformer_stats: Dict[str, TransformerStats] = {}
|
||||
|
||||
# Action Stats
|
||||
action_stats: ActionStats = ActionStats()
|
||||
|
||||
def mark_start(self) -> None:
|
||||
self.started_at = int(time() * 1000)
|
||||
|
||||
def increment_failed_event_count(self) -> None:
|
||||
self.failed_event_count = self.failed_event_count + 1
|
||||
|
||||
def increment_failed_ack_count(self) -> None:
|
||||
self.failed_ack_count = self.failed_ack_count + 1
|
||||
|
||||
def increment_success_count(self) -> None:
|
||||
self.success_count = self.success_count + 1
|
||||
|
||||
def increment_transformer_exception_count(self, transformer: Transformer) -> None:
|
||||
transformer_name = get_transformer_name(transformer)
|
||||
if transformer_name not in self.transformer_stats:
|
||||
self.transformer_stats[transformer_name] = TransformerStats()
|
||||
self.transformer_stats[transformer_name].increment_exception_count()
|
||||
|
||||
def increment_transformer_processed_count(self, transformer: Transformer) -> None:
|
||||
transformer_name = get_transformer_name(transformer)
|
||||
if transformer_name not in self.transformer_stats:
|
||||
self.transformer_stats[transformer_name] = TransformerStats()
|
||||
self.transformer_stats[transformer_name].increment_processed_count()
|
||||
|
||||
def increment_transformer_filtered_count(self, transformer: Transformer) -> None:
|
||||
transformer_name = get_transformer_name(transformer)
|
||||
if transformer_name not in self.transformer_stats:
|
||||
self.transformer_stats[transformer_name] = TransformerStats()
|
||||
self.transformer_stats[transformer_name].increment_filtered_count()
|
||||
|
||||
def increment_action_exception_count(self) -> None:
|
||||
self.action_stats.increment_exception_count()
|
||||
|
||||
def increment_action_success_count(self) -> None:
|
||||
self.action_stats.increment_success_count()
|
||||
|
||||
def get_started_at(self) -> int:
|
||||
return self.started_at
|
||||
|
||||
def get_failed_event_count(self) -> int:
|
||||
return self.failed_event_count
|
||||
|
||||
def get_failed_ack_count(self) -> int:
|
||||
return self.failed_ack_count
|
||||
|
||||
def get_success_count(self) -> int:
|
||||
return self.success_count
|
||||
|
||||
def get_transformer_stats(self, transformer: Transformer) -> TransformerStats:
|
||||
transformer_name = get_transformer_name(transformer)
|
||||
if transformer_name not in self.transformer_stats:
|
||||
self.transformer_stats[transformer_name] = TransformerStats()
|
||||
return self.transformer_stats[transformer_name]
|
||||
|
||||
def get_action_stats(self) -> ActionStats:
|
||||
return self.action_stats
|
||||
|
||||
def as_string(self) -> str:
|
||||
return json.dumps(self.__dict__, indent=4, sort_keys=True)
|
||||
|
||||
def pretty_print_summary(self, name: str) -> None:
|
||||
curr_time = int(time() * 1000)
|
||||
click.echo()
|
||||
click.secho(f"Pipeline Report for {name}", bold=True, fg="blue")
|
||||
click.echo()
|
||||
click.echo(
|
||||
f"Started at: {datetime.datetime.fromtimestamp(self.started_at / 1000.0)} (Local Time)"
|
||||
)
|
||||
click.echo(f"Duration: {(curr_time - self.started_at) / 1000.0}s")
|
||||
click.echo()
|
||||
click.secho("Pipeline statistics", bold=True)
|
||||
click.echo()
|
||||
click.echo(self.as_string())
|
||||
click.echo()
|
||||
if len(self.transformer_stats.keys()) > 0:
|
||||
click.secho("Transformer statistics", bold=True)
|
||||
for key in self.transformer_stats:
|
||||
click.echo()
|
||||
click.echo(f"{key}: {self.transformer_stats[key].as_string()}")
|
||||
click.echo()
|
||||
click.secho("Action statistics", bold=True)
|
||||
click.echo()
|
||||
click.echo(self.action_stats.as_string())
|
||||
click.echo()
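# Illustrative sketch (not part of the original change-set): recording a couple of
# top-level outcomes and printing the summary report for a hypothetical pipeline name.
stats = PipelineStats()
stats.mark_start()
stats.increment_success_count()
stats.increment_failed_ack_count()
stats.pretty_print_summary("example-pipeline")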
|
datahub-actions/src/datahub_actions/pipeline/pipeline_util.py (new file, 156 lines)
@@ -0,0 +1,156 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
|
||||
from datahub_actions.action.action import Action
|
||||
from datahub_actions.action.action_registry import action_registry
|
||||
from datahub_actions.api.action_graph import AcrylDataHubGraph
|
||||
from datahub_actions.pipeline.pipeline_config import (
|
||||
ActionConfig,
|
||||
FilterConfig,
|
||||
SourceConfig,
|
||||
TransformConfig,
|
||||
)
|
||||
from datahub_actions.pipeline.pipeline_context import PipelineContext
|
||||
from datahub_actions.plugin.transform.filter.filter_transformer import (
|
||||
FilterTransformer,
|
||||
FilterTransformerConfig,
|
||||
)
|
||||
from datahub_actions.source.event_source import EventSource
|
||||
from datahub_actions.source.event_source_registry import event_source_registry
|
||||
from datahub_actions.transform.transformer import Transformer
|
||||
from datahub_actions.transform.transformer_registry import transformer_registry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def create_action_context(
|
||||
pipeline_name: str, datahub_config: Optional[DatahubClientConfig]
|
||||
) -> PipelineContext:
|
||||
return PipelineContext(
|
||||
pipeline_name,
|
||||
(
|
||||
AcrylDataHubGraph(DataHubGraph(datahub_config))
|
||||
if datahub_config is not None
|
||||
else None
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def create_event_source(
|
||||
source_config: SourceConfig, ctx: PipelineContext
|
||||
) -> EventSource:
|
||||
event_source_type = source_config.type
|
||||
event_source_class = event_source_registry.get(event_source_type)
|
||||
event_source_instance = None
|
||||
try:
|
||||
logger.debug(
|
||||
f"Attempting to instantiate new Event Source of type {source_config.type}.."
|
||||
)
|
||||
event_source_config = (
|
||||
source_config.config if source_config.config is not None else {}
|
||||
)
|
||||
event_source_instance = event_source_class.create(event_source_config, ctx)
|
||||
except Exception as e:
|
||||
raise Exception(
|
||||
f"Caught exception while attempting to instantiate Event Source of type {source_config.type}"
|
||||
) from e
|
||||
|
||||
if event_source_instance is None:
|
||||
raise Exception(
|
||||
f"Failed to create Event Source with type {event_source_type}. Event Source create method returned 'None'."
|
||||
)
|
||||
|
||||
return event_source_instance
|
||||
|
||||
|
||||
def create_filter_transformer(
|
||||
filter_config: FilterConfig, ctx: PipelineContext
|
||||
) -> Transformer:
|
||||
try:
|
||||
logger.debug("Attempting to instantiate filter transformer..")
|
||||
filter_transformer_config = FilterTransformerConfig(
|
||||
event_type=filter_config.event_type, event=filter_config.event
|
||||
)
|
||||
return FilterTransformer(filter_transformer_config)
|
||||
except Exception as e:
|
||||
raise Exception(
|
||||
"Caught exception while attempting to instantiate Filter transformer"
|
||||
) from e
|
||||
|
||||
|
||||
def create_transformer(
|
||||
transform_config: TransformConfig, ctx: PipelineContext
|
||||
) -> Transformer:
|
||||
transformer_type = transform_config.type
|
||||
transformer_class = transformer_registry.get(transformer_type)
|
||||
transformer_instance = None
|
||||
try:
|
||||
logger.debug(
|
||||
f"Attempting to instantiate new Transformer of type {transform_config.type}.."
|
||||
)
|
||||
transformer_config = (
|
||||
transform_config.config if transform_config.config is not None else {}
|
||||
)
|
||||
transformer_instance = transformer_class.create(transformer_config, ctx)
|
||||
except Exception as e:
|
||||
raise Exception(
|
||||
f"Caught exception while attempting to instantiate Transformer with type {transformer_type}"
|
||||
) from e
|
||||
|
||||
if transformer_instance is None:
|
||||
raise Exception(
|
||||
f"Failed to create transformer with type {transformer_type}. Transformer create method returned 'None'."
|
||||
)
|
||||
|
||||
return transformer_instance
|
||||
|
||||
|
||||
def create_action(action_config: ActionConfig, ctx: PipelineContext) -> Action:
|
||||
action_type = action_config.type
|
||||
action_instance = None
|
||||
try:
|
||||
logger.debug(
|
||||
f"Attempting to instantiate new Action of type {action_config.type}.."
|
||||
)
|
||||
action_class = action_registry.get(action_type)
|
||||
action_config_dict = (
|
||||
action_config.config if action_config.config is not None else {}
|
||||
)
|
||||
action_instance = action_class.create(action_config_dict, ctx)
|
||||
except Exception as e:
|
||||
raise Exception(
|
||||
f"Caught exception while attempting to instantiate Action with type {action_type}. "
|
||||
) from e
|
||||
|
||||
if action_instance is None:
|
||||
raise Exception(
|
||||
f"Failed to create action with type {action_type}. Action create method returned 'None'."
|
||||
)
|
||||
|
||||
return action_instance
|
||||
|
||||
|
||||
def normalize_directory_name(name: str) -> str:
|
||||
# Lower-case the name and replace any character that is not alphanumeric, an underscore, or a hyphen with an underscore.
|
||||
return re.sub(r"[^\w\-_]", "_", name.lower())
|
||||
|
||||
|
||||
def get_transformer_name(transformer: Transformer) -> str:
|
||||
# TODO: Would be better to compute this using the transformer registry itself.
|
||||
return type(transformer).__name__
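# Illustrative sketch (not part of the original change-set): how pipeline names are
# normalized before being used as directory names for the failed-events log.
assert normalize_directory_name("My Pipeline v1.0") == "my_pipeline_v1_0"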
|
datahub-actions/src/datahub_actions/plugin/__init__.py (new file, 13 lines)
@@ -0,0 +1,13 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
@@ -0,0 +1,13 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
@@ -0,0 +1,13 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
@@ -0,0 +1,219 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import importlib
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
from typing import Any, List, Optional, cast
|
||||
|
||||
from acryl.executor.dispatcher.default_dispatcher import DefaultDispatcher
|
||||
from acryl.executor.execution.reporting_executor import (
|
||||
ReportingExecutor,
|
||||
ReportingExecutorConfig,
|
||||
)
|
||||
from acryl.executor.execution.task import TaskConfig
|
||||
from acryl.executor.request.execution_request import ExecutionRequest
|
||||
from acryl.executor.request.signal_request import SignalRequest
|
||||
from acryl.executor.secret.datahub_secret_store import DataHubSecretStoreConfig
|
||||
from acryl.executor.secret.secret_store import SecretStoreConfig
|
||||
from pydantic import BaseModel
|
||||
|
||||
from datahub.metadata.schema_classes import MetadataChangeLogClass
|
||||
from datahub_actions.action.action import Action
|
||||
from datahub_actions.event.event_envelope import EventEnvelope
|
||||
from datahub_actions.event.event_registry import METADATA_CHANGE_LOG_EVENT_V1_TYPE
|
||||
from datahub_actions.pipeline.pipeline_context import PipelineContext
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DATAHUB_EXECUTION_REQUEST_ENTITY_NAME = "dataHubExecutionRequest"
|
||||
DATAHUB_EXECUTION_REQUEST_INPUT_ASPECT_NAME = "dataHubExecutionRequestInput"
|
||||
DATAHUB_EXECUTION_REQUEST_SIGNAL_ASPECT_NAME = "dataHubExecutionRequestSignal"
|
||||
APPLICATION_JSON_CONTENT_TYPE = "application/json"
|
||||
|
||||
|
||||
def _is_importable(path: str) -> bool:
|
||||
return "." in path or ":" in path
|
||||
|
||||
|
||||
def import_path(path: str) -> Any:
|
||||
"""
|
||||
Import an item from a package, where the path is formatted as 'package.module.submodule.ClassName'
|
||||
or 'package.module.submodule:ClassName.classmethod'. The dot-based format assumes that the bit
|
||||
after the last dot is the item to be fetched. In cases where the item to be imported is embedded
|
||||
within another type, the colon-based syntax can be used to disambiguate.
|
||||
"""
|
||||
assert _is_importable(path), "path must be in the appropriate format"
|
||||
|
||||
if ":" in path:
|
||||
module_name, object_name = path.rsplit(":", 1)
|
||||
else:
|
||||
module_name, object_name = path.rsplit(".", 1)
|
||||
|
||||
item = importlib.import_module(module_name)
|
||||
for attr in object_name.split("."):
|
||||
item = getattr(item, attr)
|
||||
return item
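# Illustrative sketch (not part of the original change-set): both supported path formats,
# demonstrated against the standard library.
import json
assert import_path("json.dumps") is json.dumps
assert import_path("json.decoder:JSONDecoder.decode") is json.decoder.JSONDecoder.decode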
|
||||
|
||||
|
||||
class ExecutorConfig(BaseModel):
|
||||
executor_id: Optional[str] = None
|
||||
task_configs: Optional[List[TaskConfig]] = None
|
||||
|
||||
|
||||
# Listens to new Execution Requests & dispatches them to the appropriate handler.
|
||||
class ExecutorAction(Action):
|
||||
@classmethod
|
||||
def create(cls, config_dict: dict, ctx: PipelineContext) -> "Action":
|
||||
config = ExecutorConfig.parse_obj(config_dict or {})
|
||||
return cls(config, ctx)
|
||||
|
||||
def __init__(self, config: ExecutorConfig, ctx: PipelineContext):
|
||||
self.ctx = ctx
|
||||
|
||||
executors = []
|
||||
|
||||
executor_config = self._build_executor_config(config, ctx)
|
||||
executors.append(ReportingExecutor(executor_config))
|
||||
|
||||
# Construct execution request dispatcher
|
||||
self.dispatcher = DefaultDispatcher(executors)
|
||||
|
||||
def act(self, event: EventEnvelope) -> None:
|
||||
"""This method listens for ExecutionRequest changes to execute in schedule and trigger events"""
|
||||
if event.event_type == METADATA_CHANGE_LOG_EVENT_V1_TYPE:  # Compare by value; 'is' only works when the strings happen to be interned.
|
||||
orig_event = cast(MetadataChangeLogClass, event.event)
|
||||
if (
|
||||
orig_event.get("entityType") == DATAHUB_EXECUTION_REQUEST_ENTITY_NAME
|
||||
and orig_event.get("changeType") == "UPSERT"
|
||||
):
|
||||
if (
|
||||
orig_event.get("aspectName")
|
||||
== DATAHUB_EXECUTION_REQUEST_INPUT_ASPECT_NAME
|
||||
):
|
||||
logger.debug("Received execution request input. Processing...")
|
||||
self._handle_execution_request_input(orig_event)
|
||||
elif (
|
||||
orig_event.get("aspectName")
|
||||
== DATAHUB_EXECUTION_REQUEST_SIGNAL_ASPECT_NAME
|
||||
):
|
||||
logger.debug("Received execution request signal. Processing...")
|
||||
self._handle_execution_request_signal(orig_event)
|
||||
|
||||
def _handle_execution_request_input(self, orig_event):
|
||||
entity_urn = orig_event.get("entityUrn")
|
||||
entity_key = orig_event.get("entityKeyAspect")
|
||||
|
||||
# Get the run id to use.
|
||||
exec_request_id = None
|
||||
if entity_key is not None:
|
||||
exec_request_key = json.loads(
|
||||
entity_key.get("value")
|
||||
) # this becomes the run id.
|
||||
exec_request_id = exec_request_key.get("id")
|
||||
elif entity_urn is not None:
|
||||
urn_parts = entity_urn.split(":")
|
||||
exec_request_id = urn_parts[len(urn_parts) - 1]
|
||||
|
||||
# Decode the aspect json into something more readable :)
|
||||
exec_request_input = json.loads(orig_event.get("aspect").get("value"))
|
||||
|
||||
# Build an Execution Request
|
||||
exec_request = ExecutionRequest(
|
||||
executor_id=exec_request_input.get("executorId"),
|
||||
exec_id=exec_request_id,
|
||||
name=exec_request_input.get("task"),
|
||||
args=exec_request_input.get("args"),
|
||||
)
|
||||
|
||||
# Try to dispatch the execution request
|
||||
try:
|
||||
self.dispatcher.dispatch(exec_request)
|
||||
except Exception:
|
||||
logger.error("ERROR", exc_info=sys.exc_info())
|
||||
|
||||
def _handle_execution_request_signal(self, orig_event):
|
||||
entity_urn = orig_event.get("entityUrn")
|
||||
|
||||
if (
|
||||
orig_event.get("aspect").get("contentType") == APPLICATION_JSON_CONTENT_TYPE
|
||||
and entity_urn is not None
|
||||
):
|
||||
# Decode the aspect json into something more readable :)
|
||||
signal_request_input = json.loads(orig_event.get("aspect").get("value"))
|
||||
|
||||
# Build a Signal Request
|
||||
urn_parts = entity_urn.split(":")
|
||||
exec_id = urn_parts[len(urn_parts) - 1]
|
||||
signal_request = SignalRequest(
|
||||
executor_id=signal_request_input.get("executorId"),
|
||||
exec_id=exec_id,
|
||||
signal=signal_request_input.get("signal"),
|
||||
)
|
||||
|
||||
# Try to dispatch the signal request
|
||||
try:
|
||||
self.dispatcher.dispatch_signal(signal_request)
|
||||
except Exception:
|
||||
logger.error("ERROR", exc_info=sys.exc_info())
|
||||
|
||||
def _build_executor_config(
|
||||
self, config: ExecutorConfig, ctx: PipelineContext
|
||||
) -> ReportingExecutorConfig:
|
||||
if config.task_configs:
|
||||
task_configs = config.task_configs
|
||||
else:
|
||||
# Build default task config
|
||||
task_configs = [
|
||||
TaskConfig(
|
||||
name="RUN_INGEST",
|
||||
type="acryl.executor.execution.sub_process_ingestion_task.SubProcessIngestionTask",
|
||||
configs=dict({}),
|
||||
),
|
||||
TaskConfig(
|
||||
name="TEST_CONNECTION",
|
||||
type="acryl.executor.execution.sub_process_test_connection_task.SubProcessTestConnectionTask",
|
||||
configs={},
|
||||
),
|
||||
]
|
||||
|
||||
if not ctx.graph:
|
||||
raise Exception(
|
||||
"Invalid configuration provided to action. DataHub Graph Client Required. Try including the 'datahub' block in your configuration."
|
||||
)
|
||||
|
||||
graph = ctx.graph.graph
|
||||
|
||||
# Build default executor config
|
||||
local_executor_config = ReportingExecutorConfig(
|
||||
id=config.executor_id or "default",
|
||||
task_configs=task_configs,
|
||||
secret_stores=[
|
||||
SecretStoreConfig(type="env", config=dict({})),
|
||||
SecretStoreConfig(
|
||||
type="datahub",
|
||||
# TODO: Once SecretStoreConfig is updated to accept arbitrary types
|
||||
# and not just dicts, we can just pass in the DataHubSecretStoreConfig
|
||||
# object directly.
|
||||
config=DataHubSecretStoreConfig(graph_client=graph).dict(),
|
||||
),
|
||||
],
|
||||
graph_client=graph,
|
||||
)
|
||||
|
||||
return local_executor_config
|
||||
|
||||
def close(self) -> None:
|
||||
# TODO: Handle closing action ingestion processing.
|
||||
pass
|
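# Hedged illustration of how _handle_execution_request_input maps an MCL payload to an
# ExecutionRequest. The URN and aspect value below are fabricated placeholders; only the
# field names ("executorId", "task", "args") and the id-extraction logic mirror the code above.
example_urn = "urn:li:dataHubExecutionRequest:abc-123"
example_input = {
    "executorId": "default",
    "task": "RUN_INGEST",
    "args": {"recipe": "<serialized ingestion recipe>", "version": "0.0.1"},
}
exec_request = ExecutionRequest(
    executor_id=example_input.get("executorId"),
    exec_id=example_urn.split(":")[-1],  # "abc-123" becomes the run id
    name=example_input.get("task"),
    args=example_input.get("args"),
)
# dispatcher.dispatch(exec_request) would then route it to the ReportingExecutor built in __init__.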
@ -0,0 +1,13 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
@ -0,0 +1,52 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import json
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from datahub_actions.action.action import Action
|
||||
from datahub_actions.event.event_envelope import EventEnvelope
|
||||
from datahub_actions.pipeline.pipeline_context import PipelineContext
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HelloWorldConfig(BaseModel):
|
||||
# Whether to print the message in upper case.
|
||||
to_upper: Optional[bool] = None
|
||||
|
||||
|
||||
# A basic example of a DataHub action that prints all
|
||||
# events received to the console.
|
||||
class HelloWorldAction(Action):
|
||||
@classmethod
|
||||
def create(cls, config_dict: dict, ctx: PipelineContext) -> "Action":
|
||||
action_config = HelloWorldConfig.parse_obj(config_dict or {})
|
||||
return cls(action_config, ctx)
|
||||
|
||||
def __init__(self, config: HelloWorldConfig, ctx: PipelineContext):
|
||||
self.config = config
|
||||
|
||||
def act(self, event: EventEnvelope) -> None:
|
||||
print("Hello world! Received event:")
|
||||
message = json.dumps(json.loads(event.as_json()), indent=4)
|
||||
if self.config.to_upper:
|
||||
print(message.upper())
|
||||
else:
|
||||
print(message)
|
||||
|
||||
def close(self) -> None:
|
||||
pass
|
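# Minimal usage sketch for HelloWorldAction. `ctx` is a PipelineContext normally supplied by
# the actions framework and `envelope` is an EventEnvelope from an event source; both are
# assumed to exist here.
action = HelloWorldAction.create({"to_upper": True}, ctx)
action.act(envelope)  # prints the event JSON, upper-cased because to_upper is set
action.close()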
@ -0,0 +1,54 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Any, Callable
|
||||
|
||||
from datahub.metadata.schema_classes import MetadataChangeLogClass
|
||||
from datahub_actions.event.event_envelope import EventEnvelope
|
||||
from datahub_actions.event.event_registry import METADATA_CHANGE_LOG_EVENT_V1_TYPE
|
||||
|
||||
|
||||
class MCLProcessor:
|
||||
"""
|
||||
A utility class to register and process MetadataChangeLog events.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.entity_aspect_processors: dict[str, dict[str, Callable]] = {}
|
||||
pass
|
||||
|
||||
def is_mcl(self, event: EventEnvelope) -> bool:
|
||||
return event.event_type is METADATA_CHANGE_LOG_EVENT_V1_TYPE
|
||||
|
||||
def register_processor(
|
||||
self, entity_type: str, aspect: str, processor: Callable
|
||||
) -> None:
|
||||
if entity_type not in self.entity_aspect_processors:
|
||||
self.entity_aspect_processors[entity_type] = {}
|
||||
self.entity_aspect_processors[entity_type][aspect] = processor
|
||||
|
||||
def process(self, event: EventEnvelope) -> Any:
|
||||
if isinstance(event.event, MetadataChangeLogClass):
|
||||
entity_type = event.event.entityType
|
||||
aspect = event.event.aspectName
|
||||
if (
|
||||
entity_type in self.entity_aspect_processors
|
||||
and aspect in self.entity_aspect_processors[entity_type]
|
||||
):
|
||||
return self.entity_aspect_processors[entity_type][aspect](
|
||||
entity_urn=event.event.entityUrn,
|
||||
aspect_name=event.event.aspectName,
|
||||
aspect_value=event.event.aspect,
|
||||
previous_aspect_value=event.event.previousAspectValue,
|
||||
)
|
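# Hedged usage sketch for MCLProcessor. The handler signature mirrors the keyword arguments
# that process() passes above; the entity type / aspect pair is an illustrative choice.
def on_status_change(entity_urn, aspect_name, aspect_value, previous_aspect_value):
    print(f"{entity_urn}: {aspect_name} changed")

mcl_processor = MCLProcessor()
mcl_processor.register_processor("dataset", "status", on_status_change)

# Inside an Action.act() implementation, where `event` is the incoming EventEnvelope:
if mcl_processor.is_mcl(event):
    mcl_processor.process(event)  # invokes on_status_change for dataset/status MCLs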
@ -0,0 +1,169 @@
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from typing import Dict, List, Optional, Set, Union, cast
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from datahub.emitter.rest_emitter import DatahubRestEmitter
|
||||
from datahub.metadata.schema_classes import (
|
||||
ChangeTypeClass,
|
||||
MetadataChangeLogClass,
|
||||
MetadataChangeProposalClass,
|
||||
)
|
||||
from datahub_actions.action.action import Action
|
||||
from datahub_actions.event.event_envelope import EventEnvelope
|
||||
from datahub_actions.event.event_registry import METADATA_CHANGE_LOG_EVENT_V1_TYPE
|
||||
from datahub_actions.pipeline.pipeline_context import PipelineContext
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MetadataChangeEmitterConfig(BaseModel):
|
||||
gms_server: Optional[str] = None
|
||||
gms_auth_token: Optional[str] = None
|
||||
aspects_to_exclude: Optional[List] = None
|
||||
aspects_to_include: Optional[List] = None
|
||||
entity_type_to_exclude: List[str] = Field(default_factory=list)
|
||||
extra_headers: Optional[Dict[str, str]] = None
|
||||
urn_regex: Optional[str] = None
|
||||
|
||||
|
||||
class MetadataChangeSyncAction(Action):
|
||||
rest_emitter: DatahubRestEmitter
|
||||
aspects_exclude_set: Set
|
||||
# By default, we exclude the following aspects: each DataHub instance has its own encryption
# keys for tokens and secrets, so these values cannot be decrypted if they are synced to
# another instance. Execution request aspects are excluded as well, because the ingestion
# recipe may reference DataHub secrets that the target instance cannot decrypt.
|
||||
DEFAULT_ASPECTS_EXCLUDE_SET = {
|
||||
"dataHubAccessTokenInfo",
|
||||
"dataHubAccessTokenKey",
|
||||
"dataHubSecretKey",
|
||||
"dataHubSecretValue",
|
||||
"dataHubExecutionRequestInput",
|
||||
"dataHubExecutionRequestKey",
|
||||
"dataHubExecutionRequestResult",
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def create(cls, config_dict: dict, ctx: PipelineContext) -> "Action":
|
||||
action_config = MetadataChangeEmitterConfig.parse_obj(config_dict or {})
|
||||
return cls(action_config, ctx)
|
||||
|
||||
def __init__(self, config: MetadataChangeEmitterConfig, ctx: PipelineContext):
|
||||
self.config = config
|
||||
assert isinstance(self.config.gms_server, str)
|
||||
self.rest_emitter = DatahubRestEmitter(
|
||||
gms_server=self.config.gms_server,
|
||||
token=self.config.gms_auth_token,
|
||||
extra_headers=self.config.extra_headers,
|
||||
)
|
||||
self.aspects_exclude_set = (
|
||||
self.DEFAULT_ASPECTS_EXCLUDE_SET.union(set(self.config.aspects_to_exclude))
|
||||
if self.config.aspects_to_exclude
|
||||
else self.DEFAULT_ASPECTS_EXCLUDE_SET
|
||||
)
|
||||
self.aspects_include_set = self.config.aspects_to_include
|
||||
|
||||
extra_headers_keys = (
|
||||
list(self.config.extra_headers.keys())
|
||||
if self.config.extra_headers
|
||||
else None
|
||||
)
|
||||
logger.info(
|
||||
f"MetadataChangeSyncAction configured to emit mcp to gms server {self.config.gms_server} with extra headers {extra_headers_keys} and aspects to exclude {self.aspects_exclude_set} and aspects to include {self.aspects_include_set}"
|
||||
)
|
||||
self.urn_regex = self.config.urn_regex
|
||||
|
||||
def act(self, event: EventEnvelope) -> None:
|
||||
"""
|
||||
This method listens for MetadataChangeLog events, converts each one to a
MetadataChangeProposal, and emits it to another DataHub instance.
|
||||
"""
|
||||
# MetadataChangeProposal only supports UPSERT type for now
|
||||
if event.event_type is METADATA_CHANGE_LOG_EVENT_V1_TYPE:
|
||||
orig_event = cast(MetadataChangeLogClass, event.event)
|
||||
logger.debug(f"received orig_event {orig_event}")
|
||||
regexUrn = self.urn_regex
|
||||
if regexUrn is None:
|
||||
urn_match = re.match(".*", "default match")
|
||||
elif orig_event.entityUrn is not None:
|
||||
urn_match = re.match(regexUrn, orig_event.entityUrn)
|
||||
else:
|
||||
logger.warning(f"event missing entityUrn: {orig_event}")
|
||||
urn_match = None
|
||||
aspect_name = orig_event.get("aspectName")
|
||||
logger.info(f"urn_match {urn_match} for entityUrn {orig_event.entityUrn}")
|
||||
if (
|
||||
(
|
||||
(
|
||||
self.aspects_include_set is not None
|
||||
and aspect_name in self.aspects_include_set
|
||||
)
|
||||
or (
|
||||
self.aspects_include_set is None
|
||||
and aspect_name not in self.aspects_exclude_set
|
||||
)
|
||||
)
|
||||
and (
|
||||
orig_event.get("entityType")
|
||||
not in self.config.entity_type_to_exclude
|
||||
if self.config.entity_type_to_exclude
|
||||
else True
|
||||
)
|
||||
and urn_match is not None
|
||||
):
|
||||
mcp = self.buildMcp(orig_event)
|
||||
|
||||
if mcp is not None:
|
||||
logger.debug(f"built mcp {mcp}")
|
||||
self.emit(mcp)
|
||||
else:
|
||||
logger.debug(
|
||||
f"skip emitting mcp for aspect {orig_event.get('aspectName')} or entityUrn {orig_event.entityUrn} or entity type {orig_event.get('entityType')} on exclude list"
|
||||
)
|
||||
|
||||
def buildMcp(
|
||||
self, orig_event: MetadataChangeLogClass
|
||||
) -> Union[MetadataChangeProposalClass, None]:
|
||||
try:
|
||||
changeType = orig_event.get("changeType")
|
||||
if changeType == ChangeTypeClass.RESTATE or changeType == "RESTATE":
|
||||
changeType = ChangeTypeClass.UPSERT
|
||||
mcp = MetadataChangeProposalClass(
|
||||
entityType=orig_event.get("entityType"),
|
||||
changeType=changeType,
|
||||
entityUrn=orig_event.get("entityUrn"),
|
||||
entityKeyAspect=orig_event.get("entityKeyAspect"),
|
||||
aspectName=orig_event.get("aspectName"),
|
||||
aspect=orig_event.get("aspect"),
|
||||
)
|
||||
return mcp
|
||||
except Exception as ex:
|
||||
logger.error(
|
||||
f"error when building mcp from mcl {json.dumps(orig_event.to_obj(), indent=4)}"
|
||||
)
|
||||
logger.error(f"exception: {ex}")
|
||||
return None
|
||||
|
||||
def emit(self, mcp: MetadataChangeProposalClass) -> None:
|
||||
# Create an emitter to DataHub over REST
|
||||
try:
|
||||
# test_connection is deferred from __init__ to this point to simplify unit testing.
# An empty rest_emitter.server_config means test_connection() has not been called yet.
|
||||
if not self.rest_emitter.server_config:
|
||||
self.rest_emitter.test_connection()
|
||||
logger.info(
|
||||
f"emitting the mcp: entityType {mcp.entityType}, changeType {mcp.changeType}, urn {mcp.entityUrn}, aspect name {mcp.aspectName}"
|
||||
)
|
||||
self.rest_emitter.emit_mcp(mcp)
|
||||
logger.info("successfully emit the mcp")
|
||||
except Exception as ex:
|
||||
logger.error(
|
||||
f"error when emitting mcp, {json.dumps(mcp.to_obj(), indent=4)}"
|
||||
)
|
||||
logger.error(f"exception: {ex}")
|
||||
|
||||
def close(self) -> None:
|
||||
pass
|
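# Hedged configuration sketch for MetadataChangeSyncAction. The server URL, token and filter
# values are placeholders; the field names come from MetadataChangeEmitterConfig above.
sync_config = MetadataChangeEmitterConfig.parse_obj(
    {
        "gms_server": "https://target-datahub.example.com/api/gms",  # placeholder
        "gms_auth_token": "<personal access token>",                 # placeholder
        "aspects_to_exclude": ["datasetProfile"],  # merged with DEFAULT_ASPECTS_EXCLUDE_SET
        "entity_type_to_exclude": ["dataProcessInstance"],
        "urn_regex": r"^urn:li:dataset:\(urn:li:dataPlatform:snowflake.*",
    }
)
# MetadataChangeSyncAction(sync_config, ctx) then re-emits matching MCLs as MCPs to the target.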
@ -0,0 +1,13 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
@ -0,0 +1,13 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
@ -0,0 +1,847 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from typing import Iterable, List, Optional, Tuple
|
||||
|
||||
from pydantic import Field
|
||||
|
||||
from datahub.configuration.common import ConfigEnum
|
||||
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
||||
from datahub.metadata.schema_classes import (
|
||||
AuditStampClass,
|
||||
DocumentationAssociationClass,
|
||||
DocumentationClass,
|
||||
EditableSchemaMetadataClass,
|
||||
EntityChangeEventClass as EntityChangeEvent,
|
||||
GenericAspectClass,
|
||||
MetadataAttributionClass,
|
||||
MetadataChangeLogClass,
|
||||
)
|
||||
from datahub.metadata.urns import DatasetUrn
|
||||
from datahub.utilities.urns.urn import Urn, guess_entity_type
|
||||
from datahub_actions.action.action import Action
|
||||
from datahub_actions.api.action_graph import AcrylDataHubGraph
|
||||
from datahub_actions.event.event_envelope import EventEnvelope
|
||||
from datahub_actions.pipeline.pipeline_context import PipelineContext
|
||||
from datahub_actions.plugin.action.mcl_utils import MCLProcessor
|
||||
from datahub_actions.plugin.action.propagation.propagation_utils import (
|
||||
DirectionType,
|
||||
PropagationConfig,
|
||||
PropagationDirective,
|
||||
RelationshipType,
|
||||
SourceDetails,
|
||||
get_unique_siblings,
|
||||
)
|
||||
from datahub_actions.plugin.action.stats_util import (
|
||||
ActionStageReport,
|
||||
EventProcessingStats,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DocPropagationDirective(PropagationDirective):
|
||||
doc_string: Optional[str] = Field(
|
||||
default=None, description="Documentation string to be propagated."
|
||||
)
|
||||
|
||||
|
||||
class ColumnPropagationRelationships(ConfigEnum):
|
||||
UPSTREAM = "upstream"
|
||||
DOWNSTREAM = "downstream"
|
||||
SIBLING = "sibling"
|
||||
|
||||
|
||||
class DocPropagationConfig(PropagationConfig):
|
||||
"""
|
||||
Configuration model for documentation propagation.
|
||||
|
||||
Attributes:
|
||||
enabled (bool): Indicates whether documentation propagation is enabled or not. Default is True.
|
||||
columns_enabled (bool): Indicates whether column documentation propagation is enabled or not. Default is True.
|
||||
datasets_enabled (bool): Indicates whether dataset level documentation propagation is enabled or not. Default is False.
|
||||
|
||||
Example:
|
||||
config = DocPropagationConfig(enabled=True)
|
||||
"""
|
||||
|
||||
enabled: bool = Field(
|
||||
True,
|
||||
description="Indicates whether documentation propagation is enabled or not.",
|
||||
)
|
||||
columns_enabled: bool = Field(
|
||||
True,
|
||||
description="Indicates whether column documentation propagation is enabled or not.",
|
||||
)
|
||||
# TODO: Currently this flag does nothing. Datasets are NOT supported for docs propagation.
|
||||
datasets_enabled: bool = Field(
|
||||
False,
|
||||
description="Indicates whether dataset level documentation propagation is enabled or not.",
|
||||
)
|
||||
column_propagation_relationships: List[ColumnPropagationRelationships] = Field(
|
||||
[
|
||||
ColumnPropagationRelationships.SIBLING,
|
||||
ColumnPropagationRelationships.DOWNSTREAM,
|
||||
ColumnPropagationRelationships.UPSTREAM,
|
||||
],
|
||||
description="Relationships for column documentation propagation.",
|
||||
)
|
||||
|
||||
|
||||
def get_field_path(schema_field_urn: str) -> str:
|
||||
urn = Urn.from_string(schema_field_urn)
|
||||
return urn.get_entity_id()[1]
|
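# Hedged illustration of get_field_path(): a schemaField URN embeds the parent dataset URN and
# the field path, and get_entity_id() returns them in that order, so index [1] is the field path.
example_field_urn = (
    "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD),user_id)"
)
assert get_field_path(example_field_urn) == "user_id"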
||||
|
||||
|
||||
def get_field_doc_from_dataset(
|
||||
graph: AcrylDataHubGraph, dataset_urn: str, schema_field_urn: str
|
||||
) -> Optional[str]:
|
||||
editableSchemaMetadata = graph.graph.get_aspect(
|
||||
dataset_urn, EditableSchemaMetadataClass
|
||||
)
|
||||
if editableSchemaMetadata is not None:
|
||||
if editableSchemaMetadata.editableSchemaFieldInfo is not None:
|
||||
field_info = [
|
||||
x
|
||||
for x in editableSchemaMetadata.editableSchemaFieldInfo
|
||||
if x.fieldPath == get_field_path(schema_field_urn)
|
||||
]
|
||||
if field_info:
|
||||
return field_info[0].description
|
||||
return None
|
||||
|
||||
|
||||
ECE_EVENT_TYPE = "EntityChangeEvent_v1"
|
||||
|
||||
|
||||
class DocPropagationAction(Action):
|
||||
def __init__(self, config: DocPropagationConfig, ctx: PipelineContext):
|
||||
super().__init__()
|
||||
self.action_urn: str
|
||||
if not ctx.pipeline_name.startswith("urn:li:dataHubAction"):
|
||||
self.action_urn = f"urn:li:dataHubAction:{ctx.pipeline_name}"
|
||||
else:
|
||||
self.action_urn = ctx.pipeline_name
|
||||
|
||||
self.config: DocPropagationConfig = config
|
||||
self.last_config_refresh: float = 0
|
||||
self.ctx = ctx
|
||||
self.mcl_processor = MCLProcessor()
|
||||
self.actor_urn = "urn:li:corpuser:__datahub_system"
|
||||
|
||||
self.mcl_processor.register_processor(
|
||||
"schemaField",
|
||||
"documentation",
|
||||
self.process_schema_field_documentation,
|
||||
)
|
||||
self.refresh_config()
|
||||
self._stats = ActionStageReport()
|
||||
self._stats.start()
|
||||
assert self.ctx.graph
|
||||
self._rate_limited_emit_mcp = self.config.get_rate_limited_emit_mcp(
|
||||
self.ctx.graph.graph
|
||||
)
|
||||
|
||||
def name(self) -> str:
|
||||
return "DocPropagator"
|
||||
|
||||
@classmethod
|
||||
def create(cls, config_dict: dict, ctx: PipelineContext) -> "Action":
|
||||
action_config = DocPropagationConfig.parse_obj(config_dict or {})
|
||||
logger.info(f"Doc Propagation Config action configured with {action_config}")
|
||||
return cls(action_config, ctx)
|
||||
|
||||
def should_stop_propagation(
|
||||
self, source_details: SourceDetails
|
||||
) -> Tuple[bool, str]:
|
||||
"""
|
||||
Check if the propagation should be stopped based on the source details.
|
||||
Return result and reason.
|
||||
"""
|
||||
if source_details.propagation_started_at and (
|
||||
int(time.time() * 1000.0) - source_details.propagation_started_at
|
||||
>= self.config.max_propagation_time_millis
|
||||
):
|
||||
return (True, "Propagation time exceeded.")
|
||||
if (
|
||||
source_details.propagation_depth
|
||||
and source_details.propagation_depth >= self.config.max_propagation_depth
|
||||
):
|
||||
return (True, "Propagation depth exceeded.")
|
||||
return False, ""
|
||||
|
||||
def get_propagation_relationships(
|
||||
self, entity_type: str, source_details: Optional[SourceDetails]
|
||||
) -> List[Tuple[RelationshipType, DirectionType]]:
|
||||
possible_relationships = []
|
||||
if entity_type == "schemaField":
|
||||
if (source_details is not None) and (
|
||||
source_details.propagation_relationship
|
||||
and source_details.propagation_direction
|
||||
):
|
||||
restricted_relationship = source_details.propagation_relationship
|
||||
restricted_direction = source_details.propagation_direction
|
||||
else:
|
||||
restricted_relationship = None
|
||||
restricted_direction = None
|
||||
|
||||
for relationship in self.config.column_propagation_relationships:
|
||||
if relationship == ColumnPropagationRelationships.UPSTREAM:
|
||||
if (
|
||||
restricted_relationship == RelationshipType.LINEAGE
|
||||
and restricted_direction == DirectionType.DOWN
|
||||
): # Skip upstream if the propagation has been restricted to downstream
|
||||
continue
|
||||
possible_relationships.append(
|
||||
(RelationshipType.LINEAGE, DirectionType.UP)
|
||||
)
|
||||
elif relationship == ColumnPropagationRelationships.DOWNSTREAM:
|
||||
if (
|
||||
restricted_relationship == RelationshipType.LINEAGE
|
||||
and restricted_direction == DirectionType.UP
|
||||
): # Skip downstream if the propagation has been restricted to upstream
|
||||
continue
|
||||
possible_relationships.append(
|
||||
(RelationshipType.LINEAGE, DirectionType.DOWN)
|
||||
)
|
||||
elif relationship == ColumnPropagationRelationships.SIBLING:
|
||||
possible_relationships.append(
|
||||
(RelationshipType.SIBLING, DirectionType.ALL)
|
||||
)
|
||||
logger.debug(f"Possible relationships: {possible_relationships}")
|
||||
return possible_relationships
|
||||
|
||||
def process_schema_field_documentation(
|
||||
self,
|
||||
entity_urn: str,
|
||||
aspect_name: str,
|
||||
aspect_value: GenericAspectClass,
|
||||
previous_aspect_value: Optional[GenericAspectClass],
|
||||
) -> Optional[DocPropagationDirective]:
|
||||
"""
|
||||
Process changes in the documentation aspect of schemaField entities.
|
||||
Produce a directive to propagate the documentation.
|
||||
Business Logic checks:
|
||||
- If the documentation is sourced by this action, then we propagate
|
||||
it.
|
||||
- If the documentation is not sourced by this action, then we log a
|
||||
warning and propagate it.
|
||||
- If we have exceeded the maximum depth of propagation or maximum
|
||||
time for propagation, then we stop propagation and don't return a directive.
|
||||
"""
|
||||
if (
|
||||
aspect_name != "documentation"
|
||||
or guess_entity_type(entity_urn) != "schemaField"
|
||||
):
|
||||
# not a documentation aspect or not a schemaField entity
|
||||
return None
|
||||
|
||||
logger.debug("Processing 'documentation' MCL")
|
||||
if self.config.columns_enabled:
|
||||
current_docs = DocumentationClass.from_obj(json.loads(aspect_value.value))
|
||||
old_docs = (
|
||||
None
|
||||
if previous_aspect_value is None
|
||||
else DocumentationClass.from_obj(
|
||||
json.loads(previous_aspect_value.value)
|
||||
)
|
||||
)
|
||||
if current_docs.documentations:
|
||||
# get the most recently updated documentation with attribution
|
||||
current_documentation_instance = sorted(
|
||||
[doc for doc in current_docs.documentations if doc.attribution],
|
||||
key=lambda x: x.attribution.time if x.attribution else 0,
|
||||
)[-1]
|
||||
assert current_documentation_instance.attribution
|
||||
if (
|
||||
current_documentation_instance.attribution.source is None
|
||||
or current_documentation_instance.attribution.source
|
||||
!= self.action_urn
|
||||
):
|
||||
logger.warning(
|
||||
f"Documentation is not sourced by this action which is unexpected. Will be propagating for {entity_urn}"
|
||||
)
|
||||
source_details = (
|
||||
(current_documentation_instance.attribution.sourceDetail)
|
||||
if current_documentation_instance.attribution
|
||||
else {}
|
||||
)
|
||||
source_details_parsed: SourceDetails = SourceDetails.parse_obj(
|
||||
source_details
|
||||
)
|
||||
should_stop_propagation, reason = self.should_stop_propagation(
|
||||
source_details_parsed
|
||||
)
|
||||
if should_stop_propagation:
|
||||
logger.warning(f"Stopping propagation for {entity_urn}. {reason}")
|
||||
return None
|
||||
else:
|
||||
logger.debug(f"Propagating documentation for {entity_urn}")
|
||||
propagation_relationships = self.get_propagation_relationships(
|
||||
entity_type="schemaField", source_details=source_details_parsed
|
||||
)
|
||||
origin_entity = (
|
||||
source_details_parsed.origin
|
||||
if source_details_parsed.origin
|
||||
else entity_urn
|
||||
)
|
||||
if old_docs is None or not old_docs.documentations:
|
||||
return DocPropagationDirective(
|
||||
propagate=True,
|
||||
doc_string=current_documentation_instance.documentation,
|
||||
operation="ADD",
|
||||
entity=entity_urn,
|
||||
origin=origin_entity,
|
||||
via=entity_urn,
|
||||
actor=self.actor_urn,
|
||||
propagation_started_at=source_details_parsed.propagation_started_at,
|
||||
propagation_depth=(
|
||||
source_details_parsed.propagation_depth + 1
|
||||
if source_details_parsed.propagation_depth
|
||||
else 1
|
||||
),
|
||||
relationships=propagation_relationships,
|
||||
)
|
||||
else:
|
||||
old_docs_instance = sorted(
|
||||
old_docs.documentations,
|
||||
key=lambda x: x.attribution.time if x.attribution else 0,
|
||||
)[-1]
|
||||
if (
|
||||
current_documentation_instance.documentation
|
||||
!= old_docs_instance.documentation
|
||||
):
|
||||
return DocPropagationDirective(
|
||||
propagate=True,
|
||||
doc_string=current_documentation_instance.documentation,
|
||||
operation="MODIFY",
|
||||
entity=entity_urn,
|
||||
origin=origin_entity,
|
||||
via=entity_urn,
|
||||
actor=self.actor_urn,
|
||||
propagation_started_at=source_details_parsed.propagation_started_at,
|
||||
propagation_depth=(
|
||||
source_details_parsed.propagation_depth + 1
|
||||
if source_details_parsed.propagation_depth
|
||||
else 1
|
||||
),
|
||||
relationships=propagation_relationships,
|
||||
)
|
||||
return None
|
||||
|
||||
def should_propagate(
|
||||
self, event: EventEnvelope
|
||||
) -> Optional[DocPropagationDirective]:
|
||||
if self.mcl_processor.is_mcl(event):
|
||||
return self.mcl_processor.process(event)
|
||||
if event.event_type == "EntityChangeEvent_v1":
|
||||
assert isinstance(event.event, EntityChangeEvent)
|
||||
assert self.ctx.graph is not None
|
||||
semantic_event = event.event
|
||||
if (
|
||||
semantic_event.category == "DOCUMENTATION"
|
||||
and self.config is not None
|
||||
and self.config.enabled
|
||||
):
|
||||
logger.debug("Processing EntityChangeEvent Documentation Change")
|
||||
if self.config.columns_enabled and (
|
||||
semantic_event.entityType == "schemaField"
|
||||
):
|
||||
if semantic_event.parameters:
|
||||
parameters = semantic_event.parameters
|
||||
else:
|
||||
parameters = semantic_event._inner_dict.get(
|
||||
"__parameters_json", {}
|
||||
)
|
||||
doc_string = parameters.get("description")
|
||||
origin = parameters.get("origin")
|
||||
origin = origin or semantic_event.entityUrn
|
||||
via = (
|
||||
semantic_event.entityUrn
|
||||
if origin != semantic_event.entityUrn
|
||||
else None
|
||||
)
|
||||
logger.debug(f"Origin: {origin}")
|
||||
logger.debug(f"Via: {via}")
|
||||
logger.debug(f"Doc string: {doc_string}")
|
||||
logger.debug(f"Semantic event {semantic_event}")
|
||||
if doc_string:
|
||||
return DocPropagationDirective(
|
||||
propagate=True,
|
||||
doc_string=doc_string,
|
||||
operation=semantic_event.operation,
|
||||
entity=semantic_event.entityUrn,
|
||||
origin=origin,
|
||||
via=via, # if origin is set, then via is the entity itself
|
||||
actor=(
|
||||
semantic_event.auditStamp.actor
|
||||
if semantic_event.auditStamp
|
||||
else self.actor_urn
|
||||
),
|
||||
propagation_started_at=int(time.time() * 1000.0),
|
||||
propagation_depth=1, # we start at 1 because this is the first propagation
|
||||
relationships=self.get_propagation_relationships(
|
||||
entity_type="schemaField",
|
||||
source_details=None,
|
||||
),
|
||||
)
|
||||
return None
|
||||
|
||||
def modify_docs_on_columns(
|
||||
self,
|
||||
graph: AcrylDataHubGraph,
|
||||
operation: str,
|
||||
schema_field_urn: str,
|
||||
dataset_urn: str,
|
||||
field_doc: Optional[str],
|
||||
context: SourceDetails,
|
||||
) -> Optional[MetadataChangeProposalWrapper]:
|
||||
if context.origin == schema_field_urn:
|
||||
# No need to propagate to self
|
||||
return None
|
||||
|
||||
try:
|
||||
DatasetUrn.from_string(dataset_urn)
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Invalid dataset urn {dataset_urn}. {e}. Skipping documentation propagation."
|
||||
)
|
||||
return None
|
||||
|
||||
auditStamp = AuditStampClass(
|
||||
time=int(time.time() * 1000.0), actor=self.actor_urn
|
||||
)
|
||||
|
||||
source_details = context.for_metadata_attribution()
|
||||
attribution: MetadataAttributionClass = MetadataAttributionClass(
|
||||
source=self.action_urn,
|
||||
time=auditStamp.time,
|
||||
actor=self.actor_urn,
|
||||
sourceDetail=source_details,
|
||||
)
|
||||
documentations = graph.graph.get_aspect(schema_field_urn, DocumentationClass)
|
||||
if documentations:
|
||||
mutation_needed = False
|
||||
action_sourced = False
|
||||
# we check if there are any existing documentations generated by
|
||||
# this action and sourced from the same origin, if so, we update them
|
||||
# otherwise, we add a new documentation entry sourced by this action
|
||||
for doc_association in documentations.documentations[:]:
|
||||
if doc_association.attribution and doc_association.attribution.source:
|
||||
source_details_parsed: SourceDetails = SourceDetails.parse_obj(
|
||||
doc_association.attribution.sourceDetail
|
||||
)
|
||||
if doc_association.attribution.source == self.action_urn and (
|
||||
source_details_parsed.origin == context.origin
|
||||
):
|
||||
action_sourced = True
|
||||
if doc_association.documentation != field_doc:
|
||||
mutation_needed = True
|
||||
if operation == "ADD" or operation == "MODIFY":
|
||||
doc_association.documentation = field_doc or ""
|
||||
doc_association.attribution = attribution
|
||||
elif operation == "REMOVE":
|
||||
documentations.documentations.remove(doc_association)
|
||||
if not action_sourced:
|
||||
documentations.documentations.append(
|
||||
DocumentationAssociationClass(
|
||||
documentation=field_doc or "",
|
||||
attribution=attribution,
|
||||
)
|
||||
)
|
||||
mutation_needed = True
|
||||
else:
|
||||
# no docs found, create a new one
|
||||
# we don't check editableSchemaMetadata because our goal is to
|
||||
# propagate documentation to downstream entities
|
||||
# UI will handle resolving priorities and conflicts
|
||||
documentations = DocumentationClass(
|
||||
documentations=[
|
||||
DocumentationAssociationClass(
|
||||
documentation=field_doc or "",
|
||||
attribution=attribution,
|
||||
)
|
||||
]
|
||||
)
|
||||
mutation_needed = True
|
||||
|
||||
if mutation_needed:
|
||||
logger.debug(
|
||||
f"Will emit documentation change proposal for {schema_field_urn} with {field_doc}"
|
||||
)
|
||||
return MetadataChangeProposalWrapper(
|
||||
entityUrn=schema_field_urn,
|
||||
aspect=documentations,
|
||||
)
|
||||
return None
|
||||
|
||||
def refresh_config(self, event: Optional[EventEnvelope] = None) -> None:
|
||||
"""
|
||||
Fetches important configuration flags from the global settings entity to
|
||||
override client-side settings.
|
||||
If not found, it will use the client-side values.
|
||||
"""
|
||||
now = time.time()
|
||||
try:
|
||||
if now - self.last_config_refresh > 60 or self._is_settings_change(event):
|
||||
assert self.ctx.graph
|
||||
entity_dict = self.ctx.graph.graph.get_entity_raw(
|
||||
"urn:li:globalSettings:0", ["globalSettingsInfo"]
|
||||
)
|
||||
if entity_dict:
|
||||
global_settings = entity_dict.get("aspects", {}).get(
|
||||
"globalSettingsInfo"
|
||||
)
|
||||
if global_settings:
|
||||
doc_propagation_config = global_settings.get("value", {}).get(
|
||||
"docPropagation"
|
||||
)
|
||||
if doc_propagation_config:
|
||||
if doc_propagation_config.get("enabled") is not None:
|
||||
logger.info(
|
||||
"Overwriting the asset-level config using globalSettings"
|
||||
)
|
||||
self.config.enabled = doc_propagation_config.get(
|
||||
"enabled"
|
||||
)
|
||||
if (
|
||||
doc_propagation_config.get("columnPropagationEnabled")
|
||||
is not None
|
||||
):
|
||||
logger.info(
|
||||
"Overwriting the column-level config using globalSettings"
|
||||
)
|
||||
self.config.columns_enabled = (
|
||||
doc_propagation_config.get(
|
||||
"columnPropagationEnabled"
|
||||
)
|
||||
)
|
||||
except Exception:
|
||||
# We don't want to fail the pipeline if we can't fetch the config
|
||||
logger.warning(
|
||||
"Error fetching global settings for doc propagation. Will try again in 1 minute.",
|
||||
exc_info=True,
|
||||
)
|
||||
self.last_config_refresh = now
|
||||
|
||||
def _is_settings_change(self, event: Optional[EventEnvelope]) -> bool:
|
||||
if event and isinstance(event.event, MetadataChangeLogClass):
|
||||
entity_type = event.event.entityType
|
||||
if entity_type == "globalSettings":
|
||||
return True
|
||||
return False
|
||||
|
||||
def _only_one_upstream_field(
|
||||
self,
|
||||
graph: AcrylDataHubGraph,
|
||||
downstream_field: str,
|
||||
upstream_field: str,
|
||||
) -> bool:
|
||||
"""
|
||||
Check whether the downstream field has exactly one upstream schema field, and whether
that single upstream field is the given upstream_field.
|
||||
|
||||
TODO: We should cache upstreams because we make this fetch upstreams call FOR EVERY downstream that must be propagated to.
|
||||
"""
|
||||
upstreams = graph.get_upstreams(entity_urn=downstream_field)
|
||||
# Use a set here in case there are duplicated upstream edges
|
||||
upstream_fields = list(
|
||||
{x for x in upstreams if guess_entity_type(x) == "schemaField"}
|
||||
)
|
||||
|
||||
# If we found no upstreams for the downstream field, simply skip.
|
||||
if not upstream_fields:
|
||||
logger.debug(
|
||||
f"No upstream fields found. Skipping propagation to downstream {downstream_field}"
|
||||
)
|
||||
return False
|
||||
|
||||
# There must be exactly one upstream field, and it must be the expected upstream field
|
||||
result = len(upstream_fields) == 1 and upstream_fields[0] == upstream_field
|
||||
if not result:
|
||||
logger.warning(
|
||||
f"Failed check for single upstream: Found upstream fields {upstream_fields} for downstream {downstream_field}. Expecting only one upstream field: {upstream_field}"
|
||||
)
|
||||
return result
|
||||
|
||||
def act(self, event: EventEnvelope) -> None:
|
||||
assert self.ctx.graph
|
||||
for mcp in self.act_async(event):
|
||||
self._rate_limited_emit_mcp(mcp)
|
||||
|
||||
def act_async(
|
||||
self, event: EventEnvelope
|
||||
) -> Iterable[MetadataChangeProposalWrapper]:
|
||||
"""
|
||||
Process the event asynchronously and return the change proposals
|
||||
"""
|
||||
self.refresh_config(event)
|
||||
if not self.config.enabled or not self.config.columns_enabled:
|
||||
logger.warning("Doc propagation is disabled. Skipping event")
|
||||
return
|
||||
else:
|
||||
logger.debug(f"Processing event {event}")
|
||||
|
||||
if not self._stats.event_processing_stats:
|
||||
self._stats.event_processing_stats = EventProcessingStats()
|
||||
|
||||
stats = self._stats.event_processing_stats
|
||||
stats.start(event)
|
||||
|
||||
try:
|
||||
doc_propagation_directive = self.should_propagate(event)
|
||||
# breakpoint()
|
||||
logger.debug(
|
||||
f"Doc propagation directive for {event}: {doc_propagation_directive}"
|
||||
)
|
||||
|
||||
if (
|
||||
doc_propagation_directive is not None
|
||||
and doc_propagation_directive.propagate
|
||||
):
|
||||
self._stats.increment_assets_processed(doc_propagation_directive.entity)
|
||||
context = SourceDetails(
|
||||
origin=doc_propagation_directive.origin,
|
||||
via=doc_propagation_directive.via,
|
||||
propagated=True,
|
||||
actor=doc_propagation_directive.actor,
|
||||
propagation_started_at=doc_propagation_directive.propagation_started_at,
|
||||
propagation_depth=doc_propagation_directive.propagation_depth,
|
||||
)
|
||||
assert self.ctx.graph
|
||||
logger.debug(f"Doc Propagation Directive: {doc_propagation_directive}")
|
||||
# TODO: Put each mechanism behind a config flag to be controlled
|
||||
# externally.
|
||||
lineage_downstream = (
|
||||
RelationshipType.LINEAGE,
|
||||
DirectionType.DOWN,
|
||||
) in doc_propagation_directive.relationships
|
||||
lineage_upstream = (
|
||||
RelationshipType.LINEAGE,
|
||||
DirectionType.UP,
|
||||
) in doc_propagation_directive.relationships
|
||||
lineage_any = (
|
||||
RelationshipType.LINEAGE,
|
||||
DirectionType.ALL,
|
||||
) in doc_propagation_directive.relationships
|
||||
logger.debug(
|
||||
f"Lineage Downstream: {lineage_downstream}, Lineage Upstream: {lineage_upstream}, Lineage Any: {lineage_any}"
|
||||
)
|
||||
if lineage_downstream or lineage_any:
|
||||
# Step 1: Propagate to downstream entities
|
||||
yield from self._propagate_to_downstreams(
|
||||
doc_propagation_directive, context
|
||||
)
|
||||
|
||||
if lineage_upstream or lineage_any:
|
||||
# Step 2: Propagate to upstream entities
|
||||
yield from self._propagate_to_upstreams(
|
||||
doc_propagation_directive, context
|
||||
)
|
||||
if (
|
||||
RelationshipType.SIBLING,
|
||||
DirectionType.ALL,
|
||||
) in doc_propagation_directive.relationships:
|
||||
# Step 3: Propagate to sibling entities
|
||||
yield from self._propagate_to_siblings(
|
||||
doc_propagation_directive, context
|
||||
)
|
||||
stats.end(event, success=True)
|
||||
|
||||
except Exception:
|
||||
logger.error(f"Error processing event {event}:", exc_info=True)
|
||||
stats.end(event, success=False)
|
||||
|
||||
def _propagate_to_downstreams(
|
||||
self, doc_propagation_directive: DocPropagationDirective, context: SourceDetails
|
||||
) -> Iterable[MetadataChangeProposalWrapper]:
|
||||
"""
|
||||
Propagate the documentation to downstream entities.
|
||||
"""
|
||||
assert self.ctx.graph
|
||||
downstreams = self.ctx.graph.get_downstreams(
|
||||
entity_urn=doc_propagation_directive.entity
|
||||
)
|
||||
logger.debug(
|
||||
f"Downstreams: {downstreams} for {doc_propagation_directive.entity}"
|
||||
)
|
||||
entity_urn = doc_propagation_directive.entity
|
||||
propagated_context = SourceDetails.parse_obj(context.dict())
|
||||
propagated_context.propagation_relationship = RelationshipType.LINEAGE
|
||||
propagated_context.propagation_direction = DirectionType.DOWN
|
||||
propagated_entities_this_hop_count = 0
|
||||
# breakpoint()
|
||||
if guess_entity_type(entity_urn) == "schemaField":
|
||||
downstream_fields = {
|
||||
x for x in downstreams if guess_entity_type(x) == "schemaField"
|
||||
}
|
||||
for field in downstream_fields:
|
||||
schema_field_urn = Urn.from_string(field)
|
||||
parent_urn = schema_field_urn.get_entity_id()[0]
|
||||
field_path = schema_field_urn.get_entity_id()[1]
|
||||
|
||||
logger.debug(
|
||||
f"Will {doc_propagation_directive.operation} documentation {doc_propagation_directive.doc_string} for {field_path} on {parent_urn}"
|
||||
)
|
||||
|
||||
parent_entity_type = guess_entity_type(parent_urn)
|
||||
|
||||
if parent_entity_type == "dataset":
|
||||
if self._only_one_upstream_field(
|
||||
self.ctx.graph,
|
||||
downstream_field=str(schema_field_urn),
|
||||
upstream_field=entity_urn,
|
||||
):
|
||||
if (
|
||||
propagated_entities_this_hop_count
|
||||
>= self.config.max_propagation_fanout
|
||||
):
|
||||
# breakpoint()
|
||||
logger.warning(
|
||||
f"Exceeded max propagation fanout of {self.config.max_propagation_fanout}. Skipping propagation to downstream {field}"
|
||||
)
|
||||
# No need to propagate to more downstreams
|
||||
return
|
||||
|
||||
maybe_mcp = self.modify_docs_on_columns(
|
||||
self.ctx.graph,
|
||||
doc_propagation_directive.operation,
|
||||
field,
|
||||
parent_urn,
|
||||
field_doc=doc_propagation_directive.doc_string,
|
||||
context=propagated_context,
|
||||
)
|
||||
if maybe_mcp:
|
||||
propagated_entities_this_hop_count += 1
|
||||
yield maybe_mcp
|
||||
|
||||
elif parent_entity_type == "chart":
|
||||
logger.warning(
|
||||
"Charts are expected to have fields that are dataset schema fields. Skipping for now..."
|
||||
)
|
||||
|
||||
self._stats.increment_assets_impacted(field)
|
||||
|
||||
elif guess_entity_type(entity_urn) == "dataset":
|
||||
logger.debug(
|
||||
"Dataset level documentation propagation is not yet supported!"
|
||||
)
|
||||
|
||||
def _propagate_to_upstreams(
|
||||
self, doc_propagation_directive: DocPropagationDirective, context: SourceDetails
|
||||
) -> Iterable[MetadataChangeProposalWrapper]:
|
||||
"""
|
||||
Propagate the documentation to upstream entities.
|
||||
"""
|
||||
assert self.ctx.graph
|
||||
upstreams = self.ctx.graph.get_upstreams(
|
||||
entity_urn=doc_propagation_directive.entity
|
||||
)
|
||||
logger.debug(f"Upstreams: {upstreams} for {doc_propagation_directive.entity}")
|
||||
entity_urn = doc_propagation_directive.entity
|
||||
propagated_context = SourceDetails.parse_obj(context.dict())
|
||||
propagated_context.propagation_relationship = RelationshipType.LINEAGE
|
||||
propagated_context.propagation_direction = DirectionType.UP
|
||||
propagated_entities_this_hop_count = 0
|
||||
|
||||
if guess_entity_type(entity_urn) == "schemaField":
|
||||
upstream_fields = {
|
||||
x for x in upstreams if guess_entity_type(x) == "schemaField"
|
||||
}
|
||||
# We only propagate to the upstream field if there is only one
|
||||
# upstream field
|
||||
if len(upstream_fields) == 1:
|
||||
for field in upstream_fields:
|
||||
schema_field_urn = Urn.from_string(field)
|
||||
parent_urn = schema_field_urn.get_entity_id()[0]
|
||||
field_path = schema_field_urn.get_entity_id()[1]
|
||||
|
||||
logger.debug(
|
||||
f"Will {doc_propagation_directive.operation} documentation {doc_propagation_directive.doc_string} for {field_path} on {parent_urn}"
|
||||
)
|
||||
|
||||
parent_entity_type = guess_entity_type(parent_urn)
|
||||
|
||||
if parent_entity_type == "dataset":
|
||||
if (
|
||||
propagated_entities_this_hop_count
|
||||
>= self.config.max_propagation_fanout
|
||||
):
|
||||
logger.warning(
|
||||
f"Exceeded max propagation fanout of {self.config.max_propagation_fanout}. Skipping propagation to upstream {field}"
|
||||
)
|
||||
# No need to propagate to more upstreams
|
||||
return
|
||||
maybe_mcp = self.modify_docs_on_columns(
|
||||
self.ctx.graph,
|
||||
doc_propagation_directive.operation,
|
||||
field,
|
||||
parent_urn,
|
||||
field_doc=doc_propagation_directive.doc_string,
|
||||
context=propagated_context,
|
||||
)
|
||||
if maybe_mcp:
|
||||
propagated_entities_this_hop_count += 1
|
||||
yield maybe_mcp
|
||||
|
||||
elif parent_entity_type == "chart":
|
||||
logger.warning(
|
||||
"Charts are expected to have fields that are dataset schema fields. Skipping for now..."
|
||||
)
|
||||
|
||||
self._stats.increment_assets_impacted(field)
|
||||
|
||||
elif guess_entity_type(entity_urn) == "dataset":
|
||||
logger.debug(
|
||||
"Dataset level documentation propagation is not yet supported!"
|
||||
)
|
||||
|
||||
def _propagate_to_siblings(
|
||||
self, doc_propagation_directive: DocPropagationDirective, context: SourceDetails
|
||||
) -> Iterable[MetadataChangeProposalWrapper]:
|
||||
"""
|
||||
Propagate the documentation to sibling entities.
|
||||
"""
|
||||
assert self.ctx.graph
|
||||
entity_urn = doc_propagation_directive.entity
|
||||
siblings = get_unique_siblings(self.ctx.graph, entity_urn)
|
||||
propagated_context = SourceDetails.parse_obj(context.dict())
|
||||
propagated_context.propagation_relationship = RelationshipType.SIBLING
|
||||
propagated_context.propagation_direction = DirectionType.ALL
|
||||
|
||||
logger.debug(f"Siblings: {siblings} for {doc_propagation_directive.entity}")
|
||||
|
||||
for sibling in siblings:
|
||||
if (
|
||||
guess_entity_type(entity_urn) == "schemaField"
|
||||
and guess_entity_type(sibling) == "schemaField"
|
||||
):
|
||||
parent_urn = Urn.from_string(sibling).get_entity_id()[0]
|
||||
self._stats.increment_assets_impacted(sibling)
|
||||
maybe_mcp = self.modify_docs_on_columns(
|
||||
self.ctx.graph,
|
||||
doc_propagation_directive.operation,
|
||||
schema_field_urn=sibling,
|
||||
dataset_urn=parent_urn,
|
||||
field_doc=doc_propagation_directive.doc_string,
|
||||
context=propagated_context,
|
||||
)
|
||||
if maybe_mcp:
|
||||
yield maybe_mcp
|
||||
|
||||
def close(self) -> None:
|
||||
return
|
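# Hedged configuration sketch for DocPropagationAction. Field names come from
# DocPropagationConfig and the inherited PropagationConfig; the specific values are illustrative.
doc_propagation_config = {
    "enabled": True,
    "columns_enabled": True,
    "column_propagation_relationships": ["upstream", "downstream", "sibling"],
    "max_propagation_depth": 3,     # inherited PropagationConfig limit
    "max_propagation_fanout": 500,  # cap on propagation targets per hop
}
doc_action = DocPropagationAction.create(doc_propagation_config, ctx)  # ctx built by the actions pipeline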
@ -0,0 +1,289 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import time
|
||||
from abc import abstractmethod
|
||||
from enum import Enum
|
||||
from functools import wraps
|
||||
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
||||
|
||||
from pydantic import validator
|
||||
from pydantic.fields import Field
|
||||
from pydantic.main import BaseModel
|
||||
from ratelimit import limits, sleep_and_retry
|
||||
|
||||
import datahub.metadata.schema_classes as models
|
||||
from datahub.configuration.common import ConfigModel
|
||||
from datahub.emitter.mce_builder import make_schema_field_urn
|
||||
from datahub.ingestion.graph.client import DataHubGraph
|
||||
from datahub.ingestion.graph.filters import SearchFilterRule
|
||||
from datahub.metadata.schema_classes import MetadataAttributionClass
|
||||
from datahub.utilities.str_enum import StrEnum
|
||||
from datahub.utilities.urns.urn import Urn, guess_entity_type
|
||||
from datahub_actions.api.action_graph import AcrylDataHubGraph
|
||||
|
||||
SYSTEM_ACTOR = "urn:li:corpuser:__datahub_system"
|
||||
|
||||
|
||||
class RelationshipType(StrEnum):
|
||||
LINEAGE = "lineage" # signifies all types of lineage
|
||||
HIERARCHY = "hierarchy" # signifies all types of hierarchy
|
||||
SIBLING = "sibling" # signifies all types of sibling
|
||||
|
||||
|
||||
class DirectionType(StrEnum):
|
||||
UP = "up" # signifies upstream or parent (depending on relationship type)
|
||||
DOWN = "down" # signifies downstream or child (depending on relationship type)
|
||||
ALL = "all" # signifies all directions
|
||||
|
||||
|
||||
class PropagationDirective(BaseModel):
|
||||
propagate: bool
|
||||
operation: str
|
||||
relationships: List[Tuple[RelationshipType, DirectionType]]
|
||||
entity: str = Field(
|
||||
description="Entity that currently triggered the propagation directive",
|
||||
)
|
||||
origin: str = Field(
|
||||
description="Origin entity for the association. This is the entity that triggered the propagation.",
|
||||
)
|
||||
via: Optional[str] = Field(
|
||||
None,
|
||||
description="Via entity for the association. This is the direct entity that the propagation came through.",
|
||||
)
|
||||
actor: Optional[str] = Field(
|
||||
None,
|
||||
description="Actor that triggered the propagation through the original association.",
|
||||
)
|
||||
propagation_started_at: Optional[int] = Field(
|
||||
None,
|
||||
description="Timestamp (in millis) when the original propagation event happened.",
|
||||
)
|
||||
propagation_depth: Optional[int] = Field(
|
||||
default=0,
|
||||
description="Depth of propagation. This is used to track the depth of the propagation.",
|
||||
)
|
||||
|
||||
|
||||
class SourceDetails(BaseModel):
|
||||
origin: Optional[str] = Field(
|
||||
None,
|
||||
description="Origin entity for the documentation. This is the entity that triggered the documentation propagation.",
|
||||
)
|
||||
via: Optional[str] = Field(
|
||||
None,
|
||||
description="Via entity for the documentation. This is the direct entity that the documentation was propagated through.",
|
||||
)
|
||||
propagated: Optional[str] = Field(
|
||||
None,
|
||||
description="Indicates whether the metadata element was propagated.",
|
||||
)
|
||||
actor: Optional[str] = Field(
|
||||
None,
|
||||
description="Actor that triggered the metadata propagation.",
|
||||
)
|
||||
propagation_started_at: Optional[int] = Field(
|
||||
None,
|
||||
description="Timestamp when the metadata propagation event happened.",
|
||||
)
|
||||
propagation_depth: Optional[int] = Field(
|
||||
default=0,
|
||||
description="Depth of metadata propagation.",
|
||||
)
|
||||
propagation_relationship: Optional[RelationshipType] = Field(
|
||||
None,
|
||||
description="The relationship that the metadata was propagated through.",
|
||||
)
|
||||
propagation_direction: Optional[DirectionType] = Field(
|
||||
None,
|
||||
description="The direction that the metadata was propagated through.",
|
||||
)
|
||||
|
||||
@validator("propagated", pre=True)
|
||||
def convert_boolean_to_lowercase_string(cls, v: Any) -> Optional[str]:
|
||||
if isinstance(v, bool):
|
||||
return str(v).lower()
|
||||
return v
|
||||
|
||||
@validator("propagation_depth", "propagation_started_at", pre=True)
|
||||
def convert_to_int(cls, v: Any) -> Optional[int]:
|
||||
if v is not None:
|
||||
return int(v)
|
||||
return v
|
||||
|
||||
def for_metadata_attribution(self) -> Dict[str, str]:
|
||||
"""
|
||||
Convert the SourceDetails object to a dictionary that can be used in
|
||||
Metadata Attribution MCPs.
|
||||
"""
|
||||
result = {}
|
||||
for k, v in self.dict(exclude_none=True).items():
|
||||
if isinstance(v, Enum):
|
||||
result[k] = v.value # Use the enum's value
|
||||
elif isinstance(v, int):
|
||||
result[k] = str(v) # Convert int to string
|
||||
else:
|
||||
result[k] = str(v) # Convert everything else to string
|
||||
return result
|
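# Hedged example of for_metadata_attribution(): enum and integer fields are flattened to
# strings so the result fits MetadataAttributionClass.sourceDetail (a string-to-string map).
details = SourceDetails(
    origin="urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD),user_id)",
    propagated=True,  # stored as the string "true" by the validator above
    propagation_depth=2,
    propagation_relationship=RelationshipType.LINEAGE,
    propagation_direction=DirectionType.DOWN,
)
details.for_metadata_attribution()
# -> {"origin": "...", "propagated": "true", "propagation_depth": "2",
#     "propagation_relationship": "lineage", "propagation_direction": "down"}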
||||
|
||||
|
||||
class PropagationConfig(ConfigModel):
|
||||
"""
|
||||
Base class for all propagation configs
|
||||
"""
|
||||
|
||||
max_propagation_depth: int = 5
|
||||
max_propagation_fanout: int = 1000
|
||||
max_propagation_time_millis: int = 1000 * 60 * 60 * 1 # 1 hour
|
||||
rate_limit_propagated_writes: int = 15000 # 15000 writes per 15 seconds (default)
|
||||
rate_limit_propagated_writes_period: int = 15 # Every 15 seconds
|
||||
|
||||
def get_rate_limited_emit_mcp(self, emitter: DataHubGraph) -> Any:
|
||||
"""
|
||||
Returns a rate limited emitter that can be used to emit metadata for propagation
|
||||
"""
|
||||
|
||||
@sleep_and_retry
|
||||
@limits(
|
||||
calls=self.rate_limit_propagated_writes,
|
||||
period=self.rate_limit_propagated_writes_period,
|
||||
)
|
||||
@wraps(emitter.emit_mcp)
|
||||
def wrapper(*args, **kwargs):
|
||||
return emitter.emit_mcp(*args, **kwargs)
|
||||
|
||||
return wrapper
|
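# Hedged usage sketch: wrap DataHubGraph.emit_mcp with the configured rate limit. When the
# per-period budget (rate_limit_propagated_writes calls every rate_limit_propagated_writes_period
# seconds) is exhausted, the wrapper sleeps and retries instead of failing.
propagation_config = PropagationConfig(
    rate_limit_propagated_writes=100,
    rate_limit_propagated_writes_period=1,
)
rate_limited_emit = propagation_config.get_rate_limited_emit_mcp(graph)  # graph: DataHubGraph (assumed)
for mcp in pending_proposals:  # iterable of MetadataChangeProposalWrapper (assumed)
    rate_limited_emit(mcp)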
||||
|
||||
|
||||
def get_attribution_and_context_from_directive(
|
||||
action_urn: str,
|
||||
propagation_directive: PropagationDirective,
|
||||
actor: str = SYSTEM_ACTOR,
|
||||
time: int = int(time.time() * 1000.0),
|
||||
) -> Tuple[MetadataAttributionClass, str]:
|
||||
"""
|
||||
Given a propagation directive, return the attribution and context for
|
||||
the directive.
|
||||
Attribution is the official way to track the source of metadata in
|
||||
DataHub.
|
||||
Context is the older way to track the source of metadata in DataHub.
|
||||
We populate both to ensure compatibility with older versions of DataHub.
|
||||
"""
|
||||
source_detail: dict[str, str] = {
|
||||
"origin": propagation_directive.origin,
|
||||
"propagated": "true",
|
||||
"propagation_depth": str(propagation_directive.propagation_depth),
|
||||
"propagation_started_at": str(
|
||||
propagation_directive.propagation_started_at
|
||||
if propagation_directive.propagation_started_at
|
||||
else time
|
||||
),
|
||||
}
|
||||
if propagation_directive.relationships:
|
||||
source_detail["propagation_relationship"] = propagation_directive.relationships[
|
||||
0
|
||||
][0].value
|
||||
source_detail["propagation_direction"] = propagation_directive.relationships[0][
|
||||
1
|
||||
].value
|
||||
if propagation_directive.actor:
|
||||
source_detail["actor"] = propagation_directive.actor
|
||||
else:
|
||||
source_detail["actor"] = actor
|
||||
if propagation_directive.via:
|
||||
source_detail["via"] = propagation_directive.via
|
||||
context_dict: dict[str, str] = {}
|
||||
context_dict.update(source_detail)
|
||||
return (
|
||||
MetadataAttributionClass(
|
||||
time=time,
|
||||
actor=actor,
|
||||
source=action_urn,
|
||||
sourceDetail=source_detail,
|
||||
),
|
||||
json.dumps(context_dict),
|
||||
)
|
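# Hedged usage sketch for get_attribution_and_context_from_directive(): a propagator turns a
# directive into the attribution aspect plus the legacy JSON context string. `directive` is a
# PropagationDirective produced elsewhere (e.g. by a ComposablePropagator implementation).
attribution, context_json = get_attribution_and_context_from_directive(
    action_urn="urn:li:dataHubAction:doc_propagation",  # illustrative action URN
    propagation_directive=directive,
)
# `attribution` is attached to the propagated aspect; `context_json` carries the same details
# for older DataHub versions, as described in the docstring above.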
||||
|
||||
|
||||
class SelectedAsset(BaseModel):
|
||||
"""
|
||||
A selected asset is a data structure that represents an asset that has been
|
||||
selected for processing by a propagator.
|
||||
"""
|
||||
|
||||
urn: str # URN of the asset that has been selected
|
||||
target_entity_type: str # entity type that is being targeted by the propagator. e.g. schemaField even if asset is of type dataset
|
||||
|
||||
|
||||
class ComposablePropagator:
|
||||
@abstractmethod
|
||||
def asset_filters(self) -> Dict[str, Dict[str, List[SearchFilterRule]]]:
|
||||
"""
|
||||
Returns a dictionary of asset filters that are used to filter the assets
|
||||
based on the configuration of the action.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def process_one_asset(
|
||||
self, asset: SelectedAsset, operation: str
|
||||
) -> Iterable[PropagationDirective]:
|
||||
"""
|
||||
Given an asset, returns a list of propagation directives
|
||||
|
||||
:param asset_urn: URN of the asset
|
||||
:param target_entity_type: The entity type of the target entity (Note:
|
||||
this can be different from the entity type of the asset. e.g. we
|
||||
might process a dataset while the target entity_type is a column
|
||||
(schemaField))
|
||||
:param operation: The operation that triggered the propagation (ADD /
|
||||
REMOVE)
|
||||
:return: A list of PropagationDirective objects
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
def get_unique_siblings(graph: AcrylDataHubGraph, entity_urn: str) -> list[str]:
|
||||
"""
|
||||
Get unique siblings for the entity urn
|
||||
"""
|
||||
|
||||
if guess_entity_type(entity_urn) == "schemaField":
|
||||
parent_urn = Urn.from_string(entity_urn).get_entity_id()[0]
|
||||
entity_field_path = Urn.from_string(entity_urn).get_entity_id()[1]
|
||||
# Does my parent have siblings?
|
||||
siblings: Optional[models.SiblingsClass] = graph.graph.get_aspect(
|
||||
parent_urn,
|
||||
models.SiblingsClass,
|
||||
)
|
||||
if siblings and siblings.siblings:
|
||||
other_siblings = [x for x in siblings.siblings if x != parent_urn]
|
||||
if len(other_siblings) == 1:
|
||||
target_sibling = other_siblings[0]
|
||||
# now we need to find the schema field in this sibling that
|
||||
# matches us
|
||||
if guess_entity_type(target_sibling) == "dataset":
|
||||
schema_fields = graph.graph.get_aspect(
|
||||
target_sibling, models.SchemaMetadataClass
|
||||
)
|
||||
if schema_fields:
|
||||
for schema_field in schema_fields.fields:
|
||||
if schema_field.fieldPath == entity_field_path:
|
||||
# we found the sibling field
|
||||
schema_field_urn = make_schema_field_urn(
|
||||
target_sibling, schema_field.fieldPath
|
||||
)
|
||||
return [schema_field_urn]
|
||||
return []
|
@@ -0,0 +1,13 @@
# Copyright 2021 Acryl Data, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
datahub-actions/src/datahub_actions/plugin/action/slack/slack.py (new file, 146 lines)
@@ -0,0 +1,146 @@
# Copyright 2021 Acryl Data, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging
from dataclasses import dataclass
from typing import Dict, List

from pydantic import SecretStr
from ratelimit import limits, sleep_and_retry
from requests import sessions
from slack_bolt import App

from datahub.configuration.common import ConfigModel
from datahub.metadata.schema_classes import EntityChangeEventClass as EntityChangeEvent
from datahub_actions.action.action import Action
from datahub_actions.event.event_envelope import EventEnvelope
from datahub_actions.pipeline.pipeline_context import PipelineContext
from datahub_actions.utils.datahub_util import DATAHUB_SYSTEM_ACTOR_URN
from datahub_actions.utils.social_util import (
    StructuredMessage,
    get_message_from_entity_change_event,
    get_welcome_message,
    pretty_any_text,
)

logger = logging.getLogger(__name__)


@sleep_and_retry
@limits(calls=1, period=1)
def post_message(client, token, channel, text):
    client.chat_postMessage(
        token=token,
        channel=channel,
        text=text,
    )


@dataclass
class SlackNotification:
    @staticmethod
    def get_payload(message: StructuredMessage) -> List[Dict]:
        return [
            {
                "type": "section",
                "text": {"type": "mrkdwn", "text": message.title},
            },
            {"type": "divider"},
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": "\n".join(
                        [
                            f"*{k}*: {pretty_any_text(v, channel='slack')}"
                            for k, v in message.properties.items()
                        ]
                    ),
                },
            },
            {"type": "divider"},
        ]


class SlackNotificationConfig(ConfigModel):
    # default webhook posts to #actions-dev-slack-notifications on Acryl Data Slack space
    bot_token: SecretStr
    signing_secret: SecretStr
    default_channel: str
    base_url: str = "http://localhost:9002/"
    suppress_system_activity: bool = True


class SlackNotificationAction(Action):
    def name(self):
        return "SlackNotificationAction"

    def close(self) -> None:
        pass

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "Action":
        action_config = SlackNotificationConfig.parse_obj(config_dict or {})
        logger.info(f"Slack notification action configured with {action_config}")
        return cls(action_config, ctx)

    def __init__(self, action_config: SlackNotificationConfig, ctx: PipelineContext):
        self.action_config = action_config
        self.ctx = ctx
        self.session = sessions.Session()

        # Initializes your app with your bot token and signing secret
        self.app = App(
            token=self.action_config.bot_token.get_secret_value(),
            signing_secret=self.action_config.signing_secret.get_secret_value(),
        )

        self.app.client.chat_postMessage(
            token=self.action_config.bot_token.get_secret_value(),
            channel=self.action_config.default_channel,
            blocks=SlackNotification.get_payload(
                get_welcome_message(self.action_config.base_url)
            ),
        )

    def act(self, event: EventEnvelope) -> None:
        try:
            message = json.dumps(json.loads(event.as_json()), indent=4)
            logger.debug(f"Received event: {message}")
            if event.event_type == "EntityChangeEvent_v1":
                assert isinstance(event.event, EntityChangeEvent)
                if (
                    event.event.auditStamp.actor == DATAHUB_SYSTEM_ACTOR_URN
                    and self.action_config.suppress_system_activity
                ):
                    return None

                semantic_message = get_message_from_entity_change_event(
                    event.event,
                    self.action_config.base_url,
                    self.ctx.graph.graph if self.ctx.graph else None,
                    channel="slack",
                )
                if semantic_message:
                    post_message(
                        client=self.app.client,
                        token=self.action_config.bot_token.get_secret_value(),
                        channel=self.action_config.default_channel,
                        text=semantic_message,
                    )
            else:
                logger.debug("Skipping message because it didn't match our filter")
        except Exception as e:
            logger.debug(f"Failed to process event: {e}")
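For orientation (not part of this diff), the action above can be instantiated from a plain config dict the same way the Actions framework does when it loads a pipeline. The values below are placeholders, and `ctx` stands for the `PipelineContext` the framework supplies; note that `__init__` immediately posts a welcome message to `default_channel`, so this only works against a live Slack workspace.

```python
# Hypothetical values for illustration only.
config_dict = {
    "bot_token": "xoxb-placeholder",
    "signing_secret": "placeholder-secret",
    "default_channel": "#datahub-notifications",
    "base_url": "http://localhost:9002/",
    "suppress_system_activity": True,
}
slack_action = SlackNotificationAction.create(config_dict, ctx)  # ctx: PipelineContext
```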
@@ -0,0 +1,13 @@
# Copyright 2021 Acryl Data, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@@ -0,0 +1,130 @@
# Copyright 2021 Acryl Data, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging

from sqlalchemy import create_engine

from datahub.emitter.mce_builder import dataset_urn_to_key
from datahub.ingestion.api.closeable import Closeable
from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeConfig
from datahub.metadata.schema_classes import GlossaryNodeInfoClass, GlossaryTermInfoClass
from datahub.utilities.urns.urn import Urn
from datahub_actions.api.action_graph import AcrylDataHubGraph

logger: logging.Logger = logging.getLogger(__name__)


class SnowflakeTagHelper(Closeable):
    def __init__(self, config: SnowflakeConfig):
        self.config: SnowflakeConfig = config
        url = self.config.get_sql_alchemy_url()
        self.engine = create_engine(url, **self.config.get_options())

    @staticmethod
    def get_term_name_from_id(term_urn: str, graph: AcrylDataHubGraph) -> str:
        term_id = Urn.from_string(term_urn).get_entity_id_as_string()
        if term_id.count("-") == 4:
            # needs resolution
            term_info = graph.graph.get_aspect(term_urn, GlossaryTermInfoClass)
            assert term_info
            assert term_info.name
            term_name = term_info.name
            parent = term_info.parentNode
            while parent:
                parent_id = Urn.from_string(parent).get_entity_id_as_string()
                node_info = graph.graph.get_aspect(parent, GlossaryNodeInfoClass)
                assert node_info
                if parent_id.count("-") == 4:
                    parent_name = node_info.name
                    parent = node_info.parentNode
                else:
                    # terminate
                    parent_name = parent_id
                    parent = None
                term_name = f"{parent_name}.{term_name}"
        else:
            term_name = term_id

        return term_name

    @staticmethod
    def get_label_urn_to_tag(label_urn: str, graph: AcrylDataHubGraph) -> str:
        label_urn_parsed = Urn.from_string(label_urn)
        if label_urn_parsed.get_type() == "tag":
            return label_urn_parsed.get_entity_id_as_string()
        elif label_urn_parsed.get_type() == "glossaryTerm":
            # if this looks like a guid, we want to resolve to human friendly names
            term_name = SnowflakeTagHelper.get_term_name_from_id(label_urn, graph)
            if term_name is not None:
                # terms use `.` for separation, replace with _
                return term_name.replace(".", "_").replace(" ", "_")
            else:
                raise ValueError(f"Invalid tag or term urn {label_urn}")
        else:
            raise Exception(
                f"Unexpected label type: neither tag or term {label_urn_parsed.get_type()}"
            )

    def apply_tag_or_term(
        self, dataset_urn: str, tag_or_term_urn: str, graph: AcrylDataHubGraph
    ) -> None:
        dataset_key = dataset_urn_to_key(dataset_urn)
        assert dataset_key is not None
        if dataset_key.platform != "snowflake":
            return
        tag = self.get_label_urn_to_tag(tag_or_term_urn, graph)
        assert tag is not None
        name_tokens = dataset_key.name.split(".")
        assert len(name_tokens) == 3
        self.run_query(
            name_tokens[0],
            name_tokens[1],
            f"CREATE TAG IF NOT EXISTS {tag} COMMENT = 'Replicated Tag {tag_or_term_urn} from DataHub';",
        )
        self.run_query(
            name_tokens[0],
            name_tokens[1],
            f'ALTER TABLE {name_tokens[2]} SET TAG {tag}="{tag_or_term_urn}";',
        )

    def remove_tag_or_term(
        self, dataset_urn: str, tag_urn: str, graph: AcrylDataHubGraph
    ) -> None:
        dataset_key = dataset_urn_to_key(dataset_urn)
        assert dataset_key is not None
        if dataset_key.platform != "snowflake":
            return
        tag = self.get_label_urn_to_tag(tag_urn, graph)
        assert tag is not None
        name_tokens = dataset_key.name.split(".")
        assert len(name_tokens) == 3
        self.run_query(
            name_tokens[0],
            name_tokens[1],
            f"ALTER TABLE {name_tokens[2]} UNSET TAG {tag};",
        )

    def run_query(self, database: str, schema: str, query: str) -> None:
        try:
            self.engine.execute(f"USE {database}.{schema};")
            self.engine.execute(query)
            logger.info(f"Successfully executed query {query}")
        except Exception as e:
            logger.warning(
                f"Failed to execute snowflake query: {query}. Exception: {e}"
            )

    def close(self) -> None:
        return
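To make the urn-to-tag-name mapping above concrete (illustrative only; the term urn below is hypothetical and assumed not to be a GUID, and `graph` stands for any `AcrylDataHubGraph` instance):

```python
# A tag urn maps to its entity id.
SnowflakeTagHelper.get_label_urn_to_tag("urn:li:tag:classification", graph)
# -> "classification"

# A non-GUID glossary term urn keeps its name, with "." and " " replaced by "_".
SnowflakeTagHelper.get_label_urn_to_tag(
    "urn:li:glossaryTerm:Classification.Confidential", graph
)
# -> "Classification_Confidential"
```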
@@ -0,0 +1,121 @@
# Copyright 2021 Acryl Data, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from typing import Optional

from datahub.configuration.common import ConfigModel
from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config
from datahub_actions.action.action import Action
from datahub_actions.event.event_envelope import EventEnvelope
from datahub_actions.event.event_registry import EntityChangeEvent
from datahub_actions.pipeline.pipeline_context import PipelineContext
from datahub_actions.plugin.action.snowflake.snowflake_util import SnowflakeTagHelper
from datahub_actions.plugin.action.tag.tag_propagation_action import (
    TagPropagationAction,
    TagPropagationConfig,
)
from datahub_actions.plugin.action.term.term_propagation_action import (
    TermPropagationAction,
    TermPropagationConfig,
)

logger = logging.getLogger(__name__)


class SnowflakeTagPropagatorConfig(ConfigModel):
    snowflake: SnowflakeV2Config
    tag_propagation: Optional[TagPropagationConfig] = None
    term_propagation: Optional[TermPropagationConfig] = None


class SnowflakeTagPropagatorAction(Action):
    def __init__(self, config: SnowflakeTagPropagatorConfig, ctx: PipelineContext):
        self.config: SnowflakeTagPropagatorConfig = config
        self.ctx = ctx
        self.snowflake_tag_helper = SnowflakeTagHelper(self.config.snowflake)
        logger.info("[Config] Snowflake tag sync enabled")
        # Default both propagators to None so the checks in act() are safe even
        # when only one of tag/term propagation is configured.
        self.tag_propagator: Optional[TagPropagationAction] = None
        self.term_propagator: Optional[TermPropagationAction] = None
        if self.config.tag_propagation:
            logger.info("[Config] Will propagate DataHub Tags")
            if self.config.tag_propagation.tag_prefixes:
                logger.info(
                    f"[Config] Tag prefixes: {self.config.tag_propagation.tag_prefixes}"
                )
            self.tag_propagator = TagPropagationAction(self.config.tag_propagation, ctx)
        if self.config.term_propagation:
            logger.info("[Config] Will propagate Glossary Terms")
            self.term_propagator = TermPropagationAction(
                self.config.term_propagation, ctx
            )

    def close(self) -> None:
        self.snowflake_tag_helper.close()
        return

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "Action":
        config = SnowflakeTagPropagatorConfig.parse_obj(config_dict or {})
        return cls(config, ctx)

    @staticmethod
    def is_snowflake_urn(urn: str) -> bool:
        return urn.startswith("urn:li:dataset:(urn:li:dataPlatform:snowflake")

    def name(self) -> str:
        return "SnowflakeTagPropagator"

    def act(self, event: EventEnvelope) -> None:
        if event.event_type == "EntityChangeEvent_v1":
            assert isinstance(event.event, EntityChangeEvent)
            assert self.ctx.graph is not None
            semantic_event = event.event
            if not self.is_snowflake_urn(semantic_event.entityUrn):
                return
            entity_to_apply = None
            tag_to_apply = None
            if self.tag_propagator is not None:
                tag_propagation_directive = self.tag_propagator.should_propagate(
                    event=event
                )
                if (
                    tag_propagation_directive is not None
                    and tag_propagation_directive.propagate
                ):
                    entity_to_apply = tag_propagation_directive.entity
                    tag_to_apply = tag_propagation_directive.tag

            if self.term_propagator is not None:
                term_propagation_directive = self.term_propagator.should_propagate(
                    event=event
                )
                if (
                    term_propagation_directive is not None
                    and term_propagation_directive.propagate
                ):
                    entity_to_apply = term_propagation_directive.entity
                    tag_to_apply = term_propagation_directive.term

            if entity_to_apply is not None:
                assert tag_to_apply
                logger.info(
                    f"Will {semantic_event.operation.lower()} {tag_to_apply} on Snowflake {entity_to_apply}"
                )
                if semantic_event.operation == "ADD":
                    self.snowflake_tag_helper.apply_tag_or_term(
                        entity_to_apply, tag_to_apply, self.ctx.graph
                    )
                else:
                    self.snowflake_tag_helper.remove_tag_or_term(
                        entity_to_apply, tag_to_apply, self.ctx.graph
                    )
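A quick sanity check of the urn guard used in `act()` above (the dataset urns below are hypothetical):

```python
snowflake_urn = "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.table,PROD)"
assert SnowflakeTagPropagatorAction.is_snowflake_urn(snowflake_urn)

other_urn = "urn:li:dataset:(urn:li:dataPlatform:bigquery,project.ds.table,PROD)"
assert not SnowflakeTagPropagatorAction.is_snowflake_urn(other_urn)
```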
datahub-actions/src/datahub_actions/plugin/action/stats_util.py (new file, 204 lines)
@@ -0,0 +1,204 @@
# Copyright 2021 Acryl Data, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import abc
import json
from datetime import datetime, timezone
from typing import Dict, Optional

import pydantic
from pydantic import BaseModel

from datahub.ingestion.api.report import Report, SupportsAsObj
from datahub.utilities.str_enum import StrEnum
from datahub_actions.action.action import Action
from datahub_actions.event.event_envelope import EventEnvelope
from datahub_actions.event.event_registry import (
    ENTITY_CHANGE_EVENT_V1_TYPE,
    METADATA_CHANGE_LOG_EVENT_V1_TYPE,
    EntityChangeEvent,
    MetadataChangeLogEvent,
)
from datahub_actions.pipeline.pipeline_context import PipelineContext


class EventProcessingStats(BaseModel):
    """
    A class to represent the event-oriented processing stats for a pipeline.

    Note: Might be merged into ActionStats in the future.
    """

    last_seen_event_time: Optional[str] = pydantic.Field(
        None, description="The event time of the last event we processed"
    )
    last_event_processed_time: Optional[str] = pydantic.Field(
        None, description="The time at which we processed the last event"
    )
    last_seen_event_time_success: Optional[str] = pydantic.Field(
        None, description="The event time of the last event we processed successfully"
    )
    last_event_processed_time_success: Optional[str] = pydantic.Field(
        None, description="The time at which we processed the last event successfully"
    )
    last_seen_event_time_failure: Optional[str] = pydantic.Field(
        None, description="The event time of the last event we processed unsuccessfully"
    )
    last_event_processed_time_failure: Optional[str] = pydantic.Field(
        None, description="The time at which we processed the last event unsuccessfully"
    )

    @classmethod
    def _get_event_time(cls, event: EventEnvelope) -> Optional[str]:
        """
        Get the event time from the event.
        """
        if event.event_type == ENTITY_CHANGE_EVENT_V1_TYPE:
            if isinstance(event.event, EntityChangeEvent):
                return (
                    datetime.fromtimestamp(
                        event.event.auditStamp.time / 1000.0, tz=timezone.utc
                    ).isoformat()
                    if event.event.auditStamp
                    else None
                )
        elif event.event_type == METADATA_CHANGE_LOG_EVENT_V1_TYPE:
            if isinstance(event.event, MetadataChangeLogEvent):
                return (
                    datetime.fromtimestamp(
                        event.event.auditHeader.time / 1000.0, tz=timezone.utc
                    ).isoformat()
                    if event.event.auditHeader
                    else None
                )
        return None

    def start(self, event: EventEnvelope) -> None:
        """
        Update the stats based on the event.
        """
        self.last_event_processed_time = datetime.now(tz=timezone.utc).isoformat()
        self.last_seen_event_time = self._get_event_time(event)

    def end(self, event: EventEnvelope, success: bool) -> None:
        """
        Update the stats based on the event.
        """

        if success:
            self.last_seen_event_time_success = (
                self._get_event_time(event) or self.last_seen_event_time_success
            )
            self.last_event_processed_time_success = datetime.now(
                timezone.utc
            ).isoformat()
        else:
            self.last_seen_event_time_failure = (
                self._get_event_time(event) or self.last_seen_event_time_failure
            )
            self.last_event_processed_time_failure = datetime.now(
                timezone.utc
            ).isoformat()

    def __str__(self) -> str:
        return json.dumps(self.dict(), indent=2)


class StageStatus(StrEnum):
    SUCCESS = "success"
    FAILURE = "failure"
    RUNNING = "running"
    STOPPED = "stopped"


class ActionStageReport(BaseModel):
    # All stats here are only for the current run of the current stage.

    # Attributes that should be aggregated across runs should be prefixed with "total_".
    # Only ints can be aggregated.

    start_time: int = 0

    end_time: int = 0

    total_assets_to_process: int = -1  # -1 if unknown

    total_assets_processed: int = 0

    total_actions_executed: int = 0

    total_assets_impacted: int = 0

    event_processing_stats: Optional[EventProcessingStats] = None

    status: Optional[StageStatus] = None

    def start(self) -> None:
        self.start_time = int(datetime.now().timestamp() * 1000)
        self.status = StageStatus.RUNNING

    def end(self, success: bool) -> None:
        self.end_time = int(datetime.now().timestamp() * 1000)
        self.status = StageStatus.SUCCESS if success else StageStatus.FAILURE

    def increment_assets_processed(self, asset: str) -> None:
        # TODO: If we want to track unique assets, use a counting set.
        # For now, just increment
        self.total_assets_processed += 1

    def increment_assets_impacted(self, asset: str) -> None:
        # TODO: If we want to track unique assets, use a counting set.
        # For now, just increment
        self.total_assets_impacted += 1

    def as_obj(self) -> dict:
        return Report.to_pure_python_obj(self)

    def aggregatable_stats(self) -> Dict[str, int]:
        all_items = self.dict()

        stats = {k: v for k, v in all_items.items() if k.startswith("total_")}

        # If total_assets_to_process is unknown, don't include it.
        if self.total_assets_to_process == -1:
            stats.pop("total_assets_to_process")

        # Add a few additional special cases of aggregatable stats.
        if self.event_processing_stats:
            for key, value in self.event_processing_stats.dict().items():
                if value is not None:
                    stats[f"event_processing_stats.{key}"] = str(value)

        return stats


class ReportingAction(Action, abc.ABC):
    def __init__(self, ctx: PipelineContext):
        super().__init__()
        self.ctx = ctx

        self.action_urn: str
        if "urn:li:dataHubAction:" in ctx.pipeline_name:
            # The pipeline name might get a prefix before the urn:li:... part.
            # We need to remove that prefix to get the urn:li:dataHubAction part.
            action_urn_part = ctx.pipeline_name.split("urn:li:dataHubAction:")[1]
            self.action_urn = f"urn:li:dataHubAction:{action_urn_part}"
        else:
            self.action_urn = f"urn:li:dataHubAction:{ctx.pipeline_name}"

    @abc.abstractmethod
    def get_report(self) -> ActionStageReport:
        pass


assert isinstance(ActionStageReport(), SupportsAsObj)
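A minimal usage sketch of the reporting classes above (not part of this diff; `event` and `handle` are placeholders for whatever the pipeline is processing):

```python
report = ActionStageReport()
report.start()  # records start_time and sets status to RUNNING

stats = EventProcessingStats()
stats.start(event)                  # note the last seen / last processed times
try:
    handle(event)                   # hypothetical event handler
    stats.end(event, success=True)
except Exception:
    stats.end(event, success=False)

report.event_processing_stats = stats
report.end(success=True)            # records end_time and the final status
print(report.aggregatable_stats())  # "total_*" counters plus the event stats above
```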
@@ -0,0 +1,37 @@
# Tag Sync Action

The Tag Sync (or Tag Propagation) Action allows you to propagate tags from your assets into downstream entities. E.g. you can apply a tag (like `critical`) on a dataset and have it propagate down to all the downstream datasets.

## Configurability

You can control which tags should be propagated downstream using a prefix system. E.g. you can specify that only tags that start with `tier:` should be propagated downstream.

## Additions and Removals

The action supports both additions and removals of tags.

### Example Config

```yaml
name: "tag_propagation"
source:
  type: "kafka"
  config:
    connection:
      bootstrap: ${KAFKA_BOOTSTRAP_SERVER:-localhost:9092}
      schema_registry_url: ${SCHEMA_REGISTRY_URL:-http://localhost:8081}
filter:
  event_type: "EntityChangeEvent_v1"
action:
  type: "tag_propagation"
  config:
    tag_prefixes:
      - classification

datahub:
  server: "http://localhost:8080"
```

## Caveats

- Tag Propagation is currently only supported for downstream datasets. Tags will not propagate to downstream dashboards or charts. Let us know if this is an important feature for you.
@@ -0,0 +1,13 @@
# Copyright 2021 Acryl Data, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@@ -0,0 +1,162 @@
# Copyright 2021 Acryl Data, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from typing import List, Optional

from pydantic import BaseModel, Field, validator

from datahub.configuration.common import ConfigModel
from datahub.emitter.mce_builder import make_tag_urn
from datahub_actions.action.action import Action
from datahub_actions.event.event_envelope import EventEnvelope
from datahub_actions.event.event_registry import EntityChangeEvent
from datahub_actions.pipeline.pipeline_context import PipelineContext

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


class TagPropagationConfig(ConfigModel):
    """
    Configuration model for tag propagation.

    Attributes:
        enabled (bool): Indicates whether tag propagation is enabled or not. Default is True.
        tag_prefixes (Optional[List[str]]): Optional list of tag prefixes to restrict tag propagation.
            If provided, only tags with prefixes in this list will be propagated. Default is None,
            meaning all tags will be propagated.

    Note:
        Tag propagation allows tags to be automatically propagated to downstream entities.
        Enabling tag propagation can help maintain consistent metadata across connected entities.
        The `enabled` attribute controls whether tag propagation is enabled or disabled.
        The `tag_prefixes` attribute can be used to specify a list of tag prefixes that define which tags
        should be propagated. If no prefixes are specified (default), all tags will be propagated.

    Example:
        config = TagPropagationConfig(enabled=True, tag_prefixes=["urn:li:tag:"])
    """

    enabled: bool = Field(
        True,
        description="Indicates whether tag propagation is enabled or not.",
    )
    tag_prefixes: Optional[List[str]] = Field(
        None,
        description="Optional list of tag prefixes to restrict tag propagation.",
        examples=[
            "urn:li:tag:classification",
        ],
    )

    @validator("tag_prefixes", each_item=True)
    def tag_prefix_should_start_with_urn(cls, v: str) -> str:
        if v:
            return make_tag_urn(v)
        return v


class TagPropagationDirective(BaseModel):
    propagate: bool
    tag: str
    operation: str
    entity: str


class TagPropagationAction(Action):
    def __init__(self, config: TagPropagationConfig, ctx: PipelineContext):
        self.config: TagPropagationConfig = config
        self.ctx = ctx

    @classmethod
    def create(cls, config_dict, ctx):
        config = TagPropagationConfig.parse_obj(config_dict or {})
        logger.info(f"TagPropagationAction configured with {config}")
        return cls(config, ctx)

    def name(self) -> str:
        return "TagPropagator"

    def should_propagate(
        self, event: EventEnvelope
    ) -> Optional[TagPropagationDirective]:
        """
        Return a tag urn to propagate or None if no propagation is desired
        """
        if event.event_type == "EntityChangeEvent_v1":
            assert isinstance(event.event, EntityChangeEvent)
            assert self.ctx.graph is not None
            semantic_event = event.event
            if semantic_event.category == "TAG" and (
                semantic_event.operation == "ADD"
                or semantic_event.operation == "REMOVE"
            ):
                assert semantic_event.modifier, "tag urn should be present"
                propagate = self.config.enabled
                if self.config.tag_prefixes:
                    propagate = any(
                        [
                            True
                            for prefix in self.config.tag_prefixes
                            if semantic_event.modifier.startswith(prefix)
                        ]
                    )
                    if not propagate:
                        logger.debug(f"Not propagating {semantic_event.modifier}")
                if propagate:
                    return TagPropagationDirective(
                        propagate=True,
                        tag=semantic_event.modifier,
                        operation=semantic_event.operation,
                        entity=semantic_event.entityUrn,
                    )
                else:
                    return TagPropagationDirective(
                        propagate=False,
                        tag=semantic_event.modifier,
                        operation=semantic_event.operation,
                        entity=semantic_event.entityUrn,
                    )
        return None

    def act(self, event: EventEnvelope) -> None:
        tag_propagation_directive = self.should_propagate(event)
        if tag_propagation_directive is not None:
            if tag_propagation_directive.propagate:
                # find downstream lineage
                assert self.ctx.graph
                entity_urn: str = tag_propagation_directive.entity
                downstreams = self.ctx.graph.get_downstreams(entity_urn)
                logger.info(
                    f"Detected {len(downstreams)} downstreams for {entity_urn}: {downstreams}"
                )
                logger.info(
                    f"Detected {tag_propagation_directive.tag} {tag_propagation_directive.operation} on {tag_propagation_directive.entity}"
                )
                # apply tags to downstreams
                for d in downstreams:
                    self.ctx.graph.add_tags_to_dataset(
                        d,
                        [tag_propagation_directive.tag],
                        context={
                            "propagated": True,
                            "origin": tag_propagation_directive.entity,
                        },
                    )
            else:
                logger.debug(f"Not propagating {tag_propagation_directive.tag}")

    def close(self) -> None:
        return
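To illustrate the prefix filter above (values are hypothetical, not part of this diff): the validator normalizes a bare prefix such as `classification` into `urn:li:tag:classification` via `make_tag_urn`, and `should_propagate` then does a plain `startswith` match against the tag urn carried on the change event.

```python
config = TagPropagationConfig(enabled=True, tag_prefixes=["classification"])
# After validation, config.tag_prefixes == ["urn:li:tag:classification"]

modifier = "urn:li:tag:classification.pii"  # tag urn taken from the event
propagate = any(modifier.startswith(p) for p in config.tag_prefixes)  # True
```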
datahub-actions/src/datahub_actions/plugin/action/teams/teams.py (new file, 103 lines)
@@ -0,0 +1,103 @@
# Copyright 2021 Acryl Data, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging

import pymsteams
from pydantic import SecretStr
from ratelimit import limits, sleep_and_retry

from datahub.configuration.common import ConfigModel
from datahub.metadata.schema_classes import EntityChangeEventClass as EntityChangeEvent
from datahub_actions.action.action import Action
from datahub_actions.event.event_envelope import EventEnvelope
from datahub_actions.pipeline.pipeline_context import PipelineContext
from datahub_actions.utils.datahub_util import DATAHUB_SYSTEM_ACTOR_URN
from datahub_actions.utils.social_util import (
    get_message_from_entity_change_event,
    get_welcome_message,
    pretty_any_text,
)

logger = logging.getLogger(__name__)


@sleep_and_retry
@limits(calls=1, period=1)  # 1 call per second
def post_message(message_card, message):
    message_card.text(message)
    message_card.send()


class TeamsNotificationConfig(ConfigModel):
    webhook_url: SecretStr
    base_url: str = "http://localhost:9002/"
    suppress_system_activity: bool = True


class TeamsNotificationAction(Action):
    def name(self):
        return "TeamsNotificationAction"

    def close(self) -> None:
        pass

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "Action":
        action_config = TeamsNotificationConfig.parse_obj(config_dict or {})
        logger.info(f"Teams notification action configured with {action_config}")
        return cls(action_config, ctx)

    def _new_card(self):
        return pymsteams.connectorcard(
            self.action_config.webhook_url.get_secret_value()
        )

    def __init__(self, action_config: TeamsNotificationConfig, ctx: PipelineContext):
        self.action_config = action_config
        self.ctx = ctx
        welcome_card = self._new_card()
        structured_message = get_welcome_message(self.action_config.base_url)
        welcome_card.title(structured_message.title)
        message_section = pymsteams.cardsection()
        for k, v in structured_message.properties.items():
            message_section.addFact(k, pretty_any_text(v, channel="teams"))
        welcome_card.addSection(message_section)
        post_message(welcome_card, structured_message.text)

    def act(self, event: EventEnvelope) -> None:
        try:
            message = json.dumps(json.loads(event.as_json()), indent=4)
            logger.debug(f"Received event: {message}")
            if event.event_type == "EntityChangeEvent_v1":
                assert isinstance(event.event, EntityChangeEvent)
                if (
                    event.event.auditStamp.actor == DATAHUB_SYSTEM_ACTOR_URN
                    and self.action_config.suppress_system_activity
                ):
                    return None

                semantic_message = get_message_from_entity_change_event(
                    event.event,
                    self.action_config.base_url,
                    self.ctx.graph.graph if self.ctx.graph else None,
                    channel="teams",
                )
                message_card = self._new_card()
                post_message(message_card, semantic_message)
            else:
                logger.debug("Skipping message because it didn't match our filter")
        except Exception as e:
            logger.debug(f"Failed to process event: {e}")
@@ -0,0 +1,56 @@
# Glossary Term Propagation Action

The Glossary Term Propagation Action allows you to propagate glossary terms from your assets into downstream entities.

## Use Cases

Enable classification of datasets or fields of datasets and have that metadata propagate to downstream datasets with minimal manual work.

## Functionality

Propagation can be controlled via a specified list of terms or a specified list of term groups.

### Target Terms

- Given a list of "target terms", the propagation action will detect application of a target term to any field or dataset and propagate it down (as a dataset-level term) to all downstream datasets. For example, given a target term of `Classification.Confidential` (the default), if you apply the `Classification.Confidential` term to a dataset (at the dataset level or a field level), this action will find all the downstream datasets and apply the `Classification.Confidential` term to them at the dataset level. Note that downstream application is only at the dataset level, regardless of whether the primary application was at the field level or the dataset level.
- This action also supports term linkage. If you apply a term that is linked to the target term via inheritance, then this action will detect that application and propagate it downstream as well. For example, if the term `PersonalInformation.Email` inherits `Classification.Confidential` (the target term), and you apply the `PersonalInformation.Email` term to a dataset (or a field in the dataset), it will be picked up by the action, and the `PersonalInformation.Email` term will be applied at the dataset level to all the downstream entities.

### Term Groups

- Given a list of "term groups", the propagation action will only propagate terms that belong to these term groups.

### Additions and Removals

The action supports propagation of term additions and removals.

## Configurability

You can control what the target term should be. Linkage to the target term is controlled through your business glossary, which is completely under your control.

### Example Config

```yaml
name: "term_propagation"
source:
  type: "kafka"
  config:
    connection:
      bootstrap: ${KAFKA_BOOTSTRAP_SERVER:-localhost:9092}
      schema_registry_url: ${SCHEMA_REGISTRY_URL:-http://localhost:8081}
filter:
  event_type: "EntityChangeEvent_v1"
action:
  type: "term_propagation"
  config:
    target_terms:
      - Classification
    term_groups:
      - "Personal Information"

datahub:
  server: "http://localhost:8080"
```

## Caveats

- Term Propagation is currently only supported for downstream datasets. Terms will not propagate to downstream dashboards or charts. Let us know if this is an important feature for you.
@@ -0,0 +1,13 @@
# Copyright 2021 Acryl Data, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@@ -0,0 +1,192 @@
# Copyright 2021 Acryl Data, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from typing import List, Optional

from pydantic import BaseModel, Field

from datahub.configuration.common import ConfigModel
from datahub.metadata.schema_classes import EntityChangeEventClass as EntityChangeEvent
from datahub_actions.action.action import Action
from datahub_actions.event.event_envelope import EventEnvelope
from datahub_actions.pipeline.pipeline_context import PipelineContext
from datahub_actions.plugin.action.utils.term_resolver import GlossaryTermsResolver

logger = logging.getLogger(__name__)


class TermPropagationDirective(BaseModel):
    propagate: bool
    term: str
    operation: str
    entity: str


class TermPropagationConfig(ConfigModel):
    """
    Configuration model for term propagation.

    Attributes:
        enabled (bool): Indicates whether term propagation is enabled or not. Default is True.
        target_term (Optional[str]): Optional target term to restrict term propagation.
            If provided, only this specific term and terms related to it via `isA` relationship will be propagated.
            Default is None, meaning all terms will be propagated.
        term_groups (Optional[List[str]]): Optional list of term groups to restrict term propagation.
            If provided, only terms within these groups will be propagated. Default is None, meaning all term groups will be propagated.

    Note:
        Term propagation allows terms to be automatically propagated to downstream entities.
        Enabling term propagation can help maintain consistent metadata across connected entities.
        The `enabled` attribute controls whether term propagation is enabled or disabled.
        The `target_terms` attribute can be used to specify a set of specific terms or all terms related to these specific terms that should be propagated.
        The `term_groups` attribute can be used to specify a list of term groups to restrict propagation to.

    Example:
        config = TermPropagationConfig(enabled=True, target_terms=["urn:li:glossaryTerm:Sensitive"])
    """

    enabled: bool = Field(
        True,
        description="Indicates whether term propagation is enabled or not.",
    )
    target_terms: Optional[List[str]] = Field(
        None,
        description="Optional target terms to restrict term propagation to this and all terms related to these terms.",
        examples=[
            "urn:li:glossaryTerm:Sensitive",
        ],
    )
    term_groups: Optional[List[str]] = Field(
        None,
        description="Optional list of term groups to restrict term propagation.",
        examples=[
            "Group1",
            "Group2",
        ],
    )


class TermPropagationAction(Action):
    def __init__(self, config: TermPropagationConfig, ctx: PipelineContext):
        self.config = config
        self.ctx = ctx
        self.term_resolver = GlossaryTermsResolver(graph=self.ctx.graph)
        if self.config.target_terms:
            logger.info(
                f"[Config] Will propagate terms that inherit from terms {self.config.target_terms}"
            )
            resolved_terms = []
            for t in self.config.target_terms:
                if t.startswith("urn:li:glossaryTerm"):
                    resolved_terms.append(t)
                else:
                    resolved_term = self.term_resolver.get_glossary_term_urn(t)
                    if not resolved_term:
                        raise Exception(f"Failed to resolve term by name {t}")
                    resolved_terms.append(resolved_term)
            self.config.target_terms = resolved_terms
            logger.info(
                f"[Config] Will propagate terms that inherit from terms {self.config.target_terms}"
            )

        if self.config.term_groups:
            resolved_nodes = []
            for node in self.config.term_groups:
                if node.startswith("urn:li:glossaryNode"):
                    resolved_nodes.append(node)
                else:
                    resolved_node = self.term_resolver.get_glossary_node_urn(node)
                    if not resolved_node:
                        raise Exception(f"Failed to resolve node by name {node}")
                    resolved_nodes.append(resolved_node)
            self.config.term_groups = resolved_nodes
            logger.info(
                f"[Config] Will propagate all terms in groups {self.config.term_groups}"
            )

    def name(self) -> str:
        return "TermPropagator"

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "Action":
        action_config = TermPropagationConfig.parse_obj(config_dict or {})
        logger.info(f"Term Propagation Config action configured with {action_config}")
        return cls(action_config, ctx)

    def should_propagate(
        self, event: EventEnvelope
    ) -> Optional[TermPropagationDirective]:
        if event.event_type == "EntityChangeEvent_v1":
            assert isinstance(event.event, EntityChangeEvent)
            assert self.ctx.graph is not None
            semantic_event = event.event
            if (
                semantic_event.category == "GLOSSARY_TERM"
                and self.config is not None
                and self.config.enabled
            ):
                assert semantic_event.modifier
                for target_term in self.config.target_terms or [
                    semantic_event.modifier
                ]:
                    # a cheap way to handle optionality and always propagate if config is not set
                    # Check which terms have connectivity to the target term
                    if (
                        semantic_event.modifier == target_term  # term has been directly applied
                        or self.ctx.graph.check_relationship(  # term is indirectly associated
                            target_term,
                            semantic_event.modifier,
                            "IsA",
                        )
                    ):
                        return TermPropagationDirective(
                            propagate=True,
                            term=semantic_event.modifier,
                            operation=semantic_event.operation,
                            entity=semantic_event.entityUrn,
                        )
        return None

    def act(self, event: EventEnvelope) -> None:
        """This method responds to changes to glossary terms and propagates them to downstream entities"""

        term_propagation_directive = self.should_propagate(event)

        if (
            term_propagation_directive is not None
            and term_propagation_directive.propagate
        ):
            assert self.ctx.graph
            # find downstream lineage
            downstreams = self.ctx.graph.get_downstreams(
                entity_urn=term_propagation_directive.entity
            )

            # apply terms to downstreams
            for dataset in downstreams:
                self.ctx.graph.add_terms_to_dataset(
                    dataset,
                    [term_propagation_directive.term],
                    context={
                        "propagated": True,
                        "origin": term_propagation_directive.entity,
                    },
                )
                logger.info(
                    f"Will add term {term_propagation_directive.term} to {dataset}"
                )

    def close(self) -> None:
        return
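To illustrate the matching rule in `should_propagate` above (the urns are hypothetical and `graph` stands for `self.ctx.graph`): a term qualifies if it is the target term itself, or is related to it via an `IsA` relationship in the graph.

```python
target_term = "urn:li:glossaryTerm:Classification.Confidential"
applied_term = "urn:li:glossaryTerm:PersonalInformation.Email"

matches = applied_term == target_term or graph.check_relationship(
    target_term, applied_term, "IsA"
)
# True whenever PersonalInformation.Email inherits from Classification.Confidential
```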
Some files were not shown because too many files have changed in this diff.