diff --git a/metadata-ingestion/src/datahub/ingestion/source/ldap.py b/metadata-ingestion/src/datahub/ingestion/source/ldap.py index 5e2d31a803..223b395f3a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ldap.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ldap.py @@ -1,5 +1,6 @@ """LDAP Source""" import dataclasses +import re from typing import Any, Dict, Iterable, List, Optional import ldap @@ -22,6 +23,7 @@ from datahub.metadata.schema_classes import ( CorpGroupSnapshotClass, CorpUserInfoClass, CorpUserSnapshotClass, + GroupMembershipClass, ) # default mapping for attrs @@ -42,6 +44,7 @@ user_attrs_map["departmentId"] = "departmentNumber" user_attrs_map["title"] = "title" user_attrs_map["departmentName"] = "departmentNumber" user_attrs_map["countryCode"] = "countryCode" +user_attrs_map["memberOf"] = "memberOf" # group related attrs group_attrs_map["urn"] = "cn" @@ -94,6 +97,7 @@ class LDAPSourceConfig(ConfigModel): # Extraction configuration. base_dn: str = Field(description="LDAP DN.") filter: str = Field(default="(objectClass=*)", description="LDAP extractor filter.") + attrs_list: List[str] = Field(default=None, description="Retrieved attributes list") # If set to true, any users without first and last names will be dropped. drop_missing_first_last_name: bool = Field( @@ -201,6 +205,7 @@ class LDAPSource(Source): self.config.base_dn, ldap.SCOPE_SUBTREE, self.config.filter, + self.config.attrs_list, serverctrls=[self.lc], ) _rtype, rdata, _rmsgid, serverctrls = self.ldap_client.result3(msgid) @@ -228,6 +233,7 @@ class LDAPSource(Source): elif ( b"posixGroup" in attrs["objectClass"] or b"organizationalUnit" in attrs["objectClass"] + or b"groupOfNames" in attrs["objectClass"] or b"group" in attrs["objectClass"] ): yield from self.handle_group(dn, attrs) @@ -301,6 +307,7 @@ class LDAPSource(Source): full_name = attrs[self.config.user_attrs_map["fullName"]][0].decode() first_name = attrs[self.config.user_attrs_map["firstName"]][0].decode() last_name = attrs[self.config.user_attrs_map["lastName"]][0].decode() + groups = parse_groups(attrs, self.config.user_attrs_map["memberOf"]) email = ( (attrs[self.config.user_attrs_map["email"]][0]).decode() @@ -334,27 +341,30 @@ class LDAPSource(Source): ) manager_urn = f"urn:li:corpuser:{manager_ldap}" if manager_ldap else None - return MetadataChangeEvent( - proposedSnapshot=CorpUserSnapshotClass( - urn=f"urn:li:corpuser:{ldap_user}", - aspects=[ - CorpUserInfoClass( - active=True, - email=email, - fullName=full_name, - firstName=first_name, - lastName=last_name, - departmentId=department_id, - departmentName=department_name, - displayName=display_name, - countryCode=country_code, - title=title, - managerUrn=manager_urn, - ) - ], - ) + user_snapshot = CorpUserSnapshotClass( + urn=f"urn:li:corpuser:{ldap_user}", + aspects=[ + CorpUserInfoClass( + active=True, + email=email, + fullName=full_name, + firstName=first_name, + lastName=last_name, + departmentId=department_id, + departmentName=department_name, + displayName=display_name, + countryCode=country_code, + title=title, + managerUrn=manager_urn, + ) + ], ) + if groups: + user_snapshot.aspects.append(GroupMembershipClass(groups=groups)) + + return MetadataChangeEvent(proposedSnapshot=user_snapshot) + def build_corp_group_mce(self, attrs: dict) -> Optional[MetadataChangeEvent]: """Creates a MetadataChangeEvent for LDAP groups.""" cn = attrs.get(self.config.group_attrs_map["urn"]) @@ -417,3 +427,19 @@ def strip_ldap_info(input_clean: bytes) -> str: """Converts a b'uid=username,ou=Groups,dc=internal,dc=machines' format to username""" return input_clean.decode().split(",")[0].lstrip("uid=") + + +def parse_groups(attrs: Dict[str, Any], filter_key: str) -> List[str]: + """Converts a list of LDAP groups to Datahub corpgroup strings""" + if filter_key in attrs: + return [ + f"urn:li:corpGroup:{strip_ldap_group_cn(ldap_group)}" + for ldap_group in attrs[filter_key] + ] + return [] + + +def strip_ldap_group_cn(input_clean: bytes) -> str: + """Converts a b'cn=group_name,ou=Groups,dc=internal,dc=machines' + format to group name""" + return re.sub("cn=", "", input_clean.decode().split(",")[0], flags=re.IGNORECASE) diff --git a/metadata-ingestion/tests/integration/ldap/ldap_mces_golden.json b/metadata-ingestion/tests/integration/ldap/ldap_mces_golden.json index 2f4c4defca..c88815bfba 100644 --- a/metadata-ingestion/tests/integration/ldap/ldap_mces_golden.json +++ b/metadata-ingestion/tests/integration/ldap/ldap_mces_golden.json @@ -151,5 +151,93 @@ "runId": "ldap-test", "properties": null } +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.CorpUserSnapshot": { + "urn": "urn:li:corpuser:hbevan", + "aspects": [ + { + "com.linkedin.pegasus2avro.identity.CorpUserInfo": { + "active": true, + "displayName": "Hester Bevan", + "email": "hbevan", + "title": null, + "managerUrn": null, + "departmentId": null, + "departmentName": null, + "firstName": "Hester", + "lastName": "Bevan", + "fullName": "Hester Bevan", + "countryCode": null + } + } + ] + } + }, + "proposedDelta": null, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "ldap-test", + "properties": null + } +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.CorpUserSnapshot": { + "urn": "urn:li:corpuser:ehaas", + "aspects": [ + { + "com.linkedin.pegasus2avro.identity.CorpUserInfo": { + "active": true, + "displayName": "Evalyn Haas", + "email": "ehaas", + "title": null, + "managerUrn": null, + "departmentId": null, + "departmentName": null, + "firstName": "Evalyn", + "lastName": "Haas", + "fullName": "Evalyn Haas", + "countryCode": null + } + } + ] + } + }, + "proposedDelta": null, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "ldap-test", + "properties": null + } +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.CorpGroupSnapshot": { + "urn": "urn:li:corpGroup:HR Department", + "aspects": [ + { + "com.linkedin.pegasus2avro.identity.CorpGroupInfo": { + "displayName": null, + "email": "HR Department", + "admins": [], + "members": [], + "groups": [], + "description": null + } + } + ] + } + }, + "proposedDelta": null, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "ldap-test", + "properties": null + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/ldap/ldap_memberof_mces_golden.json b/metadata-ingestion/tests/integration/ldap/ldap_memberof_mces_golden.json new file mode 100644 index 0000000000..1ac0c691f3 --- /dev/null +++ b/metadata-ingestion/tests/integration/ldap/ldap_memberof_mces_golden.json @@ -0,0 +1,82 @@ +[ +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.CorpUserSnapshot": { + "urn": "urn:li:corpuser:hbevan", + "aspects": [ + { + "com.linkedin.pegasus2avro.identity.CorpUserInfo": { + "active": true, + "displayName": "Hester Bevan", + "email": "hbevan", + "title": null, + "managerUrn": null, + "departmentId": null, + "departmentName": null, + "firstName": "Hester", + "lastName": "Bevan", + "fullName": "Hester Bevan", + "countryCode": null + } + }, + { + "com.linkedin.pegasus2avro.identity.GroupMembership": { + "groups": [ + "urn:li:corpGroup:HR Department" + ] + } + } + ] + } + }, + "proposedDelta": null, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "ldap-test", + "registryName": null, + "registryVersion": null, + "properties": null + } +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.CorpUserSnapshot": { + "urn": "urn:li:corpuser:ehaas", + "aspects": [ + { + "com.linkedin.pegasus2avro.identity.CorpUserInfo": { + "active": true, + "displayName": "Evalyn Haas", + "email": "ehaas", + "title": null, + "managerUrn": null, + "departmentId": null, + "departmentName": null, + "firstName": "Evalyn", + "lastName": "Haas", + "fullName": "Evalyn Haas", + "countryCode": null + } + }, + { + "com.linkedin.pegasus2avro.identity.GroupMembership": { + "groups": [ + "urn:li:corpGroup:HR Department" + ] + } + } + ] + } + }, + "proposedDelta": null, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "ldap-test", + "registryName": null, + "registryVersion": null, + "properties": null + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/ldap/setup/custom/sample.ldif b/metadata-ingestion/tests/integration/ldap/setup/custom/sample.ldif index eb8c86f45d..ecb39d64ea 100644 --- a/metadata-ingestion/tests/integration/ldap/setup/custom/sample.ldif +++ b/metadata-ingestion/tests/integration/ldap/setup/custom/sample.ldif @@ -9,6 +9,19 @@ version: 1 +# Enable memberOf overlay +dn: olcOverlay=memberof,olcDatabase={1}mdb,cn=config +objectClass: olcMemberOf +objectClass: olcOverlayConfig +objectClass: olcConfig +objectClass: top +olcOverlay: memberof +olcMemberOfDangling: ignore +olcMemberOfRefInt: TRUE +olcMemberOfGroupOC: groupOfNames +olcMemberOfMemberAD: member +olcMemberOfMemberOfAD: memberOf + # Entry 1: dc=example,dc=org # Note: this is commented out because the containers bootstrap scripts already # handle this for us. @@ -113,3 +126,39 @@ dn: ou=Sales Department,dc=example,dc=org objectclass: organizationalUnit objectclass: top ou: Sales Department + +# Entry 10: cn=Hester Bevan,ou=people,dc=example,dc=org +dn: cn=Hester Bevan,ou=people,dc=example,dc=org +cn: Hester Bevan +gidnumber: 500 +givenname: Hester +homedirectory: /home/users/hbevan +objectclass: inetOrgPerson +objectclass: posixAccount +objectclass: top +sn: Bevan +uid: hbevan +uidnumber: 1004 +userpassword: {MD5}4QrcOUm6Wau+VuBX8g+IPg== + +# Entry 11: cn=Evalyn Haas,ou=people,dc=example,dc=org +dn: cn=Evalyn Haas,ou=people,dc=example,dc=org +cn: Evalyn Haas +gidnumber: 500 +givenname: Evalyn +homedirectory: /home/users/ehaas +objectclass: inetOrgPerson +objectclass: posixAccount +objectclass: top +sn: Haas +uid: ehaas +uidnumber: 1005 +userpassword: {MD5}4QrcOUm6Wau+VuBX8g+IPg== + +# Entry 12: cn=HR Department,ou=groups,dc=example,dc=org +dn: cn=HR Department,dc=example,dc=org +cn: HR Department +objectclass: groupOfNames +objectclass: top +member: cn=Hester Bevan,ou=people,dc=example,dc=org +member: cn=Evalyn Haas,ou=people,dc=example,dc=org \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/ldap/test_ldap.py b/metadata-ingestion/tests/integration/ldap/test_ldap.py index 6d96bbda2b..3899238887 100644 --- a/metadata-ingestion/tests/integration/ldap/test_ldap.py +++ b/metadata-ingestion/tests/integration/ldap/test_ldap.py @@ -50,3 +50,50 @@ def test_ldap_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time): output_path=tmp_path / "ldap_mces.json", golden_path=test_resources_dir / "ldap_mces_golden.json", ) + + +@pytest.mark.integration +def test_ldap_memberof_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time): + test_resources_dir = pytestconfig.rootpath / "tests/integration/ldap" + + with docker_compose_runner( + test_resources_dir / "docker-compose.yml", "ldap" + ) as docker_services: + # The openldap container loads the sample data after exposing the port publicly. As such, + # we must wait a little bit extra to ensure that the sample data is loaded. + wait_for_port(docker_services, "openldap", 389) + time.sleep(5) + + pipeline = Pipeline.create( + { + "run_id": "ldap-test", + "source": { + "type": "ldap", + "config": { + "ldap_server": "ldap://localhost", + "ldap_user": "cn=admin,dc=example,dc=org", + "ldap_password": "admin", + "base_dn": "dc=example,dc=org", + "filter": "(memberOf=cn=HR Department,dc=example,dc=org)", + "attrs_list": ["+", "*"], + "group_attrs_map": { + "members": "member", + }, + }, + }, + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/ldap_memberof_mces.json", + }, + }, + } + ) + pipeline.run() + pipeline.raise_from_status() + + mce_helpers.check_golden_file( + pytestconfig, + output_path=tmp_path / "ldap_memberof_mces.json", + golden_path=test_resources_dir / "ldap_memberof_mces_golden.json", + )