diff --git a/src/MappingValidator/MappingValidator.php b/src/MappingValidator/MappingValidator.php index b381f5c..2ff02f0 100644 --- a/src/MappingValidator/MappingValidator.php +++ b/src/MappingValidator/MappingValidator.php @@ -1,423 +1,492 @@ sparqlClient = new SparqlClient(); $this->cache = $cache; }

	/**
	 * Validates the Wikidata statements that map entities into an external vocabulary.
	 *
	 * The following mappings whose target URI starts with $vocabularyPrefix are checked
	 * against the triples loaded from $vocabularyDefinitionUrl:
	 * equivalent class (P1709), equivalent property (P1628), the properties P2235 and
	 * P2236 (handled as "sub/super properties"), and exact match (P2888).
	 *
	 * When the prefix is not a valid URL, or when no triple could be loaded, a single
	 * message is returned immediately. Otherwise the collected messages are returned
	 * after entity ids have been replaced by "label (id)" where a label is known.
	 *
	 * @param string $vocabularyPrefix URL prefix shared by the URIs of the vocabulary
	 * @param string $vocabularyDefinitionUrl URL of the RDF definition of the vocabulary
	 * @return ValidationMessage[]
	 */
	public function validate( string $vocabularyPrefix, string $vocabularyDefinitionUrl ): array {
		$messages = [];

		if ( !filter_var( $vocabularyPrefix, FILTER_VALIDATE_URL ) ) {
			// Report the value that was actually checked.
			$messages[] = new ValidationMessage(
				ValidationMessage::LEVEL_ERROR,
				"{$vocabularyPrefix} is not a valid URL"
			);
			return $messages;
		}

		$triples = $this->loadTriples( $vocabularyDefinitionUrl );
		$count = $triples->count();
		if ( $count === 0 ) {
			$messages[] = new ValidationMessage(
				ValidationMessage::LEVEL_INFO,
				"No machine readable description of the {$vocabularyPrefix} vocabulary found. " .
				"Please set an URL to a RDF definition of the vocabulary " .
				"in the 'Vocabulary RDF definition' parameter."
			);
			return $messages;
		}
		$messages[] = new ValidationMessage(
			ValidationMessage::LEVEL_INFO,
			"{$count} triple extracted from the definition file {$vocabularyDefinitionUrl}"
		);

		// We do checks for equivalent class
		// We first build a WD => external mapping
		// with error if there is a conflict and do some preliminary checks
		$entityToClassMapping = [];
		$classToEntityMapping = [];
		foreach ( $this->pairsForWikidataProperties( [ 'P1709' ], $vocabularyPrefix ) as $mapping ) {
			$uri = $mapping['uri'];
			$entity = $mapping['entity'];

			// We check that all equivalent classes have the rdfs:Class type
			if ( !$this->isClass( $uri, $triples ) ) {
				$messages[] = new ValidationMessage(
					ValidationMessage::LEVEL_ERROR,
					"{$this->formatLink($uri)} is mapped " .
					"as equivalent class to {$this->formatLink($entity)} " .
					"but is not an instance of rdfs:Class or owl:Class."
				);
			}

			// We check that all equivalent classes are used on item entities
			if ( strpos( $entity, 'http://www.wikidata.org/entity/Q' ) !== 0 ) {
				$messages[] = new ValidationMessage(
					ValidationMessage::LEVEL_ERROR,
					"{$this->formatLink($uri)} is not mapped " .
					"to a Wikidata item but to {$this->formatLink($entity)}."
				);
			}

			// The same WD entity maps to multiple things
			if ( array_key_exists( $entity, $entityToClassMapping ) ) {
				$messages[] = new ValidationMessage(
					ValidationMessage::LEVEL_WARNING,
					"{$this->formatLink($entity)} is mapped as equivalent class to both " .
					"{$this->formatLink($entityToClassMapping[$entity])} and {$this->formatLink($uri)}."
				);
			}

			// Multiple WD entities map to the same thing
			if ( array_key_exists( $uri, $classToEntityMapping ) ) {
				$messages[] = new ValidationMessage(
					ValidationMessage::LEVEL_WARNING,
					"{$this->formatLink($uri)} is mapped as equivalent class from both " .
					"{$this->formatLink($classToEntityMapping[$uri])} and {$this->formatLink($entity)}."
				);
			}

			$entityToClassMapping[$entity] = $uri;
			$classToEntityMapping[$uri] = $entity;
		}

		// We check that, if an element is subclass of an other in Wikidata,
		// it is sub class of in the ontology
		// TODO: make it do more than 1 hop
		$wdSubClassOfHierarchy = $this->wikidataDirectChildOf(
			array_keys( $entityToClassMapping ),
			'P279'
		);
		foreach ( $wdSubClassOfHierarchy as $map ) {
			$sub = $map['sub'];
			$super = $map['super'];
			if ( !$this->isInSuperHierarchy(
				$entityToClassMapping[$sub],
				$entityToClassMapping[$super],
				self::RDFS_SUB_CLASS_OF,
				$triples
			) ) {
				$messages[] = new ValidationMessage(
					ValidationMessage::LEVEL_WARNING,
					"{$this->formatLink($sub)} is a sub class of {$this->formatLink($super)} but " .
					"{$this->formatLink($entityToClassMapping[$sub])} is not a sub class of " .
					"{$this->formatLink($entityToClassMapping[$super])}."
				);
			}
		}

		// We do checks for equivalent properties
		// We first build a WD => external mapping
		// with error if there is a conflict and do some preliminary checks
		$entityToPropertyMapping = [];
		$propertyToEntityMapping = [];
		foreach ( $this->pairsForWikidataProperties( [ 'P1628' ], $vocabularyPrefix ) as $mapping ) {
			$uri = $mapping['uri'];
			$entity = $mapping['entity'];

			$this->validateIsBetweenProperty( $entity, $uri, $messages, $triples );

			// The same WD entity maps to multiple things
			if ( array_key_exists( $entity, $entityToPropertyMapping ) ) {
				$messages[] = new ValidationMessage(
					ValidationMessage::LEVEL_WARNING,
					"{$this->formatLink($entity)} is mapped as equivalent property to both " .
					"{$this->formatLink($entityToPropertyMapping[$entity])} and {$this->formatLink($uri)}."
				);
			}

			// Multiple WD entities map to the same thing
			if ( array_key_exists( $uri, $propertyToEntityMapping ) ) {
				$messages[] = new ValidationMessage(
					ValidationMessage::LEVEL_WARNING,
					"{$this->formatLink($uri)} is mapped as equivalent property from both " .
					"{$this->formatLink($propertyToEntityMapping[$uri])} and {$this->formatLink($entity)}."
				);
			}

			$entityToPropertyMapping[$entity] = $uri;
			$propertyToEntityMapping[$uri] = $entity;
		}

		// We check that, if an element is subproperty of an other in Wikidata,
		// it is sub property of it in the ontology
		// TODO: make it do more than 1 hop
		$wdSubPropertyOfHierarchy = $this->wikidataDirectChildOf(
			array_keys( $entityToPropertyMapping ),
			'P1647'
		);
		foreach ( $wdSubPropertyOfHierarchy as $map ) {
			$sub = $map['sub'];
			$super = $map['super'];
			if ( !$this->isInSuperHierarchy(
				$entityToPropertyMapping[$sub],
				$entityToPropertyMapping[$super],
				self::RDFS_SUB_PROPERTY_OF,
				$triples
			) ) {
				$messages[] = new ValidationMessage(
					ValidationMessage::LEVEL_WARNING,
					"{$this->formatLink($sub)} is a sub property of {$this->formatLink($super)} but " .
					"{$this->formatLink($entityToPropertyMapping[$sub])} is not a sub property of " .
					"{$this->formatLink($entityToPropertyMapping[$super])}."
				);
			}
		}

		// We do checks for sub/super properties
		foreach ( $this->pairsForWikidataProperties( [ 'P2235', 'P2236' ], $vocabularyPrefix ) as $mapping ) {
			$this->validateIsBetweenProperty( $mapping['entity'], $mapping['uri'], $messages, $triples );
		}

		// We do checks for exact match properties
		$entityToUriMatchMapping = [];
		$uriToEntityMatchMapping = [];
		foreach ( $this->pairsForWikidataProperties( [ 'P2888' ], $vocabularyPrefix ) as $mapping ) {
			$uri = $mapping['uri'];
			$entity = $mapping['entity'];

			// We check that all target of exact match mappings are not classes or properties:
			// a dedicated mapping property exists for each of these two cases
			if ( $this->isProperty( $uri, $triples ) ) {
				$messages[] = new ValidationMessage(
					ValidationMessage::LEVEL_WARNING,
					"{$this->formatLink($uri)} is an exact match of " .
					"{$this->formatLink($entity)} " .
					"and is a property, you may want to use " .
					"equivalent property (P1628) " .
					"instead."
				);
			}
			if ( $this->isClass( $uri, $triples ) ) {
				$messages[] = new ValidationMessage(
					ValidationMessage::LEVEL_WARNING,
					"{$this->formatLink($uri)} is an exact match of " .
					"{$this->formatLink($entity)} " .
					"and is a class, you may want to use " .
					"equivalent class (P1709) " .
					"instead."
				);
			}

			// We check that all target of exact match mappings are used on item entities
			if ( strpos( $entity, 'http://www.wikidata.org/entity/Q' ) !== 0 ) {
				$messages[] = new ValidationMessage(
					ValidationMessage::LEVEL_ERROR,
					"{$this->formatLink($uri)} is not mapped " .
					"to a Wikidata item but to {$this->formatLink($entity)}."
				);
			}

			// The same WD entity maps to multiple things
			if ( array_key_exists( $entity, $entityToUriMatchMapping ) ) {
				$messages[] = new ValidationMessage(
					ValidationMessage::LEVEL_WARNING,
					"{$this->formatLink($entity)} is mapped as exact match to both " .
					"{$this->formatLink($entityToUriMatchMapping[$entity])} and {$this->formatLink($uri)}."
				);
			}
			if (
				array_key_exists( $entity, $entityToPropertyMapping ) &&
				$entityToPropertyMapping[$entity] !== $uri
			) {
				$messages[] = new ValidationMessage(
					ValidationMessage::LEVEL_WARNING,
					"{$this->formatLink($entity)} is mapped as exact match to " .
					"{$this->formatLink($uri)} " .
					"and as equivalent property to {$this->formatLink($entityToPropertyMapping[$entity])}."
				);
			}
			if (
				array_key_exists( $entity, $entityToClassMapping ) &&
				$entityToClassMapping[$entity] !== $uri
			) {
				$messages[] = new ValidationMessage(
					ValidationMessage::LEVEL_WARNING,
					"{$this->formatLink($entity)} is mapped as exact match to " .
					"{$this->formatLink($uri)} " .
					"and as equivalent class to {$this->formatLink($entityToClassMapping[$entity])}."
				);
			}

			// Multiple WD entities map to the same thing
			if ( array_key_exists( $uri, $uriToEntityMatchMapping ) ) {
				$messages[] = new ValidationMessage(
					ValidationMessage::LEVEL_WARNING,
					"{$this->formatLink($uri)} is mapped as exact match from both " .
					"{$this->formatLink($uriToEntityMatchMapping[$uri])} and {$this->formatLink($entity)}."
				);
			}
			if (
				array_key_exists( $uri, $propertyToEntityMapping ) &&
				$propertyToEntityMapping[$uri] !== $entity
			) {
				$messages[] = new ValidationMessage(
					ValidationMessage::LEVEL_WARNING,
					"{$this->formatLink($uri)} is mapped as exact match from " .
					"{$this->formatLink($entity)} " .
					"and as equivalent property from {$this->formatLink($propertyToEntityMapping[$uri])}."
				);
			}
			if (
				array_key_exists( $uri, $classToEntityMapping ) &&
				$classToEntityMapping[$uri] !== $entity
			) {
				$messages[] = new ValidationMessage(
					ValidationMessage::LEVEL_WARNING,
					"{$this->formatLink($uri)} is mapped as exact match from " .
					"{$this->formatLink($entity)} " .
					"and as equivalent class from {$this->formatLink($classToEntityMapping[$uri])}."
				);
			}

			$entityToUriMatchMapping[$entity] = $uri;
			$uriToEntityMatchMapping[$uri] = $entity;
		}

		if ( !$this->withErrorOrWarning( $messages ) ) {
			$messages[] = new ValidationMessage(
				ValidationMessage::LEVEL_INFO,
				"Mapping have been validated without any error or warning found"
			);
		}

		// Labels must be loaded before formatting: formatLink() only recorded the ids.
		$this->loadLabels();
		return $this->formatMessages( $messages );
	}

	/**
	 * Returns the triples of an RDF document, reading them from the cache when present.
	 *
	 * On a cache miss the document is parsed and the result (even when empty) is stored
	 * under a key derived from the md5 of the URI, with 60 * 60 * 24 as third argument of
	 * the cache's set() (presumably a lifetime of one day in seconds; confirm against the
	 * cache implementation).
	 *
	 * @param string $documentUri
	 * @return SimpleTripleStore
	 */
	private function loadTriples( $documentUri ) {
		$key = 'tptools-mapping-' . md5( $documentUri );
		$triples = $this->cache->get( $key );
		if ( $triples === null ) {
			$triples = $this->doLoadTriples( $documentUri );
			$this->cache->set( $key, $triples, 60 * 60 * 24 );
		}
		return new SimpleTripleStore( $triples );
	}

	/**
	 * Parses an RDF document with ARC2 and returns its triples.
	 *
	 * For an "http://" URI the "https://" variant is tried first and the original URI is
	 * used as a fallback. The first non-empty result wins.
	 *
	 * @param string $documentUri
	 * @return array the parsed triples, or an empty array when nothing could be parsed
	 */
	private function doLoadTriples( $documentUri ) {
		if ( strpos( $documentUri, 'http://' ) === 0 ) {
			// Hack to avoid compatibility problems between ARC2 and HSTS
			$candidateUris = [ str_replace( 'http://', 'https://', $documentUri ), $documentUri ];
		} else {
			$candidateUris = [ $documentUri ];
		}

		foreach ( $candidateUris as $candidateUri ) {
			$parser = ARC2::getRDFParser();
			$parser->parse( $candidateUri );
			$triples = $parser->getTriples();
			if ( !empty( $triples ) ) {
				return $triples;
			}
		}

		return [];
	}

	/**
	 * Checks that a mapping links a Wikidata property to a property of the vocabulary.
	 *
	 * An error is appended to $messages when $uri is not typed as a property in $triples,
	 * and another one when $entity is not a Wikidata property URI.
	 *
	 * @param string $entity URI of the Wikidata entity carrying the mapping
	 * @param string $uri URI of the mapped vocabulary term
	 * @param ValidationMessage[] &$messages list the errors are appended to
	 * @param SimpleTripleStore $triples triples of the vocabulary definition
	 */
	private function validateIsBetweenProperty( $entity, $uri, &$messages, SimpleTripleStore $triples ) {
		// We check that all target of property mappings have the rdf:Property type
		if ( !$this->isProperty( $uri, $triples ) ) {
			$messages[] = new ValidationMessage(
				ValidationMessage::LEVEL_ERROR,
				"{$this->formatLink($uri)} is mapped to " .
				"{$this->formatLink($entity)} " .
				"but is not an instance of rdf:Property, " .
				"owl:DatatypeProperty or owl:ObjectProperty."
			);
		}

		// We check that all target of property mappings are used on property entities
		if ( strpos( $entity, 'http://www.wikidata.org/entity/P' ) !== 0 ) {
			$messages[] = new ValidationMessage(
				ValidationMessage::LEVEL_ERROR,
				"{$this->formatLink($uri)} is not mapped " .
				"to a Wikidata property but to {$this->formatLink($entity)}."
			);
		}
	}

	/**
	 * Queries the (entity, uri) pairs linked by any of the given Wikidata properties
	 * and whose uri starts with the vocabulary prefix.
	 *
	 * @param string[] $propertyIds Wikidata property ids, e.g. [ 'P1709' ]
	 * @param string $vocabularyPrefix
	 * @return mixed whatever SparqlClient::getTuples() returns; callers iterate over it
	 *  and read the 'entity' and 'uri' keys of each tuple
	 */
	private function pairsForWikidataProperties( $propertyIds, $vocabularyPrefix ) {
		$propertyPath = implode( '|', array_map( function ( $propertyId ) {
			return 'wdt:' . $propertyId;
		}, $propertyIds ) );

		// The prefix ends up inside a double-quoted SPARQL literal: escape the two
		// characters that could terminate or alter that literal.
		$prefixLiteral = addcslashes( $vocabularyPrefix, '"\\' );

		return $this->sparqlClient->getTuples(
			'SELECT DISTINCT ?entity ?uri WHERE {' .
			' ?entity (' . $propertyPath . ') ?uri .' .
			' FILTER(STRSTARTS(STR(?uri), "' . $prefixLiteral . '"))' .
			'}'
		);
	}

	/**
	 * Queries the (sub, super) pairs, both taken from $itemUris, that are directly
	 * linked by the given Wikidata property.
	 *
	 * @param string[] $itemUris URIs of the Wikidata entities to consider
	 * @param string $propertyId Wikidata property id, e.g. 'P279'
	 * @return mixed an empty array when $itemUris is empty, otherwise whatever
	 *  SparqlClient::getTuples() returns; callers iterate over it and read the
	 *  'sub' and 'super' keys of each tuple
	 */
	private function wikidataDirectChildOf( $itemUris, $propertyId ) {
		if ( empty( $itemUris ) ) {
			// Nothing can match empty VALUES blocks: skip the remote query.
			return [];
		}

		$values = implode( ' ', array_map( function ( $uri ) {
			return '<' . $uri . '>';
		}, $itemUris ) );

		return $this->sparqlClient->getTuples(
			'SELECT DISTINCT ?sub ?super WHERE {' .
			' VALUES ?sub { ' . $values . '}' .
			' VALUES ?super { ' . $values . '}' .
			' ?sub wdt:' . $propertyId . ' ?super .' .
			'}'
		);
	}

	/**
	 * Tells whether $end can be reached from $start by following $relation in $triples.
	 *
	 * $start is considered to reach itself. Nodes already visited are recorded in $seen
	 * so that cycles in the hierarchy do not cause infinite recursion.
	 *
	 * @param string $start
	 * @param string $end
	 * @param string $relation predicate to follow
	 * @param SimpleTripleStore $triples
	 * @param string[] &$seen nodes already visited, shared across the recursion
	 * @return bool
	 */
	private function isInSuperHierarchy(
		$start, $end, $relation, SimpleTripleStore $triples, &$seen = []
	) {
		if ( $start === $end ) {
			return true;
		}

		if ( in_array( $start, $seen, true ) ) {
			return false;
		}
		$seen[] = $start;

		foreach ( $triples->objects( $start, $relation ) as $parentStart ) {
			if ( $this->isInSuperHierarchy( $parentStart, $end, $relation, $triples, $seen ) ) {
				return true;
			}
		}

		return false;
	}

	/**
	 * Returns the text used to designate a URI inside a message.
	 *
	 * A Wikidata entity URI is turned into the placeholder "$<id>$" and its id is
	 * recorded in $this->entitiesWithoutLabels; formatMessages() later replaces the
	 * placeholder. Any other URI is returned as is.
	 *
	 * @param string $uri
	 * @return string
	 */
	private function formatLink( $uri ) {
		if ( strpos( $uri, 'http://www.wikidata.org/entity/' ) === 0 ) {
			$id = str_replace( 'http://www.wikidata.org/entity/', '', $uri );
			$this->entitiesWithoutLabels[$id] = $id;
			return '$' . $id . '$';
		}
		return "{$uri}";
	}

	/**
	 * @param ValidationMessage[] $messages
	 * @return bool true when at least one message has the error or the warning level
	 */
	private function withErrorOrWarning( $messages ) {
		foreach ( $messages as $message ) {
			switch ( $message->getLevel() ) {
				case ValidationMessage::LEVEL_ERROR:
				case ValidationMessage::LEVEL_WARNING:
					return true;
			}
		}
		return false;
	}

	/**
	 * Tells whether $uri is typed with self::RDFS_CLASS or self::OWL_CLASS in $triples.
	 *
	 * @param string $uri
	 * @param SimpleTripleStore $triples
	 * @return bool
	 */
	private function isClass( $uri, SimpleTripleStore $triples ) {
		return $triples->contains( $uri, self::RDF_TYPE, self::RDFS_CLASS ) ||
			$triples->contains( $uri, self::RDF_TYPE, self::OWL_CLASS );
	}

	/**
	 * Tells whether $uri is typed with self::RDF_PROPERTY, self::OWL_DATATYPE_PROPERTY
	 * or self::OWL_OBJECT_PROPERTY in $triples.
	 *
	 * @param string $uri
	 * @param SimpleTripleStore $triples
	 * @return bool
	 */
	private function isProperty( $uri, SimpleTripleStore $triples ) {
		return $triples->contains( $uri, self::RDF_TYPE, self::RDF_PROPERTY ) ||
			$triples->contains( $uri, self::RDF_TYPE, self::OWL_DATATYPE_PROPERTY ) ||
			$triples->contains( $uri, self::RDF_TYPE, self::OWL_OBJECT_PROPERTY );
	}

	/**
	 * Replaces the "$<id>$" placeholders of the messages.
	 *
	 * A placeholder becomes " <label> (<id>) " when a label is available in
	 * $this->entityLabels, and the bare id otherwise. Levels are kept.
	 *
	 * @param ValidationMessage[] $messages
	 * @return ValidationMessage[]
	 */
	private function formatMessages( array $messages ) {
		return array_map( function ( ValidationMessage $message ) {
			return new ValidationMessage(
				$message->getLevel(),
				preg_replace_callback( '/\$([PQ]\d+)\$/', function ( $args ) {
					$entityId = $args[1];
					if ( array_key_exists( $entityId, $this->entityLabels ) ) {
						return " {$this->entityLabels[$entityId]} ({$entityId}) ";
					}
					return $args[1];
				}, $message->getMessage() )
			);
		}, $messages );
	}

	/**
	 * Fills $this->entityLabels for every id recorded in $this->entitiesWithoutLabels.
	 *
	 * Labels are read from the cache first; the remaining ones are fetched with a single
	 * SPARQL query using the English label service and are then stored in the cache with
	 * 60 * 60 * 24 * 30 as third argument of set() (presumably thirty days in seconds;
	 * confirm against the cache implementation).
	 */
	private function loadLabels() {
		$entitiesToGet = [];
		foreach ( $this->entitiesWithoutLabels as $entityId ) {
			$label = $this->cache->get( 'tptools-entity-label-en-' . $entityId );
			if ( $label !== null ) {
				$this->entityLabels[$entityId] = $label;
			} else {
				$entitiesToGet[] = $entityId;
			}
		}

		if ( !empty( $entitiesToGet ) ) {
			$values = implode( ' ', array_map( function ( $id ) {
				return 'wd:' . $id;
			}, $entitiesToGet ) );
			foreach ( $this->sparqlClient->getTuples(
				'SELECT ?entity ?entityLabel WHERE {' .
				' VALUES ?entity { ' . $values . ' }' .
				' SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }' .
				'}'
			) as $tuple ) {
				$entityId = str_replace( 'http://www.wikidata.org/entity/', '', $tuple['entity'] );
				$label = $tuple['entityLabel'];
				$this->cache->set( 'tptools-entity-label-en-' . $entityId, $label, 60 * 60 * 24 * 30 );
				$this->entityLabels[$entityId] = $label;
			}
		}
	}
}