diff --git a/public_html/index.php b/public_html/index.php
index bf8ceb9..627c737 100644
--- a/public_html/index.php
+++ b/public_html/index.php
@@ -1,480 +1,480 @@
 'A service to translate links based on Wikipedia language links, use it like: ?p=Earth|Moon|Human|Water&from=en&to=de Source: github.com/ebraminio/linkstranslator'];
   }
 
   $fromWiki = strtolower($fromWiki);
   if (preg_match('/^[a-z_]{1,20}$/', $fromWiki) === 0) {
     return ['#error' => 'Invalid "from" is provided'];
   };
-  if (preg_match('/wiki/', $fromWiki) === 0) { $fromWiki = $fromWiki . 'wiki'; }
+  if (preg_match('/.wiki/', $fromWiki) === 0) { $fromWiki = $fromWiki . 'wiki'; }
 
   if ($toWiki === 'info') {
     return $useDb
       ? (object)getLinksInfoSQL(array_values($pages), $fromWiki)
       : (object)getLinksInfo(array_values($pages), $fromWiki);
   }
 
   $toWiki = strtolower($toWiki);
   if (preg_match('/^[a-z_]{1,20}$/', $toWiki) === 0) {
     return ['#error' => 'Invalid "to" is provided'];
   };
-  if (preg_match('/wiki$/', $toWiki) === 0) { $toWiki = $toWiki . 'wiki'; }
+  if (preg_match('/.wiki/', $toWiki) === 0) { $toWiki = $toWiki . 'wiki'; }
 
   $pages = array_unique($pages);
   $titlesMap = resolvePages($pages, $fromWiki);
   if ($fromWiki === $toWiki) { return (object)$titlesMap; }
   $resolvedPages = array_unique(array_values($titlesMap));
 
   if ($toWiki === 'wikidatawiki') {
     $equs = $useDb
       ? getWikidataIdSQL($resolvedPages, $fromWiki)
       : getWikidataId($resolvedPages, $fromWiki);
   } elseif ($toWiki === 'imdbwiki') {
     $equs = getImdbIdWikidata($resolvedPages, $fromWiki);
   } else {
     $equs = $useDb
       ? getLocalNamesFromWikidataSQL($resolvedPages, $fromWiki, $toWiki)
       : getLocalNamesFromWikidata($resolvedPages, $fromWiki, $toWiki);
   }
 
   $result = [];
   foreach ($titlesMap as $p => $r) {
     if (isset($equs[$r])) { $result[$p] = $equs[$r]; }
   }
 
   if ($missings) {
     $missingsPages = array_diff($resolvedPages, array_keys($equs));
     $missingsStats = $useDb
       ? getLinksInfoSQL($missingsPages, $fromWiki)
       : getLinksInfo($missingsPages, $fromWiki);
     $missingsResult = [];
     foreach ($titlesMap as $p => $r) {
       if (isset($missingsStats[$r])) { $missingsResult[$p] = $missingsStats[$r]; }
     }
     $result['#missings'] = (object)$missingsResult;
   }
 
   return (object)$result;
 }
 
 function getLinksInfo($pages, $fromWiki) {
   $host = dbNameToOrigin($fromWiki);
   $apiResult = multiRequest(array_map(function ($page) use ($host) {
     return [
       'url' => 'https://' . $host . '/w/api.php',
       'post' => http_build_query([
         'action' => 'query',
         'format' => 'json',
         'prop' => 'langlinks|links',
         'redirects' => '',
         'pllimit' => '500',
         'lllimit' => '500',
         'titles' => $page
       ])
     ];
   }, $pages));
 
   $missings = [];
   foreach ($apiResult as $a) {
     $x = json_decode($a, true);
     if (!isset($x['query']['pages'])) continue;
     $p = array_values($x['query']['pages'])[0];
     $e = $p['title'];
     if (isset($x['query']['redirects'])) $e = $x['query']['redirects'][0]['from'];
     if (isset($x['query']['normalized'])) $e = $x['query']['normalized'][0]['from'];
     $missings[$e] = [
       'langlinks' => isset($p['langlinks']) ? count($p['langlinks']) : 0,
       'links' => isset($p['links']) ? count($p['links']) : 0
     ];
   }
   return $missings;
 }
 
 function getLinksInfoSQL($rawPages, $fromWiki) {
   global $ini, $db;
   $pages = [];
   foreach ($rawPages as $p) { $pages[] = mysqli_real_escape_string($db, $p); }
 
   $localDb = mysqli_connect('enwiki.labsdb', $ini['user'], $ini['password'], $fromWiki . '_p');
   $localPages = [];
   foreach ($pages as $p) { $localPages[] = str_replace(' ', '_', $p); }
   $query = "
 SELECT pl_title, COUNT(*)
 FROM pagelinks
 WHERE pl_from_namespace = 0 AND pl_namespace = 0
   AND pl_title IN ('" . implode("', '", $localPages) . "')
 GROUP BY pl_title;
 ";
   $dbResult = mysqli_query($localDb, $query);
   if (!$dbResult) {
     error_log(mysqli_error($localDb));
     error_log($query);
     return getLinksInfo($rawPages, $fromWiki);
   }
   $backlinks = [];
   while ($match = $dbResult->fetch_row()) {
     $backlinks[str_replace('_', ' ', $match[0])] = $match[1];
   }
   mysqli_free_result($dbResult);
   mysqli_close($localDb);
 
   $query = "
 SELECT T1.ips_site_page, COUNT(*)
 FROM wb_items_per_site T1
 INNER JOIN wb_items_per_site T2 ON T1.ips_item_id = T2.ips_item_id
 WHERE T1.ips_site_id = '$fromWiki'
   AND T1.ips_site_page IN ('" . implode("', '", $pages) . "')
 GROUP BY T1.ips_site_page
 ";
   $dbResult = mysqli_query($db, $query);
   if (!$dbResult) {
     error_log(mysqli_error($db));
     error_log($query);
     return getLinksInfo($rawPages, $fromWiki);
   }
   $langlinks = [];
   while ($match = $dbResult->fetch_row()) {
     $langlinks[$match[0]] = $match[1];
   }
   mysqli_free_result($dbResult);
 
   // merge results
   $missings = [];
   foreach ($pages as $p) {
     $missings[$p] = [
       'langlinks' => isset($langlinks[$p]) ? $langlinks[$p] - 1 : 0,
       'links' => isset($backlinks[$p]) ? $backlinks[$p] + 0 : 0
     ];
   }
   return $missings;
 }
 
 function getImdbIdWikidata($pages, $fromWiki) {
   $apiResultArray = batchApi('wikidatawiki', $pages, function ($batch) use ($fromWiki) {
     return [
       'action' => 'wbgetentities',
       'format' => 'json',
       'sites' => $fromWiki,
       'titles' => implode('|', $batch),
       'props' => 'sitelinks|claims'
     ];
   });
 
   $entities = [];
   foreach ($apiResultArray as $i) {
     $json = json_decode($i, true);
     if (isset($json['entities'])) {
       foreach ($json['entities'] as $entity) { $entities[] = $entity; }
     }
   }
 
   $equs = [];
   foreach ($entities as $entity) {
     if (!isset($entity['sitelinks'])) { continue; }
     // not updated Wikidata items may don't have title on their sitelinks
     $from = isset($entity['sitelinks'][$fromWiki]['title'])
       ? $entity['sitelinks'][$fromWiki]['title']
       : $entity['sitelinks'][$fromWiki];
     if (!isset($entity['claims']['P345'][0]['mainsnak']['datavalue']['value'])) continue;
     $to = $entity['claims']['P345'][0]['mainsnak']['datavalue']['value'];
     $equs[$from] = $to;
   }
   return $equs;
 }
 
 function getWikidataId($pages, $fromWiki) {
   $apiResultArray = batchApi('wikidatawiki', $pages, function ($batch) use ($fromWiki) {
     return [
       'action' => 'wbgetentities',
       'format' => 'json',
       'sites' => $fromWiki,
       'titles' => implode('|', $batch),
       'props' => 'sitelinks'
     ];
   });
 
   $entities = [];
   foreach ($apiResultArray as $i) {
     $json = json_decode($i, true);
     if (isset($json['entities'])) {
       foreach ($json['entities'] as $entity) { $entities[] = $entity; }
     }
   }
 
   $equs = [];
   foreach ($entities as $entity) {
     // not updated Wikidata items may don't have title on their sitelinks
     $from = isset($entity['sitelinks'][$fromWiki]['title'])
       ? $entity['sitelinks'][$fromWiki]['title']
       : $entity['sitelinks'][$fromWiki];
     $equs[$from] = $entity['id'];
   }
   return $equs;
 }
 
 function getWikidataIdSQL($rawPages, $fromWiki) {
   global $db;
   $pages = [];
   foreach ($rawPages as $p) { $pages[] = mysqli_real_escape_string($db, $p); }
   $query = "
 SELECT CONCAT('Q', ips_item_id), ips_site_page
 FROM wb_items_per_site
 WHERE ips_site_page IN ('" . implode("', '", $pages) . "')
   AND ips_site_id = '$fromWiki'
 ";
   $dbResult = mysqli_query($db, $query);
   if (!$dbResult) {
     error_log(mysqli_error($db));
     error_log($query);
     return getWikidataId($rawPages, $fromWiki);
   }
   $equs = [];
   while ($match = $dbResult->fetch_row()) {
     $equs[$match[1]] = $match[0];
   }
   mysqli_free_result($dbResult);
   return $equs;
 }
 
 function getLocalNamesFromWikidataSQL($rawPages, $fromWiki, $toWiki) {
   global $db;
   $pages = [];
   foreach ($rawPages as $p) { $pages[] = mysqli_real_escape_string($db, $p); }
   $query = "
 SELECT T2.ips_site_page, T1.ips_site_page
 FROM wb_items_per_site T1
 INNER JOIN wb_items_per_site T2
   ON T1.ips_item_id = T2.ips_item_id AND T2.ips_site_id = '$toWiki'
 WHERE T1.ips_site_id = '$fromWiki'
   AND T1.ips_site_page IN ('" . implode("', '", $pages) . "')
 ";
   $dbResult = mysqli_query($db, $query);
   if (!$dbResult) {
     error_log(mysqli_error($db));
     error_log($query);
     return getLocalNamesFromWikidata($rawPages, $fromWiki, $toWiki);
   }
   if (!$dbResult) { return []; }
   $equs = [];
   while ($match = $dbResult->fetch_row()) {
     $equs[$match[1]] = $match[0];
   }
   mysqli_free_result($dbResult);
   return $equs;
 }
 
 function getLocalNamesFromWikidata($pages, $fromWiki, $toWiki) {
   $apiResultArray = batchApi('wikidatawiki', $pages, function ($batch) use ($fromWiki) {
     return [
       'action' => 'wbgetentities',
       'format' => 'json',
       'sites' => $fromWiki,
       'titles' => implode('|', $batch),
       'props' => 'sitelinks'
     ];
   });
 
   $entities = [];
   foreach ($apiResultArray as $i) {
     $json = json_decode($i, true);
     if (isset($json['entities'])) {
       foreach ($json['entities'] as $entity) { $entities[] = $entity; }
     }
   }
 
   $equs = [];
   foreach ($entities as $entity) {
     if (!isset($entity['sitelinks']) || !isset($entity['sitelinks'][$toWiki])) { continue; }
     // not updated Wikidata items may don't have title on their sitelinks
     $from = isset($entity['sitelinks'][$fromWiki]['title'])
       ? $entity['sitelinks'][$fromWiki]['title']
       : $entity['sitelinks'][$fromWiki];
     $to = isset($entity['sitelinks'][$toWiki]['title'])
       ? $entity['sitelinks'][$toWiki]['title']
       : $entity['sitelinks'][$toWiki];
     $equs[$from] = $to;
   }
   return $equs;
 }
 
 function resolvePages($pages, $fromWiki) {
   $apiResultArray = batchApi($fromWiki, $pages, function ($batch) {
     return [
       'action' => 'query',
       'format' => 'json',
       'redirects' => '',
       'titles' => implode('|', $batch)
     ];
   });
 
   $normalizes = [];
   $redirects = [];
   $missings = [];
   foreach ($apiResultArray as $i) {
     $json = json_decode($i, true);
     if (!is_array($json) || !isset($json['query']['pages'])) { continue; }
     $query = $json['query'];
     $queryPages = $query['pages'];
     if (isset($query['normalized'])) {
       foreach ($query['normalized'] as $x) { $normalizes[$x['from']] = $x['to']; }
     }
     if (isset($query['redirects'])) {
       foreach ($query['redirects'] as $x) { $redirects[$x['from']] = $x['to']; }
     }
     foreach ($queryPages as $x) {
       if (isset($x['missing'])) { $missings[] = $x['title']; }
     }
   }
 
   $result = [];
   foreach ($pages as $p) {
     if (!in_array($p, $missings)) {
       $resolved = $p;
       if (isset($normalizes[$resolved])) { $resolved = $normalizes[$resolved]; }
       if (isset($redirects[$resolved])) { $resolved = $redirects[$resolved]; }
       $result[$p] = $resolved;
     }
   }
   return $result;
 }
 
 function dbNameToOrigin($dbName) {
   if ($dbName === 'wikidatawiki') { return 'www.wikidata.org'; }
   if ($dbName === 'commonswiki') { return 'commons.wikimedia.org'; }
   $p = explode('wiki', $dbName);
   return str_replace('_', '-', $p[0]) . '.wiki' . (isset($p[1]) && strlen($p[1]) ? $p[1] : 'pedia') . '.org';
 }
 
 function batchApi($dbName, $pages, $requestCreator) {
   $host = dbNameToOrigin($dbName);
   $batches = array_chunk($pages, 50);
   return multiRequest(array_map(function ($data) use ($host, $requestCreator) {
     return [
       'url' => 'https://' . $host . '/w/api.php',
       'post' => $requestCreator($data)
     ];
   }, $batches));
 }
 
 // http://www.phpied.com/simultaneuos-http-requests-in-php-with-curl/
 function multiRequest($data, $options = array()) {
   // array of curl handles
   $curly = array();
   // data to be returned
   $result = array();
 
   // multi handle
   $mh = curl_multi_init();
 
   // loop through $data and create curl handles
   // then add them to the multi-handle
   foreach ($data as $id => $d) {
     $curly[$id] = curl_init();
 
     $url = (is_array($d) && !empty($d['url'])) ? $d['url'] : $d;
     curl_setopt($curly[$id], CURLOPT_URL, $url);
     curl_setopt($curly[$id], CURLOPT_HEADER, 0);
     curl_setopt($curly[$id], CURLOPT_RETURNTRANSFER, 1);
     curl_setopt($curly[$id], CURLOPT_USERAGENT, 'linkstranslator (github.com/ebraminio/linkstranslator)');
 
     // post?
     if (is_array($d)) {
       if (!empty($d['post'])) {
         curl_setopt($curly[$id], CURLOPT_POST, 1);
         curl_setopt($curly[$id], CURLOPT_POSTFIELDS, $d['post']);
       }
     }
 
     // extra options?
     if (!empty($options)) {
       curl_setopt_array($curly[$id], $options);
     }
 
     curl_multi_add_handle($mh, $curly[$id]);
   }
 
   // execute the handles
   $running = null;
   do {
     curl_multi_exec($mh, $running);
   } while ($running > 0);
 
   // get content and remove handles
   foreach ($curly as $id => $c) {
     $result[$id] = curl_multi_getcontent($c);
     curl_multi_remove_handle($mh, $c);
   }
 
   // all done
   curl_multi_close($mh);
 
   return $result;
 }