// ===== File: src/main/java/org/wikimedia/commons/donvip/spacemedia/service/agencies/NasaService.java =====
package org.wikimedia.commons.donvip.spacemedia.service.agencies;

import java.io.IOException;
import java.net.URISyntaxException;
import java.net.URL;
import java.time.Duration;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.http.converter.HttpMessageNotReadableException;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Service;
import org.springframework.util.CollectionUtils;
import org.springframework.web.client.HttpClientErrorException.Forbidden;
import org.springframework.web.client.RestClientException;
import org.springframework.web.client.RestTemplate;
import org.wikimedia.commons.donvip.spacemedia.data.local.ProblemRepository;
import org.wikimedia.commons.donvip.spacemedia.data.local.Statistics;
import org.wikimedia.commons.donvip.spacemedia.data.local.nasa.NasaAssets;
import org.wikimedia.commons.donvip.spacemedia.data.local.nasa.NasaAudio;
import org.wikimedia.commons.donvip.spacemedia.data.local.nasa.NasaAudioRepository;
import org.wikimedia.commons.donvip.spacemedia.data.local.nasa.NasaCollection;
import org.wikimedia.commons.donvip.spacemedia.data.local.nasa.NasaImage;
import org.wikimedia.commons.donvip.spacemedia.data.local.nasa.NasaImageRepository;
import org.wikimedia.commons.donvip.spacemedia.data.local.nasa.NasaItem;
import org.wikimedia.commons.donvip.spacemedia.data.local.nasa.NasaLink;
import org.wikimedia.commons.donvip.spacemedia.data.local.nasa.NasaMedia;
import org.wikimedia.commons.donvip.spacemedia.data.local.nasa.NasaMediaRepository;
import org.wikimedia.commons.donvip.spacemedia.data.local.nasa.NasaMediaType;
import org.wikimedia.commons.donvip.spacemedia.data.local.nasa.NasaResponse;
import org.wikimedia.commons.donvip.spacemedia.data.local.nasa.NasaVideo;
import org.wikimedia.commons.donvip.spacemedia.data.local.nasa.NasaVideoRepository;
import org.wikimedia.commons.donvip.spacemedia.service.MediaService;
import org.wikimedia.commons.donvip.spacemedia.utils.Geo;
import org.wikimedia.commons.donvip.spacemedia.utils.Utils;

/**
 * Service that walks the NASA search endpoint configured by {@code nasa.search.link},
 * stores every returned media item through the image/video/audio repositories and
 * reports suspicious items through {@code problem(...)}.
 *
 * NOTE(review): the type arguments of {@code SpaceAgencyService} and
 * {@code NasaMediaRepository} are not visible in the reviewed text; they are left
 * exactly as shown there - confirm against the project declarations.
 */
@Service
public class NasaService extends SpaceAgencyService {

    private static final Logger LOGGER = LoggerFactory.getLogger(NasaService.class);

    @Value("${nasa.search.link}")
    private String searchEndpoint;

    @Value("${nasa.min.year}")
    private int minYear;

    @Value("${nasa.max.tries}")
    private int maxTries;

    @Value("${nasa.centers}")
    private Set<String> nasaCenters;

    @Autowired
    private NasaAudioRepository audioRepository;

    @Autowired
    private NasaImageRepository imageRepository;

    @Autowired
    private NasaVideoRepository videoRepository;

    @Autowired
    private MediaService mediaService;

    @Autowired
    private NasaMediaRepository mediaRepository;

    @Autowired
    public NasaService(NasaMediaRepository repository, ProblemRepository problemrepository) {
        super(repository, problemrepository);
    }

    /**
     * Persists a media through the repository matching its media type.
     *
     * @param media media to save
     * @return the instance returned by the repository
     * @throws IllegalArgumentException if the media type is none of image, video, audio
     */
    private NasaMedia save(NasaMedia media) {
        switch (media.getMediaType()) {
        case image:
            return imageRepository.save((NasaImage) media);
        case video:
            return videoRepository.save((NasaVideo) media);
        case audio:
            return audioRepository.save((NasaAudio) media);
        }
        throw new IllegalArgumentException(media.toString());
    }

    /**
     * Fetches the asset list located at {@code href} and returns the first asset
     * whose URL contains {@code "~orig."}.
     *
     * @param rest REST client used for the request
     * @param href URL of the asset list
     * @return the original asset URL, or an empty optional if the list is missing
     *         or contains no such asset; never {@code null}
     * @throws URISyntaxException if {@code href} cannot be converted to a URI
     */
    static Optional<URL> findOriginalMedia(RestTemplate rest, URL href) throws URISyntaxException {
        // getForObject may return null when the response has no body
        NasaAssets assets = rest.getForObject(Utils.urlToUri(href), NasaAssets.class);
        if (assets == null) {
            return Optional.empty();
        }
        return assets.stream()
                .filter(u -> u.toExternalForm().contains("~orig."))
                .findFirst();
    }

    /**
     * Reconciles a media returned by the search API with the stored copy, completes
     * it (keywords, asset URL, SHA-1, Commons files) and saves it if anything changed.
     *
     * @param rest REST client used to look up the original asset
     * @param media media as returned by the search API
     * @param href URL of the asset list of this media
     * @return the stored media if one already existed, otherwise the given one, after saving when needed
     * @throws IOException declared for the media service calls made here
     * @throws URISyntaxException if {@code href} cannot be converted to a URI
     */
    private NasaMedia processMedia(RestTemplate rest, NasaMedia media, URL href)
            throws IOException, URISyntaxException {
        Optional<NasaMedia> mediaInRepo = repository.findById(media.getNasaId());
        boolean save = false;
        if (mediaInRepo.isPresent()) {
            // allow to purge keywords table and recreate contents
            Set<String> keywordsInRepo = mediaInRepo.get().getKeywords();
            Set<String> keywordsFromNasa = media.getKeywords();
            media = mediaInRepo.get();
            if (CollectionUtils.isEmpty(keywordsInRepo) && !CollectionUtils.isEmpty(keywordsFromNasa)) {
                media.setKeywords(keywordsFromNasa);
                save = true;
            }
        } else {
            save = true;
        }
        // The API is supposed to send us keywords in a proper JSON array, but sometimes it is not
        Set<String> normalizedKeywords = normalizeKeywords(media.getKeywords());
        if (!Objects.equals(normalizedKeywords, media.getKeywords())) {
            media.setKeywords(normalizedKeywords);
            save = true;
        }
        if (media.getAssetUrl() == null) {
            Optional<URL> originalUrl = findOriginalMedia(rest, href);
            if (originalUrl.isPresent()) {
                media.setAssetUrl(originalUrl.get());
                save = true;
            }
        }
        if (mediaService.computeSha1(media, media.getAssetUrl())) {
            save = true;
        }
        if (mediaService.findCommonsFilesWithSha1(media)) {
            save = true;
        }
        if (save) {
            media = save(media);
        }
        if (!nasaCenters.contains(media.getCenter())) {
            problem(media.getAssetUrl(), "Unknown center for id '" + media.getNasaId() + "': " + media.getCenter());
        }
        if (media.getNasaId().length() < 3) {
            problem(media.getAssetUrl(), "Strange id: '" + media.getNasaId() + "'");
        }
        return media;
    }

    /**
     * Splits a single keyword that is really a {@code ","}- or {@code ";"}-separated
     * list into its trimmed, non-empty parts.
     *
     * @param keywords keywords as received, may be {@code null}
     * @return the split keywords if the set holds exactly one element that looks like
     *         a list; otherwise the given set itself (possibly {@code null})
     */
    static Set<String> normalizeKeywords(Set<String> keywords) {
        if (keywords != null && keywords.size() == 1) {
            String kw = keywords.iterator().next();
            for (String sep : Arrays.asList(",", ";")) {
                if (kw.contains(sep) && looksLikeMultipleValues(kw, sep)) {
                    return Arrays.stream(kw.split(sep))
                            .map(String::trim)
                            .filter(s -> !s.isEmpty())
                            .collect(Collectors.toSet());
                }
            }
        }
        return keywords;
    }

    private static final Pattern PATTERN_NUMBER = Pattern.compile(".*\\d+,\\d+.*");
    private static final Pattern PATTERN_DATE = Pattern.compile(".*\\p{Alpha}+\\.? \\d{1,2}, [12]\\d{3}.*");
    private static final Pattern PATTERN_A_AND_B = Pattern.compile(".*[\\p{Alpha}\\.]+, (and|&) \\p{Alpha}+.*");
    private static final Pattern PATTERN_A_B_AND_C = Pattern.compile(".*\\p{Alpha}+, \\p{Alpha}+ (and|&) \\p{Alpha}+.*");
    // Bounded by [^,]* so that only a text with a single comma ("Bigger, Better!") matches;
    // a long list that merely contains "water, water cycle" must still be split.
    private static final Pattern PATTERN_ER = Pattern.compile("[^,]*\\p{Alpha}+er, \\p{Alpha}+er[^,]*");

    /**
     * Tells whether a keyword containing {@code sep} is a list of values rather than
     * one phrase that happens to contain the separator. Only {@code ","} is examined;
     * any other separator is always considered a list separator.
     *
     * @param kw keyword text
     * @param sep separator found in {@code kw}
     * @return {@code false} if the text is recognized as a single phrase, {@code true} otherwise
     */
    private static boolean looksLikeMultipleValues(String kw, String sep) {
        if (",".equals(sep)) {
            if (kw.startsWith("Hi, ") || kw.contains(", by ")) {
                return false;
            }
            if (kw.endsWith(sep)) {
                kw = kw.substring(0, kw.length() - sep.length());
            }
            if (kw.contains(sep)) {
                // Text following the last ", ". The index is clamped because the separator can
                // still be the last character here (input such as "iss,,"), in which case the
                // unclamped index would be past the end of the string.
                int afterStart = Math.min(kw.length(), kw.lastIndexOf(sep) + sep.length() + " ".length());
                String after = kw.substring(afterStart);
                for (Collection<String> entities : Arrays.asList(
                        Geo.CONTINENTS, Geo.COUNTRIES, Geo.STATES, Geo.STATE_CODES, Geo.NORTH_SOUTH_STATES)) {
                    if (entities.contains(after)) {
                        return false;
                    }
                }
                for (Pattern pattern : Arrays.asList(
                        PATTERN_NUMBER, PATTERN_DATE, PATTERN_A_AND_B, PATTERN_A_B_AND_C, PATTERN_ER)) {
                    if (pattern.matcher(kw).matches()) {
                        return false;
                    }
                }
            }
        }
        return true;
    }

    /**
     * Fetches one page of search results, processes each item into {@code medias}
     * and returns the URL of the next page. The fetch is attempted at most
     * {@code maxTries} times.
     *
     * @param rest REST client
     * @param searchUrl URL of the page to fetch
     * @param medias list receiving the processed media
     * @return external form of the link whose rel is {@code "next"}, or {@code null}
     *         if there is none or every attempt failed
     */
    @SuppressWarnings("unchecked")
    private <T extends NasaMedia> String processSearchResults(RestTemplate rest, String searchUrl, List<T> medias) {
        LOGGER.debug("Fetching {}", searchUrl);
        boolean ok = false;
        for (int i = 0; i < maxTries && !ok; i++) {
            try {
                NasaResponse response = rest.getForObject(searchUrl, NasaResponse.class);
                NasaCollection collection = response != null ? response.getCollection() : null;
                if (collection == null) {
                    // No body or no collection: count it as a failed attempt instead of failing on null
                    LOGGER.error("Empty search results for {}", searchUrl);
                    continue;
                }
                ok = true;
                for (NasaItem item : collection.getItems()) {
                    try {
                        medias.add((T) processMedia(rest, item.getData().get(0), item.getHref()));
                    } catch (Forbidden e) {
                        problem(item.getHref(), e.getMessage());
                    } catch (RestClientException e) {
                        if (e.getCause() instanceof HttpMessageNotReadableException) {
                            problem(item.getHref(), e.getCause().getMessage());
                        } else {
                            LOGGER.error("Cannot process item " + item, e);
                        }
                    } catch (IOException | URISyntaxException e) {
                        LOGGER.error("Cannot process item " + item, e);
                    }
                }
                if (!CollectionUtils.isEmpty(collection.getLinks())) {
                    Optional<NasaLink> next = collection.getLinks().stream()
                            .filter(l -> "next".equals(l.getRel()))
                            .findFirst();
                    if (next.isPresent()) {
                        return next.get().getHref().toExternalForm();
                    }
                }
            } catch (RestClientException e) {
                LOGGER.error("Unable to process search results for " + searchUrl, e);
            }
        }
        return null;
    }

    /** Updates all media of the given type from {@code minYear} to the current year, for all centers. */
    private <T extends NasaMedia> List<T> doUpdateMedia(NasaMediaType mediaType) {
        return doUpdateMedia(mediaType, minYear, LocalDateTime.now().getYear(), null);
    }

    /** Updates all media of the given type for a single year, optionally restricted to some centers. */
    private <T extends NasaMedia> List<T> doUpdateMedia(NasaMediaType mediaType, int year, Set<String> centers) {
        return doUpdateMedia(mediaType, year, year, centers);
    }

    /**
     * Runs a search for the given media type and year range and follows the "next"
     * links until the last page.
     *
     * @param mediaType type of media to search
     * @param startYear first year, sent as {@code year_start}
     * @param endYear last year, sent as {@code year_end}
     * @param centers centers to restrict the search to, or {@code null} for no restriction
     * @return all media processed during the search
     */
    private <T extends NasaMedia> List<T> doUpdateMedia(NasaMediaType mediaType, int startYear, int endYear,
            Set<String> centers) {
        LocalDateTime start = LocalDateTime.now();
        logStartUpdate(mediaType, startYear, endYear, centers);
        final List<T> medias = new ArrayList<>();
        RestTemplate rest = new RestTemplate();
        String nextUrl = searchEndpoint + "media_type=" + mediaType
                + "&year_start=" + startYear + "&year_end=" + endYear;
        if (centers != null) {
            nextUrl += "&center=" + String.join(",", centers);
        }
        while (nextUrl != null) {
            nextUrl = processSearchResults(rest, nextUrl, medias);
        }
        logEndUpdate(mediaType, startYear, endYear, centers, start, medias.size());
        return medias;
    }

    /** Logs the start of an update, with a message adapted to the year range and centers. */
    private static void logStartUpdate(NasaMediaType mediaType, int startYear, int endYear, Set<String> centers) {
        if (centers == null) {
            if (startYear == endYear) {
                LOGGER.info("NASA {} update for year {} started...", mediaType, startYear);
            } else {
                LOGGER.info("NASA {} update for years {}-{} started...", mediaType, startYear, endYear);
            }
        } else if (startYear == endYear && centers.size() == 1) {
            LOGGER.info("NASA {} update for year {} center {} started...",
                    mediaType, startYear, centers.iterator().next());
        } else {
            LOGGER.info("NASA {} update for years {}-{} center {} started...",
                    mediaType, startYear, endYear, centers);
        }
    }

    /**
     * Logs the end of an update together with the number of media and the elapsed time.
     *
     * @param start time at which the update started
     * @param size number of media processed
     */
    private static void logEndUpdate(NasaMediaType mediaType, int startYear, int endYear, Set<String> centers,
            LocalDateTime start, int size) {
        // Duration.between(startInclusive, endExclusive): start must come first to get a positive value
        Duration duration = Duration.between(start, LocalDateTime.now());
        if (centers == null) {
            if (startYear == endYear) {
                LOGGER.info("NASA {} update for year {} completed: {} {}s in {}",
                        mediaType, startYear, size, mediaType, duration);
            } else {
                LOGGER.info("NASA {} update for years {}-{} completed: {} {}s in {}",
                        mediaType, startYear, endYear, size, mediaType, duration);
            }
        } else if (startYear == endYear && centers.size() == 1) {
            LOGGER.info("NASA {} update for year {} center {} completed: {} {}s in {}",
                    mediaType, startYear, centers.iterator().next(), size, mediaType, duration);
        } else {
            // Log the whole set, as logStartUpdate does: this branch is reached with several (or no) centers
            LOGGER.info("NASA {} update for years {}-{} center {} completed: {} {}s in {}",
                    mediaType, startYear, endYear, centers, size, mediaType, duration);
        }
    }

    /**
     * Updates all images, year by year from the current year down to {@code minYear}.
     *
     * @return all images processed
     */
    @Scheduled(fixedRateString = "${nasa.update.rate}", initialDelayString = "${initial.delay}")
    public List<NasaImage> updateImages() {
        List<NasaImage> images = new ArrayList<>();
        // Recent years have a lot of photos: search by center to avoid more than 10k results
        for (int year = LocalDateTime.now().getYear(); year >= 2000; year--) {
            for (String center : nasaCenters) {
                images.addAll(doUpdateMedia(NasaMediaType.image, year, Collections.singleton(center)));
            }
        }
        // Ancient years have a lot less photos: simple search for all centers
        for (int year = 1999; year >= minYear; year--) {
            images.addAll(doUpdateMedia(NasaMediaType.image, year, null));
        }
        return images;
    }

    /**
     * Updates all audio files in a single search.
     *
     * @return all audio files processed
     */
    @Scheduled(fixedRateString = "${nasa.update.rate}", initialDelayString = "${initial.delay}")
    public List<NasaAudio> updateAudios() {
        return doUpdateMedia(NasaMediaType.audio);
    }

    /**
     * Updates all videos in a single search.
     *
     * @return all videos processed
     */
    @Scheduled(fixedRateString = "${nasa.update.rate}", initialDelayString = "${initial.delay}")
    public List<NasaVideo> updateVideos() {
        return doUpdateMedia(NasaMediaType.video);
    }

    /**
     * Updates images, then audio files, then videos.
     *
     * @return all media processed
     */
    @Override
    public List<NasaMedia> updateMedia() {
        LocalDateTime start = LocalDateTime.now();
        LOGGER.info("Starting NASA medias update...");
        final List<NasaMedia> medias = new ArrayList<>();
        medias.addAll(updateImages());
        medias.addAll(updateAudios());
        medias.addAll(updateVideos());
        // start first, now second: the reverse order yields a negative duration
        LOGGER.info("NASA medias update completed: {} medias in {}", medias.size(),
                Duration.between(start, LocalDateTime.now()));
        return medias;
    }

    @Override
    public String getName() {
        return "NASA";
    }

    /**
     * Returns the inherited statistics, detailed per center when the repository
     * lists more than one center.
     */
    @Override
    public Statistics getStatistics() {
        Statistics stats = super.getStatistics();
        List<String> centers = mediaRepository.listCenters();
        if (centers.size() > 1) {
            stats.setDetails(centers.stream()
                    .map(c -> new Statistics(Objects.toString(c), mediaRepository.countByCenter(c),
                            mediaRepository.countMissingInCommonsByCenter(c), null))
                    .sorted().collect(Collectors.toList()));
        }
        return stats;
    }
}

// ===== File: src/test/java/org/wikimedia/commons/donvip/spacemedia/service/agencies/NasaServiceTest.java =====
package org.wikimedia.commons.donvip.spacemedia.service.agencies;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.stream.Collectors;

import org.junit.Test;
import org.springframework.web.client.RestTemplate;
import org.wikimedia.commons.donvip.spacemedia.service.agencies.NasaService;

/**
 * Tests of the static helpers of {@link NasaService}.
 */
public class NasaServiceTest {

    /**
     * Requests the asset list of several media, each given both percent-encoded and raw.
     * The returned optional is never null, so this checks that no URL makes the call throw.
     */
    @Test
    public void testFindOriginalMedia() throws Exception {
        RestTemplate rest = new RestTemplate();
        for (URL href : Arrays.asList(
                "https://images-assets.nasa.gov/video/EarthKAM_espa%C3%B1ol_V2/collection.json",
                "https://images-assets.nasa.gov/video/EarthKAM_español_V2/collection.json",
                "https://images-assets.nasa.gov/video/NHQ_2017_0171_VF_NASA%20KEPLER%20OPENS%20THE%20STUDY%20OF%20THE%20GALAXY%E2%80%99S%20PLANET%20POPULATION/collection.json",
                "https://images-assets.nasa.gov/video/NHQ_2017_0171_VF_NASA KEPLER OPENS THE STUDY OF THE GALAXY’S PLANET POPULATION/collection.json",
                "https://images-assets.nasa.gov/video/NHQ_2017_0908_Irma%20Tracked%20from%20Space%20on%20This%20Week%20@NASA%20%E2%80%93%20September%208,%202017/collection.json",
                "https://images-assets.nasa.gov/video/NHQ_2017_0908_Irma Tracked from Space on This Week @NASA – September 8, 2017/collection.json"
                ).stream().map(s -> {
                    try {
                        return new URL(s);
                    } catch (MalformedURLException e) {
                        throw new RuntimeException(e);
                    }
                }).collect(Collectors.toList())) {
            assertNotNull(NasaService.findOriginalMedia(rest, href));
        }
    }

    /** Single keywords that are really separated lists must be split. */
    @Test
    public void testKeywordsSplit() throws Exception {
        doTestKeywords("MSFC; National Space Advisory Council; U.S. Space and Rocket Cen",
                Arrays.asList("MSFC", "National Space Advisory Council", "U.S. Space and Rocket Cen"));
        doTestKeywords("Human Exploration Rover Challenge; U.S. Space and Rocket Center;",
                Arrays.asList("Human Exploration Rover Challenge", "U.S. Space and Rocket Center"));
        doTestKeywords("NASA,Jet Propulsion Laboratory,JPL,space,exploration,planets,InSight,lander,Interior Exploration using Seismic Investigations,Geodesy and Heat Transport,Martian wind,Marsforming,AR,instrument deployment,SEIS,Seismic Experiment for Interior Structure,Curiosity,Mars Science Laboratory,MSL,science,Mars,planet,news,robot,robotics,tech,technology,augmented reality,hololens",
                Arrays.asList("NASA","Jet Propulsion Laboratory","JPL","space","exploration","planets","InSight","lander","Interior Exploration using Seismic Investigations","Geodesy and Heat Transport","Martian wind","Marsforming","AR","instrument deployment","SEIS","Seismic Experiment for Interior Structure","Curiosity","Mars Science Laboratory","MSL","science","Mars","planet","news","robot","robotics","tech","technology","augmented reality","hololens"));
        doTestKeywords("Chandra X-ray Observatory,NuSTAR", Arrays.asList("Chandra X-ray Observatory","NuSTAR"));
        doTestKeywords("iss,", Arrays.asList("iss"));
        // Two trailing separators: must not fail while looking at the text after the last comma
        doTestKeywords("iss,,", Arrays.asList("iss"));
        doTestKeywords("NASA, JPL, Jet Propulsion Laboratory, GRACE, GRACE Follow-On, Gravity Recovery and Climate Experiment, water, water cycle, launch, Falcon 9, SpaceX, rocket, GFZ, German Research Centre for Geosciences, gravity, measurements, sea level rise, glaciers, ice sheets, Greenland, Antarctica, melting, aquifer, groundwater, soil moisture, droughts, floods, lakes, rivers, climate change, Vandenberg Air Force Base, VAFB, movement, mass changes, weather forecasting, microwave instrument, laser ranging interferometer",
                Arrays.asList("NASA", "JPL", "Jet Propulsion Laboratory", "GRACE", "GRACE Follow-On", "Gravity Recovery and Climate Experiment", "water", "water cycle", "launch", "Falcon 9", "SpaceX", "rocket", "GFZ", "German Research Centre for Geosciences", "gravity", "measurements", "sea level rise", "glaciers", "ice sheets", "Greenland", "Antarctica", "melting", "aquifer", "groundwater", "soil moisture", "droughts", "floods", "lakes", "rivers", "climate change", "Vandenberg Air Force Base", "VAFB", "movement", "mass changes", "weather forecasting", "microwave instrument", "laser ranging interferometer"));
    }

    /** A comma belonging to a date must not split the keyword. */
    @Test
    public void testKeywordsNoSplitDates() throws Exception {
        doTestKeywords("USA Composite Reveals Massive Winter Storm - January 02, 2014",
                Arrays.asList("USA Composite Reveals Massive Winter Storm - January 02, 2014"));
        doTestKeywords("Erupting Prominence Observed by SDO on March 30, 2010",
                Arrays.asList("Erupting Prominence Observed by SDO on March 30, 2010"));
        doTestKeywords("C3-class Solar Flare Erupts on Sept. 8, 2010 [Detail]",
                Arrays.asList("C3-class Solar Flare Erupts on Sept. 8, 2010 [Detail]"));
    }

    /** Phrases containing a comma for other reasons must not be split. */
    @Test
    public void testKeywordsNoSplitVariousStuff() throws Exception {
        doTestKeywords("Kulusuk Icebergs, by Andrew Bossi", Arrays.asList("Kulusuk Icebergs, by Andrew Bossi"));
        doTestKeywords("Hi, Hokusai!", Arrays.asList("Hi, Hokusai!"));
        doTestKeywords("U.S. Senate Committee on Commerce, Science and Transportation",
                Arrays.asList("U.S. Senate Committee on Commerce, Science and Transportation"));
        doTestKeywords("Entry, Descent and Landing (EDL)", Arrays.asList("Entry, Descent and Landing (EDL)"));
        doTestKeywords("NASA's SDO Satellite Captures Venus Transit Approach -- Bigger, Better!",
                Arrays.asList("NASA's SDO Satellite Captures Venus Transit Approach -- Bigger, Better!"));
    }

    /** A comma followed by a continent, country or state must not split the keyword. */
    @Test
    public void testKeywordsNoSplitContinentsCountriesStates() throws Exception {
        doTestKeywords("Partial Eclipse Seen Over the Princess Ragnhild Coast, Antarctica",
                Arrays.asList("Partial Eclipse Seen Over the Princess Ragnhild Coast, Antarctica"));
        doTestKeywords("Eastern Hudson Bay, Canada", Arrays.asList("Eastern Hudson Bay, Canada"));
        doTestKeywords("Landsat View: Western Suburbs of Chicago, Illinois",
                Arrays.asList("Landsat View: Western Suburbs of Chicago, Illinois"));
        doTestKeywords("Satellite Sees Holiday Lights Brighten Cities - Washington, D.C., and Baltimore",
                Arrays.asList("Satellite Sees Holiday Lights Brighten Cities - Washington, D.C., and Baltimore"));
        doTestKeywords("Smoke from Fires in Southwestern Oregon, Northern California",
                Arrays.asList("Smoke from Fires in Southwestern Oregon, Northern California"));
        doTestKeywords("Worcester, MA", Arrays.asList("Worcester, MA"));
        doTestKeywords("Washington, DC", Arrays.asList("Washington, DC"));
    }

    /** A comma used as thousands separator must not split the keyword. */
    @Test
    public void testKeywordsNoSplitNumbers() throws Exception {
        doTestKeywords("Hubble views a spectacular supernova with interstellar material over 160,000 light-years away",
                Arrays.asList("Hubble views a spectacular supernova with interstellar material over 160,000 light-years away"));
    }

    /** Asserts that normalizing the single keyword {@code string} yields exactly the values of {@code asList}. */
    private static void doTestKeywords(String string, List<String> asList) {
        assertEquals(new HashSet<>(asList), NasaService.normalizeKeywords(Collections.singleton(string)));
    }
}