diff --git a/doc/spacemedia.uml b/doc/spacemedia.uml index c5b4adbd..8de836ef 100644 --- a/doc/spacemedia.uml +++ b/doc/spacemedia.uml @@ -1,286 +1,286 @@ - +
diff --git a/doc/spacemedia_fr_FR.properties b/doc/spacemedia_fr_FR.properties index 2f5a39e5..3263a636 100644 --- a/doc/spacemedia_fr_FR.properties +++ b/doc/spacemedia_fr_FR.properties @@ -1 +1 @@ -#Sun May 30 01:37:45 CEST 2021 +#Sun May 30 02:48:30 CEST 2021 diff --git a/pom.xml b/pom.xml index 8629d27e..3fa280bb 100644 --- a/pom.xml +++ b/pom.xml @@ -1,86 +1,92 @@ 4.0.0 org.wikimedia.commons.donvip spacemedia 0.5.0-SNAPSHOT pom Find free media released by space agencies missing in Wikimedia Commons sm-apps sm-commons-api sm-commons-data + sm-core sm-data sm-legacyapp sm-repositories sm-utils 11 ${java.version} ${java.version} UTF-8 https://sonarcloud.io toolforge tool-spacemedia 4.4 3.0.6 v3-rev20210410-1.31.0 8.3.1 2.5.0 scm:git:https://phabricator.wikimedia.org/source/tool-spacemedia.git HEAD https://phabricator.wikimedia.org/source/tool-spacemedia/ org.springframework.boot spring-boot-dependencies ${springboot-version} pom import com.flickr4java flickr4java ${flickr4java-version} com.github.scribejava scribejava-apis ${scribejava-version} com.google.apis google-api-services-youtube ${google-api-services-youtube-version} org.apache.commons commons-collections4 ${collections4-version} + + org.jsoup + jsoup + 1.13.1 + geosolutions GeoSolutions Repository http://maven.geo-solutions.it jcenter https://jcenter.bintray.com/ diff --git a/sm-apps/pom.xml b/sm-apps/pom.xml index 07463c77..ef96abe0 100644 --- a/sm-apps/pom.xml +++ b/sm-apps/pom.xml @@ -1,37 +1,37 @@ 4.0.0 org.wikimedia.commons.donvip spacemedia 0.5.0-SNAPSHOT sm-apps pom Applications sm-backend sm-cronjobs sm-webapp org.wikimedia.commons.donvip - sm-data + sm-core ${project.version} org.springframework.boot spring-boot-starter org.mariadb.jdbc mariadb-java-client diff --git a/sm-apps/sm-backend/src/main/resources/logback-spring-toolforge.xml b/sm-apps/sm-backend/src/main/resources/logback-spring-toolforge.xml index df02b423..02d28ba8 100644 --- 
a/sm-apps/sm-backend/src/main/resources/logback-spring-toolforge.xml
+++ b/sm-apps/sm-backend/src/main/resources/logback-spring-toolforge.xml
@@ -1,9 +1,9 @@ - +
diff --git a/sm-apps/sm-cronjobs/sm-harvester-esa-website/pom.xml b/sm-apps/sm-cronjobs/sm-harvester-esa-website/pom.xml
index 92b44577..15a9d6ed 100644
--- a/sm-apps/sm-cronjobs/sm-harvester-esa-website/pom.xml
+++ b/sm-apps/sm-cronjobs/sm-harvester-esa-website/pom.xml
@@ -1,12 +1,19 @@ 4.0.0 org.wikimedia.commons.donvip sm-cronjobs 0.5.0-SNAPSHOT sm-harvester-esa-website Harvests media from ESA (European Space Agency) website + + + org.jsoup + jsoup + + +
diff --git a/sm-apps/sm-cronjobs/sm-harvester-esa-website/src/main/java/org/wikimedia/commons/donvip/spacemedia/harvester/esa/website/EsaWebsiteApplication.java b/sm-apps/sm-cronjobs/sm-harvester-esa-website/src/main/java/org/wikimedia/commons/donvip/spacemedia/harvester/esa/website/EsaWebsiteApplication.java
new file mode 100644
index 00000000..e4c3d4d4
--- /dev/null
+++ b/sm-apps/sm-cronjobs/sm-harvester-esa-website/src/main/java/org/wikimedia/commons/donvip/spacemedia/harvester/esa/website/EsaWebsiteApplication.java
@@ -0,0 +1,20 @@
+package org.wikimedia.commons.donvip.spacemedia.harvester.esa.website;
+import org.springframework.boot.SpringApplication;
+import org.springframework.boot.autoconfigure.SpringBootApplication;
+import org.wikimedia.commons.donvip.spacemedia.core.AbstractHarvestingApplication;
+
+/**
+ * Spring Boot entry point of the ESA website harvester.
+ */
+@SpringBootApplication
+public class EsaWebsiteApplication extends AbstractHarvestingApplication {
+
+    /**
+     * Boots the Spring context with this class as primary configuration source.
+     *
+     * @param args command line arguments, forwarded unchanged to Spring Boot
+     */
+    public static void main(String[] args) {
+        new SpringApplication(EsaWebsiteApplication.class).run(args);
+    }
+}
diff --git a/sm-apps/sm-cronjobs/sm-harvester-esa-website/src/main/java/org/wikimedia/commons/donvip/spacemedia/harvester/esa/website/service/EsaWebsiteScrappingService.java b/sm-apps/sm-cronjobs/sm-harvester-esa-website/src/main/java/org/wikimedia/commons/donvip/spacemedia/harvester/esa/website/service/EsaWebsiteScrappingService.java
new
file mode 100644
index 00000000..bc78342e
--- /dev/null
+++ b/sm-apps/sm-cronjobs/sm-harvester-esa-website/src/main/java/org/wikimedia/commons/donvip/spacemedia/harvester/esa/website/service/EsaWebsiteScrappingService.java
@@ -0,0 +1,110 @@
+package org.wikimedia.commons.donvip.spacemedia.harvester.esa.website.service;
+
+import java.io.IOException;
+import java.net.SocketTimeoutException;
+import java.net.URL;
+import java.util.Optional;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.stereotype.Service;
+import org.wikimedia.commons.donvip.spacemedia.core.Harvester;
+
+/**
+ * Harvester browsing the paginated search results of the ESA website.
+ */
+@Service
+public class EsaWebsiteScrappingService implements Harvester {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(EsaWebsiteScrappingService.class);
+
+    /** Token of {@code esa.search.link} substituted with the offset of the first result to fetch. */
+    private static final String INDEX_PLACEHOLDER = "<idx>";
+
+    /** Timeout handed to jsoup for each page request (milliseconds per the jsoup API). */
+    private static final int TIMEOUT_MILLIS = 15_000;
+
+    @Value("${esa.search.link}")
+    private String searchLink;
+
+    @Value("${esa.max.tries}")
+    private int maxTries;
+
+    @Value("${esa.date.pattern}")
+    private String datePattern;
+
+    /**
+     * Walks through the search result pages and logs, at debug level, the URL of every image found.
+     * <p>
+     * The harvest ends when a page has no "Next" link, holds no result, cannot be
+     * fetched within {@code esa.max.tries} attempts, or fails to be processed.
+     *
+     * @throws IOException if {@code esa.search.link} is not a valid URL
+     */
+    @Override
+    public void harvestMedia() throws IOException {
+        final URL url = new URL(searchLink);
+        final String proto = url.getProtocol();
+        final String host = url.getHost();
+        boolean moreImages = true;
+        int index = 0;
+        while (moreImages) {
+            final String searchUrl = searchLink.replace(INDEX_PLACEHOLDER, Integer.toString(index));
+            // Pessimistic default: only a parsed, non-empty page with a next link continues the harvest
+            moreImages = false;
+            try {
+                final Optional<Document> page = fetchWithRetries(searchUrl);
+                if (page.isPresent()) {
+                    final Document html = page.get();
+                    final Elements divs = html.getElementsByClass("grid-item");
+                    for (Element div : divs) {
+                        URL imageUrl = new URL(proto, host, div.select("a").get(0).attr("href"));
+                        index++;
+                        LOGGER.debug("Checking ESA image {}: {}", index, imageUrl);
+                    }
+                    // A page without results would not advance the offset: stop instead of looping forever
+                    moreImages = !divs.isEmpty() && hasNextPage(html);
+                } else {
+                    LOGGER.error("Giving up on {} after {} timed out attempts", searchUrl, maxTries);
+                }
+            } catch (IOException | RuntimeException e) {
+                LOGGER.error(searchUrl, e);
+            }
+        }
+    }
+
+    /**
+     * Downloads a search result page, retrying on socket timeouts up to {@code esa.max.tries} times.
+     *
+     * @param searchUrl URL of the search result page
+     * @return the parsed page, or an empty optional if every attempt timed out
+     * @throws IOException if the download fails for another reason than a timeout
+     */
+    private Optional<Document> fetchWithRetries(String searchUrl) throws IOException {
+        for (int i = 0; i < maxTries; i++) {
+            try {
+                LOGGER.debug("Fetching ESA images: {}", searchUrl);
+                return Optional.of(Jsoup.connect(searchUrl).timeout(TIMEOUT_MILLIS).get());
+            } catch (SocketTimeoutException e) {
+                LOGGER.debug(searchUrl, e);
+            }
+        }
+        return Optional.empty();
+    }
+
+    /**
+     * Tells whether a search result page links to a following one.
+     *
+     * @param html parsed search result page
+     * @return {@code true} if the first "paging" element contains an element titled "Next"
+     */
+    private static boolean hasNextPage(Document html) {
+        final Elements paging = html.getElementsByClass("paging");
+        return !paging.isEmpty() && !paging.get(0).getElementsByAttributeValue("title", "Next").isEmpty();
+    }
+}
diff --git a/sm-apps/sm-cronjobs/sm-harvester-esa-website/src/main/resources/application-dev.properties b/sm-apps/sm-cronjobs/sm-harvester-esa-website/src/main/resources/application-dev.properties
new file mode 100644
index 00000000..5e7173ba
--- /dev/null
+++ b/sm-apps/sm-cronjobs/sm-harvester-esa-website/src/main/resources/application-dev.properties
@@ -0,0 +1 @@
+logging.level.org.wikimedia.commons.donvip.spacemedia=DEBUG
diff --git a/sm-apps/sm-cronjobs/sm-harvester-esa-website/src/main/resources/application.properties b/sm-apps/sm-cronjobs/sm-harvester-esa-website/src/main/resources/application.properties
new file mode 100644
index 00000000..684e91d0
--- /dev/null
+++ b/sm-apps/sm-cronjobs/sm-harvester-esa-website/src/main/resources/application.properties
@@ -0,0 +1,24 @@
+# Spring
+
+spring.jackson.default-property-inclusion=non_empty
+spring.jackson.property-naming-strategy=SNAKE_CASE
+
+spring.jpa.defer-datasource-initialization=true
+spring.jpa.hibernate.ddl-auto=update
+spring.jpa.open-in-view=false
+spring.jpa.show-sql=false
+spring.jpa.properties.hibernate.format_sql=false
+
+# Local database for development
+domain.datasource.url=jdbc:mariadb://localhost:3306/root_spacemedia
+domain.datasource.username=root
+domain.datasource.password=
+domain.datasource.driver-class-name=org.mariadb.jdbc.Driver
+domain.datasource.hikari.maximum-pool-size=2
+domain.datasource.hikari.max-lifetime=300000
+domain.datasource.hikari.connectionInitSql = SET NAMES 'utf8mb4'
+
+# ESA
+esa.search.link = https://www.esa.int/ESA_Multimedia/Search/(offset)/<idx>/(sortBy)/published?SearchText=by-sa
+esa.max.tries = 5
+esa.date.pattern = dd/MM/yyyy
diff --git a/sm-apps/pom.xml b/sm-core/pom.xml
similarity index 61%
copy from sm-apps/pom.xml
copy to sm-core/pom.xml
index
07463c77..f934274b 100644
--- a/sm-apps/pom.xml
+++ b/sm-core/pom.xml
@@ -1,37 +1,34 @@ 4.0.0 org.wikimedia.commons.donvip spacemedia 0.5.0-SNAPSHOT - sm-apps - pom - Applications - - - sm-backend - sm-cronjobs - sm-webapp - + sm-core + Core application classes org.wikimedia.commons.donvip sm-data - ${project.version} + ${project.version} - org.springframework.boot - spring-boot-starter + org.springframework + spring-beans + + + org.springframework + spring-context - org.mariadb.jdbc - mariadb-java-client + jakarta.annotation + jakarta.annotation-api
diff --git a/sm-core/src/main/java/org/wikimedia/commons/donvip/spacemedia/core/AbstractHarvestingApplication.java b/sm-core/src/main/java/org/wikimedia/commons/donvip/spacemedia/core/AbstractHarvestingApplication.java
new file mode 100644
index 00000000..847d5d45
--- /dev/null
+++ b/sm-core/src/main/java/org/wikimedia/commons/donvip/spacemedia/core/AbstractHarvestingApplication.java
@@ -0,0 +1,31 @@
+package org.wikimedia.commons.donvip.spacemedia.core;
+
+import java.io.IOException;
+
+import javax.annotation.PostConstruct;
+
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.context.annotation.Import;
+import org.wikimedia.commons.donvip.spacemedia.data.DomainDbConfiguration;
+
+/**
+ * Base class of harvesting applications: imports {@link DomainDbConfiguration}
+ * and runs the injected {@link Harvester} once the application bean is initialized.
+ */
+@Import(DomainDbConfiguration.class)
+public abstract class AbstractHarvestingApplication {
+
+    /** Harvester implementation injected by Spring. */
+    @Autowired
+    private Harvester harvester;
+
+    /**
+     * Starts the harvest; invoked by the container after dependency injection.
+     *
+     * @throws IOException if the harvester reports an I/O failure
+     */
+    @PostConstruct
+    public void init() throws IOException {
+        this.harvester.harvestMedia();
+    }
+}
diff --git a/sm-core/src/main/java/org/wikimedia/commons/donvip/spacemedia/core/Harvester.java b/sm-core/src/main/java/org/wikimedia/commons/donvip/spacemedia/core/Harvester.java
new file mode 100644
index 00000000..65c0bb5f
--- /dev/null
+++ b/sm-core/src/main/java/org/wikimedia/commons/donvip/spacemedia/core/Harvester.java
@@ -0,0 +1,16 @@
+package org.wikimedia.commons.donvip.spacemedia.core;
+
+import java.io.IOException;
+
+/**
+ * Contract implemented by services able to harvest media.
+ */
+public interface Harvester {
+
+    /**
+     * Performs the harvest.
+     *
+     * @throws IOException if an I/O error prevents the harvest
+     */
+    void harvestMedia() throws IOException;
+}