Commit 3b186651 authored by Daniel Eggert
Browse files

switched to dom4j for xml parsing

parent 5b733d5c
......@@ -9,6 +9,7 @@ import java.sql.Types;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
......@@ -39,7 +40,7 @@ public class MetadatacrawlerModule {
@SuppressWarnings("unused")
public static void main(String[] args) throws Exception {
Map<MetadataCrawler, List<DatasetMetadata>> crawlerTasks = new HashMap<>();
Map<MetadataCrawler, List<DatasetMetadata>> crawlerTasks = new LinkedHashMap<>();
//
// // crawl landsat 7 and 8 from usgs
MetadataCrawler usgs = new UsgsCrawler();
......@@ -50,16 +51,17 @@ public class MetadatacrawlerModule {
// }
// printProvidedDatasets(usgs);
// DatasetMetadata landsat7 = new DatasetMetadata();
// landsat7.id = 112;
// landsat7.name = "LANDSAT_ETM_SLC_OFF";
DatasetMetadata landsat7c1 = new DatasetMetadata();
landsat7c1.id = 251;
landsat7c1.name = "LANDSAT_ETM_C1";
DatasetMetadata landsat8c1 = new DatasetMetadata();
landsat8c1.id = 250;
landsat8c1.name = "LANDSAT_8_C1";
// DatasetMetadata aster_l1t = new DatasetMetadata();
// aster_l1t.id = 189;
// aster_l1t.name = "ASTER_L1T";
// crawlerTasks.put(usgs, Arrays.asList(landsat8c1));
// crawlerTasks.put(usgs, Arrays.asList(landsat7c1));
// crawlerTasks.put(usgs, Arrays.asList(aster_l1t, landsat7, landsat8));
// crawl Sentinel-2 from ESA's Scientific Data Hub
......@@ -69,7 +71,7 @@ public class MetadatacrawlerModule {
s2.name = "Sentinel-2";
crawlerTasks.put(scihub, Arrays.asList(s2));
final boolean queryLatestOnly = true;
final boolean queryLatestOnly = false;
// run crawler tasks
for (Entry<MetadataCrawler, List<DatasetMetadata>> e : crawlerTasks.entrySet()) {
......@@ -78,7 +80,7 @@ public class MetadatacrawlerModule {
// crawl datasets
for (DatasetMetadata dataset : datasets) {
crawlDataset(dataset, crawler, queryLatestOnly, null);// new Timestamp(2017, 5, 1));
crawlDataset(dataset, crawler, queryLatestOnly, new Timestamp(2017, 4, 1));
}
}
}
......
......@@ -47,7 +47,7 @@ public class CloudCoverUpdateFromMetadataUrl {
private static final Random rand = new Random();
public static void main(String[] args) throws Exception {
short datasetids[] = new short[] { 250 }; // , 108, 107, 112 };
short datasetids[] = new short[] { 251 }; // , 108, 107, 112 };
final SceneDatabase db = SceneDatabase.getInstance();
ExecutorService pool = Executors.newFixedThreadPool(20);
......
......@@ -176,6 +176,7 @@ public abstract class MetadataCrawler {
scene.sensorid = 2; // TM
scene.subsystemid = 0;
break;
case 251:
case 112:
// LANDSAT_7: LANDSAT_ETM_C1 (251) and LANDSAT_ETM_SLC_OFF (112)
scene.satelliteid = 6;
......
......@@ -89,6 +89,7 @@ public class ScientificDataHubCrawler extends MetadataCrawler {
query.addParameter(SearchQuery.KEYWORD_BEGINPOSITION, "[" + startIso + " TO " + endIso + "]");
query.addParameter(SearchQuery.KEYWORD_PLATFORMNAME, dataset.name);
query.addParameter(SearchQuery.KEYWORD_PRODUCTTYPE, "S2MSI1C"); // limit to L1C products
}
BlockingQueue<SearchResultEntry> searchResultQueue = new LinkedBlockingQueue<>();
......
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>de.potsdam.gfz</groupId>
......@@ -37,5 +38,11 @@
<artifactId>slf4j-simple</artifactId>
<version>1.7.25</version>
</dependency>
<dependency>
<groupId>org.dom4j</groupId>
<artifactId>dom4j</artifactId>
<version>2.1.0</version>
</dependency>
</dependencies>
</project>
......@@ -3,28 +3,21 @@
*/
package de.potsdam.gfz.scihubapi.opensearch;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import de.potsdam.gfz.scihubapi.UrlUtils;
......@@ -35,25 +28,25 @@ import de.potsdam.gfz.scihubapi.UrlUtils;
public class SciHubSearch {
// logger
private static final Logger LOG = LoggerFactory.getLogger(SciHubSearch.class);
private static final Logger LOG = LoggerFactory.getLogger(SciHubSearch.class);
// URI/URL search constants
private static final String SEARCH_PROTOCOL_SCHEME = "https";
private static final String SEARCH_HOST = "scihub.copernicus.eu";
private static final String SEARCH_PATH = "/dhus/search";
private static final String SEARCH_QUERY_PREFIX = "q=";
private static final String SEARCH_PROTOCOL_SCHEME = "https";
private static final String SEARCH_HOST = "scihub.copernicus.eu";
private static final String SEARCH_PATH = "/dhus/search";
private static final String SEARCH_QUERY_PREFIX = "q=";
private final String base64Credentials;
private final String base64Credentials;
// query parameter constants
private static final int NUM_ITEMS_PER_PAGE = 100;
private static final int NUM_ITEMS_PER_PAGE = 100;
private static final int CONNECTION_RETRIES = 10;
private static final int CONNECTION_RETRIES = 10;
private boolean canceled = false;
private boolean running = false;
private boolean canceled = false;
private boolean running = false;
private SAXParser saxParser;
private final SciHubSearchResponseHandler handler = new SciHubSearchResponseHandler();
/**
*
......@@ -61,13 +54,6 @@ public class SciHubSearch {
public SciHubSearch(String username, String password) {
String userPass = username + ":" + password;
base64Credentials = javax.xml.bind.DatatypeConverter.printBase64Binary(userPass.getBytes());
// init parser
try {
saxParser = SAXParserFactory.newInstance().newSAXParser();
} catch (Exception e) {
e.printStackTrace();
}
}
public void search(final SearchQuery query, final BlockingQueue<SearchResultEntry> resultQueue) {
......@@ -97,6 +83,7 @@ public class SciHubSearch {
// append item constant
sb.append("&rows=");
sb.append(NUM_ITEMS_PER_PAGE);
// FIXME: keep the ordering at 'asc' (if it was switched to 'desc' for testing, change it back)
sb.append("&orderby=beginposition asc");
URL url = UrlUtils.getValidatedUrl(SEARCH_PROTOCOL_SCHEME, SEARCH_HOST, SEARCH_PATH, sb.toString());
......@@ -156,25 +143,52 @@ public class SciHubSearch {
// download the entire stream first
// this is necessary since the SAX Parser appears to miss parts of content strings when it can't read from the inputstream immediately
ByteArrayInputStream bis = UrlUtils.download(stream);
if (bis != null) {
// init xml parser
XMLReader xmlReader = saxParser.getXMLReader();
SciHubSearchResponseHandler handler = new SciHubSearchResponseHandler(resultQueue);
xmlReader.setContentHandler(handler);
try {
xmlReader.parse(new InputSource(bis));
} catch (IOException e) {
LOG.error("IO Error parsing response.", e);
}
// ByteArrayInputStream bis = UrlUtils.download(stream);
if (handler.hasFollowupLink()) {
followupURL = handler.getFollowupLink();
SAXReader reader = new SAXReader();
Document xml = null;
try {
xml = reader.read(stream);
} catch (DocumentException e1) {
e1.printStackTrace();
}
if (xml != null) {
// check for followup
Element root = xml.getRootElement();
for (Iterator<Element> it = root.elementIterator("link"); it.hasNext();) {
Element link = it.next();
String relVal = link.attributeValue("rel");
if (relVal != null && relVal.equals("next")) {
// follow up found
followupURL = link.attributeValue("href");
break;
}
}
handler.extractElements(root.elementIterator("entry"), resultQueue);
// Element root = xml.getRootElement();
}
// if (bis != null) {
//
// // init xml parser
// XMLReader xmlReader = saxParser.getXMLReader();
//
// xmlReader.setContentHandler(handler);
// try {
// xmlReader.parse(new InputSource(bis));
// } catch (IOException e) {
// LOG.error("IO Error parsing response.", e);
// }
//
// if (handler.hasFollowupLink()) {
// followupURL = handler.getFollowupLink();
// }
// }
return followupURL;
}
......@@ -199,7 +213,8 @@ public class SciHubSearch {
// define search query parameters
SearchQuery query = new SearchQuery();
query.addParameter(SearchQuery.KEYWORD_PLATFORMNAME, "Sentinel-2");
query.addParameter(SearchQuery.KEYWORD_BEGINPOSITION, "[NOW-4DAYS TO NOW]");
query.addParameter(SearchQuery.KEYWORD_BEGINPOSITION, "[2017-04-25T00:00:00Z TO 2017-04-25T23:59:59Z]");
query.addParameter(SearchQuery.KEYWORD_PRODUCTTYPE, "S2MSI2Ap");
BlockingQueue<SearchResultEntry> resultQueue = new LinkedBlockingQueue<>();
......@@ -211,9 +226,10 @@ public class SciHubSearch {
SearchResultEntry resultEntry = resultQueue.poll(500, TimeUnit.MILLISECONDS);
if (resultEntry != null) {
List<String> fields = new ArrayList<>(resultEntry.getFields());
Collections.sort(fields);
System.out.println(Arrays.toString(fields.toArray()));
// List<String> fields = new ArrayList<>(resultEntry.getFields());
// Collections.sort(fields);
// System.out.println(Arrays.toString(fields.toArray()));
System.out.println(resultEntry.getFieldValue("producttype"));
// for (String fieldName : resultEntry.getFields()) {
// System.out.println(fieldName + ": " + resultEntry.getFieldValue(fieldName));
// }
......
......@@ -5,187 +5,84 @@ package de.potsdam.gfz.scihubapi.opensearch;
import java.time.Instant;
import java.time.format.DateTimeFormatter;
import java.util.Iterator;
import java.util.concurrent.BlockingQueue;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import org.dom4j.Element;
/**
* @author Daniel Eggert (daniel.eggert@gfz-potsdam.de)
*
*/
public class SciHubSearchResponseHandler extends DefaultHandler {
public class SciHubSearchResponseHandler {
private static final String XML_TAG_NAME_ENTRY = "entry";
private static final String XML_TAG_NAME_ENTRY_TITLE = "title";
private static final String XML_TAG_NAME_LINK = "link";
private static final String XML_TAG_NAME_ID = "id";
private static final String XML_TAG_NAME_SUMMARY = "summary";
private static final String XML_TAG_NAME_BOOLEAN = "bool";
private static final String XML_TAG_NAME_INTEGER = "int";
private static final String XML_TAG_NAME_DOUBLE = "double";
private static final String XML_TAG_NAME_STRING = "str";
private static final String XML_TAG_NAME_DATE = "date";
private static final String XML_TAG_NAME_ID = "id";
private static final String XML_TAG_NAME_SUMMARY = "summary";
private static final String XML_TAG_NAME_BOOLEAN = "bool";
private static final String XML_TAG_NAME_INTEGER = "int";
private static final String XML_TAG_NAME_DOUBLE = "double";
private static final String XML_TAG_NAME_STRING = "str";
private static final String XML_TAG_NAME_DATE = "date";
private static final String XML_ATTRIB_NAME_HREF = "href";
private static final String XML_ATTRIB_NAME_REL = "rel";
private static final String XML_ATTRIB_NAME_NAME = "name";
private static final String XML_ATTRIB_VALUE_NEXT = "next";
private final BlockingQueue<SearchResultEntry> resultQueue;
// current tag values
private SearchResultEntry currentEntry = null;
private String currentKey = null;
private String currentType = null;
private String followupLink = null;
/**
* @param elementIterator
* @param resultQueue
*/
public SciHubSearchResponseHandler(BlockingQueue<SearchResultEntry> resultQueue) {
this.resultQueue = resultQueue;
}
public boolean hasFollowupLink() {
return followupLink != null && !followupLink.isEmpty();
}
/**
* Returns the follow-up link of the parsed response. In case no follow-up was found <code>null</code> is returned.
*
* @return
*/
public String getFollowupLink() {
return followupLink;
}
/*
* (non-Javadoc)
*
* @see org.xml.sax.helpers.DefaultHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
*/
@Override
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
if (currentEntry != null) {
// tag within entry element - set as current key
switch (qName) {
case XML_TAG_NAME_ENTRY_TITLE:
// title tag - store as entry title
currentKey = XML_TAG_NAME_ENTRY_TITLE;
currentType = XML_TAG_NAME_STRING;
break;
case XML_TAG_NAME_LINK:
// link tag - downloadlink and others
String rel = attributes.getValue(XML_ATTRIB_NAME_REL);
String key = XML_TAG_NAME_LINK;
if(rel != null) {
key += "_" + rel;
}
String url = attributes.getValue(XML_ATTRIB_NAME_HREF);
if (url != null) {
currentEntry.addFieldEntry(key, url);
public void extractElements(Iterator<Element> elementIterator, BlockingQueue<SearchResultEntry> resultQueue) {
while (elementIterator.hasNext()) {
Element entry = elementIterator.next();
SearchResultEntry sre = new SearchResultEntry();
for (Iterator<Element> iter = entry.elementIterator(); iter.hasNext();) {
Element e = iter.next();
String content = e.getText();
Object value = null;
String key = e.attributeValue(XML_ATTRIB_NAME_NAME);
try {
switch (e.getName()) {
case XML_TAG_NAME_STRING:
value = content;
break;
case XML_TAG_NAME_BOOLEAN:
value = Boolean.parseBoolean(content);
break;
case XML_TAG_NAME_DATE:
value = DateTimeFormatter.ISO_INSTANT.parse(content, Instant::from);
break;
case XML_TAG_NAME_DOUBLE:
value = Double.parseDouble(content);
break;
case XML_TAG_NAME_INTEGER:
value = Integer.parseInt(content);
break;
case XML_TAG_NAME_SUMMARY:
key = XML_TAG_NAME_SUMMARY;
value = content;
break;
case XML_TAG_NAME_ID:
key = XML_TAG_NAME_ID;
value = content;
break;
}
} catch (Exception ex) {
System.err.println(ex.getMessage());
}
break;
case XML_TAG_NAME_SUMMARY:
// summary string in content
currentKey = XML_TAG_NAME_SUMMARY;
currentType = XML_TAG_NAME_STRING;
break;
case XML_TAG_NAME_ID:
// id string in content
currentKey = XML_TAG_NAME_ID;
currentType = XML_TAG_NAME_STRING;
break;
case XML_TAG_NAME_BOOLEAN:
case XML_TAG_NAME_DATE:
case XML_TAG_NAME_INTEGER:
case XML_TAG_NAME_DOUBLE:
case XML_TAG_NAME_STRING:
// arbitrary typed value - store with attribute name as field
currentKey = attributes.getValue(XML_ATTRIB_NAME_NAME);
currentType = qName;
break;
}
} else {
// non entry top level element
switch (qName) {
case XML_TAG_NAME_ENTRY:
// init new entry element
currentEntry = new SearchResultEntry();
break;
case XML_TAG_NAME_LINK:
// for now we are only interested in follow-up links - so inspect link tags only
// <link rel="next" type="application/atom+xml" href="https://scihub.copernicus.eu/dhus/api/search?q=*&amp;start=10&amp;rows=10"/>
if (XML_ATTRIB_VALUE_NEXT.equals(attributes.getValue(XML_ATTRIB_NAME_REL))) {
// we found a follow-up link - store url in corresponding field
followupLink = attributes.getValue(XML_ATTRIB_NAME_HREF);
if (key != null && value != null) {
sre.addFieldEntry(key, value);
}
break;
}
}
}
/*
* (non-Javadoc)
*
* @see org.xml.sax.helpers.DefaultHandler#characters(char[], int, int)
*/
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
if (currentKey != null && currentType != null && currentEntry != null) {
String content = new String(ch, start, length);
Object value = null;
try {
switch (currentType) {
case XML_TAG_NAME_STRING:
value = content;
break;
case XML_TAG_NAME_BOOLEAN:
value = Boolean.parseBoolean(content);
break;
case XML_TAG_NAME_DATE:
// 2016-01-13T05:02:45.557Z
value = DateTimeFormatter.ISO_INSTANT.parse(content, Instant::from);
break;
case XML_TAG_NAME_DOUBLE:
value = Double.parseDouble(content);
break;
case XML_TAG_NAME_INTEGER:
value = Integer.parseInt(content);
break;
}
} catch (Exception e) {
System.err.println(e.getMessage());
resultQueue.put(sre);
} catch (InterruptedException e) {
}
if (value != null) {
currentEntry.addFieldEntry(currentKey, value);
}
currentKey = null;
currentType = null;
}
}
/*
* (non-Javadoc)
*
* @see org.xml.sax.helpers.DefaultHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
*/
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
// reset current key value
currentKey = null;
if (XML_TAG_NAME_ENTRY.equals(qName) && currentEntry != null) {
// end of entry element - promote to queue and reset current entry
resultQueue.offer(currentEntry);
currentEntry = null;
}
}
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment