Commit 98d72a3a authored by Volker Krause
Browse files

Increase robustness against different HTML to text conversions

This is a preparatory step for improvements to the layout of texts
produced from HTML input and makes the affected extractor scripts rely
less on specific whitespace sequences.
parent b997cc41
Pipeline #273471 passed with stage
in 13 minutes and 57 seconds
......@@ -16,7 +16,7 @@ function parseHtml(doc) {
var hotelInfo = elems[3].eval(".//table//table");
var row = hotelInfo[0].firstChild;
var addr = row.recursiveContent.match(/([^\n]+)\n\s+([^\n]+)\n\s+([^\n]+)/);
var addr = row.recursiveContent.match(/([^\n]+)[\n\s]+([^\n]+)\n\s*([^\n]+)/);
res.reservationFor.name = addr[1];
res.reservationFor.address.streetAddress = addr[2];
res.reservationFor.address.addressLocality = addr[3];
......
......@@ -8,10 +8,10 @@ function main(content) {
const resId = content.match(/Buchungsnummer:\n?\s*(.+)/)[1]
const arrivalDate = content.match(/Anreisetag:\n? (.+)/)[1]
const departureDate = content.match(/Abreisetag:\n? (.+)/)[1]
const arrivalDate = content.match(/Anreisetag:\n? *(.+)/)[1]
const departureDate = content.match(/Abreisetag:\n? *(.+)/)[1]
const guestName = content.match(/Gastname:\n? (?:Herr|Frau)? ?(.+)/)[1]
const guestName = content.match(/Gastname:\n? *(?:Herr|Frau)? ?(.+)/)[1]
const addressBlock = content.match(/Ihr\n(.*)\n(.*)\n(.*)/)
......@@ -28,13 +28,13 @@ function main(content) {
address.streetAddress = street
const telephone = content.match(/Tel.: (.*)/)[1]
const email = content.match(/E-Mail: (.*)/)[1]
const email = content.match(/E-Mail: +(.*)/)[1]
const price = content.match(/Gesamtpreis:\n? (.*) EUR/)[1].replace(',', '.')
const price = content.match(/Gesamtpreis:\n? *(.*) EUR/)[1].replace(',', '.')
const numberAdults = content.match(/Anzahl der Erwachsene[rn]:\n? ([0-9]+)/)[1]
const numberAdults = content.match(/Anzahl der Erwachsene[rn]:\n? *([0-9]+)/)[1]
const numberChildren = content.match(/Anzahl der Kinder:\n? ([0-9]+)/)[1]
const numberChildren = content.match(/Anzahl der Kinder:\n? *([0-9]+)/)[1]
var res = JsonLd.newLodgingReservation()
......
......@@ -18,7 +18,7 @@ function parseHtml(doc)
res.reservationFor.geo.latitude = text.match(/Latitude:\s(-?\d+.\d+)/)[1] * 1.0;
res.reservationFor.geo.longitude = text.match(/Longitude:\s(-?\d+.\d+)/)[1] * 1.0;
var addr = text.match(/Lodging information[\n\s]+(.*?)\n[\n\s]+(.*?)\n[\n\s]+(.*?)\n[\n\s]+(?:Telep|P)hone: (.*?)\n.*\n*\s+Email:\s+(.*?)\n\s+Internet:\s+(.*?)\n/);
var addr = text.match(/Lodging information[\n\s]+(.*?)\n[\n\s]*(.*?)\n[\n\s]*(.*?)\n[\n\s]*(?:Telep|P)hone: (.*?)\n.*\n*\s*Email:\s+(.*?)\n\s*Internet:\s+(.*?)\n/);
res.reservationFor.address.streetAddress = addr[2];
res.reservationFor.address.addressLocality = addr[3];
res.reservationFor.telephone = addr[4];
......
......@@ -475,7 +475,7 @@ function parseOuigoConfirmation(html) {
if (!date) {
break;
}
const leg = text.substr(idx).match(/(\d{2}h\d{2})\s+(.*?)\n\s+(\d{2}h\d{2})\s+(.*?)\n\s+TRAIN N° *(.*)\n/);
const leg = text.substr(idx).match(/(\d{2}h\d{2})\s+(.*?)\n\s*(\d{2}h\d{2})\s*(.*?)\n\s*TRAIN N° *(.*)\n/);
var res = JsonLd.newTrainReservation();
res.reservationNumber = refNum;
res.reservationFor.departureTime = JsonLd.toDateTime(date[1] + leg[1], "d MMMM yyyyhh'h'mm", "fr");
......
......@@ -34,7 +34,7 @@ function parseReservation(html, node) {
res.reservationNumber = ref[2];
var schedule = html.eval('//table[@class="schedule"]')[0].eval(".//tr");
var stations = schedule[1].recursiveContent.match(/(.*)\n.*\n(.*)/);
var stations = schedule[1].recursiveContent.match(/(.*)[\n\s]+\d{2}:\d{2}[\n\s](.*)/);
res.reservationFor.departureStation.name = stations[1];
res.reservationFor.arrivalStation.name = stations[2];
......@@ -43,7 +43,7 @@ function parseReservation(html, node) {
res.reservationFor.arrivalTime = JsonLd.toDateTime(ref[1] + times[2], "dd/MM/yyyyhh:mm", "en");
var detailsElem = html.eval('//table[@class="detailtrain"]')[0];
var details = detailsElem.recursiveContent.match(/(\d{4})\n[\s\S]*?(\d{1})\n[\s\S]*?(\d{1,2})\n[\s\S]*?(\d{1,3})/);
var details = detailsElem.recursiveContent.match(/(\d{4})[\n\s][\s\S]*?(\d{1})[\n\s][\s\S]*?(\d{1,2})[\n\s][\s\S]*?(\d{1,3})/);
res.reservationFor.trainNumber = "THA " + details[1];
res.reservedTicket.ticketedSeat.seatingType = details[2];
res.reservedTicket.ticketedSeat.seatSection = details[3];
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment