mirror of
https://github.com/akaessens/NoFbEventScraper
synced 2025-06-05 23:29:13 +02:00
much refactoring:
-move event formatting logic to event class -disable editing of event output, it's available in the calendar app -replace string datetimes with ZonedDateTime -move uri checking logic to scraper -update exception handling and error messages -reformatting and renaming -fix messy xml layouts -update tests -add comments
This commit is contained in:
@ -1,6 +1,5 @@
|
||||
package com.akdev.nofbeventscraper;
|
||||
|
||||
|
||||
import android.os.AsyncTask;
|
||||
import android.text.Editable;
|
||||
import android.text.SpannableStringBuilder;
|
||||
@ -11,69 +10,100 @@ import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.lang.ref.WeakReference;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.URL;
|
||||
import java.time.ZonedDateTime;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class FbScraper extends AsyncTask<Void, Void, Void> {
|
||||
|
||||
private String url;
|
||||
private String error;
|
||||
private MainActivity main;
|
||||
private String input_str;
|
||||
private WeakReference<MainActivity> main; // no context leak with WeakReference
|
||||
private FbEvent event;
|
||||
|
||||
FbScraper(MainActivity main, String url) {
|
||||
this.url = url;
|
||||
FbScraper(WeakReference<MainActivity> main, String str) {
|
||||
this.main = main;
|
||||
this.input_str = str;
|
||||
}
|
||||
|
||||
protected String fixURI(String str) throws URISyntaxException, MalformedURLException {
|
||||
|
||||
// check for url format
|
||||
new URL(str).toURI();
|
||||
|
||||
Pattern pattern = Pattern.compile("(facebook.com/events/[0-9]*)");
|
||||
Matcher matcher = pattern.matcher(str);
|
||||
|
||||
if (matcher.find()) {
|
||||
// rewrite url to m.facebook and dismiss any query strings or referrals
|
||||
return "https://m." + matcher.group(1);
|
||||
} else {
|
||||
throw new URISyntaxException(str, "Does not contain event.");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
protected String fixLocation(String location_json) {
|
||||
|
||||
String name = "";
|
||||
String location_name = "";
|
||||
|
||||
try {
|
||||
JSONObject reader = new JSONObject(location_json);
|
||||
|
||||
name = reader.getString("name");
|
||||
location_name = reader.getString("name");
|
||||
JSONObject address = reader.getJSONObject("address");
|
||||
|
||||
String type = address.getString("@type");
|
||||
|
||||
if (type.equals("PostalAddress"))
|
||||
{
|
||||
if (type.equals("PostalAddress")) {
|
||||
String postal_code = address.getString("postalCode");
|
||||
String address_locality = address.getString("addressLocality");
|
||||
String address_country = address.getString("addressCountry");
|
||||
String street_address = address.getString("streetAddress");
|
||||
// included in locality
|
||||
//String address_country = address.getString("addressCountry");
|
||||
|
||||
return name + ", " + street_address + ", " + postal_code + " " + address_locality;
|
||||
}
|
||||
else
|
||||
{
|
||||
return name;
|
||||
}
|
||||
|
||||
return location_name + ", "
|
||||
+ street_address + ", "
|
||||
+ postal_code + " "
|
||||
+ address_locality;
|
||||
} else {
|
||||
return location_name;
|
||||
}
|
||||
|
||||
} catch (JSONException e) {
|
||||
e.printStackTrace();
|
||||
return name;
|
||||
return location_name;
|
||||
}
|
||||
}
|
||||
|
||||
protected String fixTimezone(String time_in) {
|
||||
protected ZonedDateTime toZonedDateTime(String time_in) {
|
||||
|
||||
try {
|
||||
|
||||
// time in is missing a : in the timezone offset
|
||||
Editable editable = new SpannableStringBuilder(time_in);
|
||||
String time_str = editable.insert(22, ":").toString();
|
||||
|
||||
return editable.insert(22, ":").toString();
|
||||
// parse e.g. 2011-12-03T10:15:30+01:00
|
||||
return ZonedDateTime.parse(time_str, DateTimeFormatter.ISO_OFFSET_DATE_TIME);
|
||||
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
return "";
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
protected String fixLinks(String description_in) {
|
||||
protected String fixDescriptionLinks(String description_in) {
|
||||
try {
|
||||
// @[152580919265:274:MagentaMusik 360] -> m.facebook.com/152580919265
|
||||
/* @[152580919265:274:SiteDescription]
|
||||
* to
|
||||
* SiteDescription [m.facebook.com/152580919265] */
|
||||
|
||||
return description_in.replaceAll("@\\[([0-9]{10,}):[0-9]{3}:([^]]*)]",
|
||||
"$2 [m.facebook.com/$1]");
|
||||
|
||||
@ -86,60 +116,45 @@ public class FbScraper extends AsyncTask<Void, Void, Void> {
|
||||
private String readFromJson(JSONObject reader, String field) {
|
||||
try {
|
||||
return reader.getString(field);
|
||||
}
|
||||
catch (Exception e) {
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Void doInBackground(Void... voids) {
|
||||
|
||||
Document document = null;
|
||||
|
||||
try {
|
||||
document = Jsoup.connect(url).userAgent("Mozilla").get();
|
||||
String url = fixURI(input_str);
|
||||
// useragent needed with Jsoup > 1.12
|
||||
Document document = Jsoup.connect(url).userAgent("Mozilla").get();
|
||||
String json = document
|
||||
.select("script[type = application/ld+json]")
|
||||
.first().data();
|
||||
|
||||
try {
|
||||
String json = document.select("script[type = application/ld+json]").first().data();
|
||||
JSONObject reader = new JSONObject(json);
|
||||
|
||||
JSONObject reader = new JSONObject(json);
|
||||
event = new FbEvent();
|
||||
event.url = url;
|
||||
event.name = readFromJson(reader, "name");
|
||||
event.start_date = toZonedDateTime(readFromJson(reader, "startDate"));
|
||||
event.end_date = toZonedDateTime(readFromJson(reader, "endDate"));
|
||||
event.description = fixDescriptionLinks(readFromJson(reader, "description"));
|
||||
event.location = fixLocation(readFromJson(reader, "location"));
|
||||
event.image_url = readFromJson(reader, "image");
|
||||
|
||||
String event_name = readFromJson(reader, "name");
|
||||
String event_start = fixTimezone(readFromJson(reader, "startDate"));
|
||||
String event_end = fixTimezone(readFromJson(reader, "endDate"));
|
||||
|
||||
String event_description = fixLinks(readFromJson(reader, "description"));
|
||||
String location = fixLocation(readFromJson(reader, "location"));
|
||||
|
||||
String image_url = "";
|
||||
|
||||
try {
|
||||
image_url = readFromJson(reader, "image"); // get from json
|
||||
|
||||
// get from event header
|
||||
image_url = document.getElementsByClass("scaledImageFitWidth").first().attr("src");
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
this.error = "Error: no image found";
|
||||
}
|
||||
|
||||
if (event_name == null) {
|
||||
this.event = null;
|
||||
throw new Exception();
|
||||
} else {
|
||||
this.event = new FbEvent(event_name, event_start, event_end, event_description, location, image_url);
|
||||
//this.event = new FbEvent("", "", "", "", "", "");
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
this.error = "Error: Scraping event data failed";
|
||||
}
|
||||
} catch (Exception e) {
|
||||
} catch (URISyntaxException | MalformedURLException e) {
|
||||
e.printStackTrace();
|
||||
this.error = "Error: URL not available";
|
||||
this.error = "Error: URL invalid.";
|
||||
} catch (JSONException e) {
|
||||
e.printStackTrace();
|
||||
this.error = "Error: Scraping event data failed";
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
this.error = "Error: Unable to connect.";
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
@ -152,11 +167,10 @@ public class FbScraper extends AsyncTask<Void, Void, Void> {
|
||||
super.onPostExecute(aVoid);
|
||||
|
||||
if (this.event != null) {
|
||||
this.main.update(event);
|
||||
}
|
||||
else {
|
||||
main.error(error);
|
||||
this.main.clear(false);
|
||||
main.get().update(event);
|
||||
} else {
|
||||
main.get().error(error);
|
||||
main.get().clear(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user