prepare event page scraping, better error/result handling
parent 2750ad86e8 · commit a30756a873

FbEventScraper.java (new file)
@@ -0,0 +1,221 @@
package com.akdev.nofbeventscraper;

import android.content.SharedPreferences;
import android.os.AsyncTask;

import androidx.preference.PreferenceManager;

import org.json.JSONException;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.lang.ref.WeakReference;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static com.akdev.nofbeventscraper.FbEvent.createEventList;

/**
 * This class can asynchronously scrape a public facebook event
 * and gather its most important information, which is stored in a FbEvent object.
 */
public class FbEventScraper extends AsyncTask<Void, Void, Void> {

    private FbScraper scraper;
    private int error;
    private String url;
    private FbEvent event;

    /**
     * Constructor with a reference to the calling FbScraper,
     * which receives the scraped event via callback.
     *
     * @param scraper   Reference to FbScraper
     * @param input_url Input url to scrape from
     */
    FbEventScraper(FbScraper scraper, String input_url) {

        this.scraper = scraper;
        this.url = input_url;
        this.error = 0;
    }

    /**
     * Strips the event location from the json string.
     * This can be a name only or a complete postal address.
     *
     * @param location_json JSON formatted string
     * @return String representation of the location.
     */
    protected String fixLocation(String location_json) {

        String location_name = "";

        try {
            JSONObject reader = new JSONObject(location_json);

            location_name = reader.getString("name");
            JSONObject address = reader.getJSONObject("address");

            String type = address.getString("@type");

            if (type.equals("PostalAddress")) {
                String postal_code = address.getString("postalCode");
                String address_locality = address.getString("addressLocality");
                String street_address = address.getString("streetAddress");
                // included in locality
                //String address_country = address.getString("addressCountry");

                return location_name + ", "
                        + street_address + ", "
                        + postal_code + " "
                        + address_locality;
            } else {
                return location_name;
            }

        } catch (JSONException e) {
            e.printStackTrace();
            return location_name;
        }
    }

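    // Editorial example, not part of the commit: for a location value like
    //   {"name":"Some Club","address":{"@type":"PostalAddress","postalCode":"12345",
    //    "addressLocality":"Berlin","streetAddress":"Mainstr. 1"}}
    // fixLocation() returns "Some Club, Mainstr. 1, 12345 Berlin"; if the address
    // is missing or not a PostalAddress, it falls back to the name alone.
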
    /**
     * Parses a time string from the facebook event into a Date.
     *
     * @param time_in time string from the event
     * @return Date parsed from input or null
     */
    protected Date parseToDate(String time_in) {

        try {
            // parse e.g. 2011-12-03T10:15:30+0100
            SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.getDefault());

            return sdf.parse(time_in);

        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

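    // Editorial note: the format above matches the JSON-LD timestamps of the event
    // page (e.g. "2011-12-03T10:15:30+0100"); on any parse failure the method logs
    // the exception and returns null rather than throwing.
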
    /**
     * Replaces all occurrences of facebook-internal links in
     * an event description with actual URLs.
     *
     * @param description_in description string from the event
     * @return corrected String with internal links resolved
     */
    protected String fixDescriptionLinks(String description_in) {
        try {
            /* @[152580919265:274:SiteDescription]
             * to
             * SiteDescription [m.facebook.com/152580919265] */

            return description_in.replaceAll("@\\[([0-9]{10,}):[0-9]{3}:([^]]*)]",
                    "$2 [m.facebook.com/$1]");

        } catch (Exception e) {
            e.printStackTrace();
            return description_in;
        }
    }

    /**
     * Read a single field from a JSONObject.
     *
     * @param reader JSONObject to read from
     * @param field  Which field to read
     * @return String value of the field, or an empty string if it is missing
     */
    private String readFromJson(JSONObject reader, String field) {
        try {
            return reader.getString(field);
        } catch (Exception e) {
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Started by execute().
     * Gets the HTML doc from the input url and scrapes the event information from it.
     *
     * @param voids unused
     * @return always null; the result is delivered via onPostExecute()
     */
    @Override
    protected Void doInBackground(Void... voids) {

        try {
            // use default android user agent
            String user_agent = "Mozilla/5.0 (X11; Linux x86_64)";
            Document document = Jsoup.connect(url).userAgent(user_agent).get();

            if (document == null) {
                this.error = R.string.error_connection;
                return null;
            }
            String json = document
                    .select("script[type = application/ld+json]")
                    .first().data();

            JSONObject reader = new JSONObject(json);

            String name = readFromJson(reader, "name");
            Date start_date = parseToDate(readFromJson(reader, "startDate"));
            Date end_date = parseToDate(readFromJson(reader, "endDate"));
            String description = fixDescriptionLinks(readFromJson(reader, "description"));
            String location = fixLocation(readFromJson(reader, "location"));

            String image_url = readFromJson(reader, "image"); // get from json

            try {
                // possibly get higher res image from event header
                image_url = document.select("div[id=event_header_primary]")
                        .select("img").first().attr("src");

            } catch (Exception e) {
                // ignore
            }

            this.event = new FbEvent(url, name, start_date, end_date, description, location, image_url);

        } catch (JSONException e) {
            e.printStackTrace();
            this.error = R.string.error_scraping;
        } catch (IOException e) {
            e.printStackTrace();
            this.error = R.string.error_connection;
        } catch (Exception e) {
            e.printStackTrace();
            this.error = R.string.error_unknown;
        }

        return null;
    }

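    // Editorial note: doInBackground() relies on the mobile event page embedding its
    // data as a schema.org JSON-LD block, roughly:
    //   <script type="application/ld+json">
    //     {"name":"...","startDate":"...","endDate":"...","description":"...",
    //      "location":{...},"image":"..."}
    //   </script>
    // All of the fields read above come from that single JSON object.
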
    @Override
    protected void onPreExecute() {
        super.onPreExecute();
    }

    /**
     * When scraping is finished, the scraper callback receives the event
     * and, in case of failure, the error string resource id.
     *
     * @param aVoid unused
     */
    @Override
    protected void onPostExecute(Void aVoid) {
        super.onPostExecute(aVoid);

        this.scraper.scrapeEventResultCallback(this.event, this.error);
    }
}

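Before moving on to FbScraper, note that the two string helpers above are easy to sanity-check outside Android. The snippet below is an editorial sketch, not part of the commit: plain Java reusing the exact regex from fixDescriptionLinks() and the date pattern from parseToDate(); the class name and sample values are invented for illustration.

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

public class ScraperHelpersDemo {

    public static void main(String[] args) throws ParseException {
        // facebook-internal link tag -> readable link, same rewrite as fixDescriptionLinks()
        String description = "Hosted by @[152580919265:274:SiteDescription] tonight.";
        String fixed = description.replaceAll(
                "@\\[([0-9]{10,}):[0-9]{3}:([^]]*)]",
                "$2 [m.facebook.com/$1]");
        // prints: Hosted by SiteDescription [m.facebook.com/152580919265] tonight.
        System.out.println(fixed);

        // same pattern parseToDate() uses for the JSON-LD timestamps
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.getDefault());
        Date start = sdf.parse("2011-12-03T10:15:30+0100");
        System.out.println(start);
    }
}
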
FbScraper.java
@@ -1,42 +1,29 @@
 package com.akdev.nofbeventscraper;
 
 import android.content.SharedPreferences;
 import android.os.AsyncTask;
 
 import androidx.preference.PreferenceManager;
 
 import org.json.JSONException;
 import org.json.JSONObject;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 
 import java.io.IOException;
 import java.lang.ref.WeakReference;
 import java.net.MalformedURLException;
 import java.net.URISyntaxException;
 import java.net.URL;
 import java.text.SimpleDateFormat;
 import java.util.Date;
 import java.util.List;
 import java.util.Locale;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import static com.akdev.nofbeventscraper.FbEvent.createEventList;
 
 /**
  * This class can asynchronously scrape public facebook events
  * and gather the most important information. It is stored in a FbEvent object.
  */
-public class FbScraper extends AsyncTask<Void, Void, Void> {
+public class FbScraper {
 
+    protected List<FbEvent> events;
+    url_type_enum url_type = url_type_enum.EVENT;
     private int error;
     private String input_url;
     private WeakReference<MainActivity> main; // no context leak with WeakReference
-    private List<FbEvent> events;
 
     /**
-     * Constructor with WeakReference to the main activity, to update it's text fields.
+     * Constructor with WeakReference to the main activity, to add events.
      *
     * @param main      WeakReference of main activity to prevent context leak
     * @param input_url Input url to scrape from
@@ -45,6 +32,12 @@ public class FbScraper extends AsyncTask<Void, Void, Void> {
         this.main = main;
         this.input_url = input_url;
         this.events = createEventList();
 
+        run();
     }
 
+    protected String getPageUrl(String url) throws URISyntaxException, MalformedURLException {
+        throw new URISyntaxException(url, "not implemented");
+    }
 
     /**
@@ -55,7 +48,7 @@ public class FbScraper extends AsyncTask<Void, Void, Void> {
      * @throws URISyntaxException if event not found
      * @throws MalformedURLException
      */
-    protected String fixURI(String url) throws URISyntaxException, MalformedURLException {
+    protected String getEventUrl(String url) throws URISyntaxException, MalformedURLException {
 
         // check for url format
         new URL(url).toURI();
@@ -76,6 +69,7 @@ public class FbScraper extends AsyncTask<Void, Void, Void> {
         // rewrite url to m.facebook and dismiss any query strings or referrals
         String ret = url_prefix + matcher.group(1);
+        if (matcher.group(2) != null) {
             // add event time identifier
             ret += matcher.group(2);
+        }
         return ret;
@@ -85,188 +79,68 @@ public class FbScraper extends AsyncTask<Void, Void, Void> {
 
     }
 
-    [fixLocation(), parseToDate(), fixDescriptionLinks() and readFromJson() removed
-     here: they moved, essentially unchanged, into the new FbEventScraper above]
-
-    /**
-     * Started by scraper.execute().
-     * Gets the HTML doc from the input string and scrapes the event information from it.
-     *
-     * @param voids
-     * @return
-     */
-    @Override
-    protected Void doInBackground(Void... voids) {
-
-        try {
-            String url = fixURI(input_url);
-            // use default android user agent
-            String user_agent = "Mozilla/5.0 (X11; Linux x86_64)";
-            Document document = Jsoup.connect(url).userAgent(user_agent).get();
-
-            if (document == null) {
-            }
-            String json = document
-                    .select("script[type = application/ld+json]")
-                    .first().data();
-
-            JSONObject reader = new JSONObject(json);
-
-            String name = readFromJson(reader, "name");
-            Date start_date = parseToDate(readFromJson(reader, "startDate"));
-            Date end_date = parseToDate(readFromJson(reader, "endDate"));
-            String description = fixDescriptionLinks(readFromJson(reader, "description"));
-            String location = fixLocation(readFromJson(reader, "location"));
-
-            String image_url = readFromJson(reader, "image"); // get from json
-
-            try {
-                // possibly get higher res image from event header
-                image_url = document.select("div[id=event_header_primary]")
-                        .select("img").first().attr("src");
-
-            } catch (Exception e) {
-                // ignore
-            }
-
-            FbEvent event = new FbEvent(url, name, start_date, end_date, description, location, image_url);
-            this.events.add(event);
-            this.events.add(new FbEvent());
-
-        } catch (URISyntaxException | MalformedURLException e) {
-            e.printStackTrace();
-            this.error = R.string.error_url;
-        } catch (JSONException e) {
-            e.printStackTrace();
-            this.error = R.string.error_scraping;
-        } catch (IOException e) {
-            e.printStackTrace();
-            this.error = R.string.error_connection;
-        } catch (Exception e) {
-            e.printStackTrace();
-            this.error = R.string.error_unknown;
-        }
-
-        return null;
-    }
-
-    @Override
-    protected void onPreExecute() {
-        super.onPreExecute();
-    }
-
-    /**
-     * When scraping is finished, main activity will be updated.
-     * If an error occurred, main activity is given an error string.
-     *
-     * @param aVoid
-     */
-    protected void onPostExecute(Void aVoid) {
-        super.onPostExecute(aVoid);
-
-        if (main != null) {
-            if (! this.events.isEmpty()) {
-                main.get().addEvents(this.events);
-            } else {
-                main.get().error(error);
-            }
-        }
-    }
+    void scrapeEvent(String event_url) {
+        FbEventScraper scraper = new FbEventScraper(this, event_url);
+        scraper.execute();
+    }
+
+    void scrapeEventResultCallback(FbEvent event, int error) {
+
+        if (url_type == url_type_enum.EVENT) {
+            if (event != null) {
+                main.get().addEvent(event);
+                main.get().input_helper(R.string.done, false);
+            } else {
+                main.get().input_helper(error, true);
+            }
+        } else {
+            main.get().addEvent(event);
+        }
+    }
+
+    void scrapePage(String page_url) {
+        /*
+        FbPageScraper scraper = new FbPageScraper(this, page_url);
+        scraper.execute();
+        */
+    }
+
+    protected void scrapePageResultCallback(String[] event_urls, int error) {
+
+        if (event_urls != null) {
+            for (String event_url : event_urls) {
+                scrapeEvent(event_url);
+            }
+        } else if (url_type == url_type_enum.PAGE) {
+            main.get().input_helper(error, true);
+        }
+    }
+
+    void run() {
+
+        try {
+            String event_url = getEventUrl(input_url);
+            url_type = url_type_enum.EVENT;
+            scrapeEvent(event_url);
+            return;
+
+        } catch (URISyntaxException | MalformedURLException e) {
+            url_type = url_type_enum.INVALID;
+        }
+
+        try {
+            String page_url = getPageUrl(input_url);
+            url_type = url_type_enum.PAGE;
+            scrapePage(page_url);
+
+        } catch (URISyntaxException | MalformedURLException e) {
+            url_type = url_type_enum.INVALID;
+            main.get().input_helper(R.string.error_url, true);
+        }
+    }
+
+    enum url_type_enum {EVENT, PAGE, INVALID}
 }
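scrapePage() above is deliberately stubbed out: FbPageScraper does not exist in this commit (hence the commented-out body and the getPageUrl() stub that always throws), but the stub plus scrapePageResultCallback(String[], int) already pin down its contract. The following is a hypothetical editorial sketch of such a class, not code from this repository; the actual scraping is left as a TODO.

package com.akdev.nofbeventscraper;

import android.os.AsyncTask;

// Hypothetical sketch, not part of this commit: a page scraper matching the
// contract that FbScraper.scrapePage() and scrapePageResultCallback() expect.
public class FbPageScraper extends AsyncTask<Void, Void, Void> {

    private final FbScraper scraper;
    private final String url;
    private String[] event_urls;
    private int error;

    FbPageScraper(FbScraper scraper, String url) {
        this.scraper = scraper;
        this.url = url;
    }

    @Override
    protected Void doInBackground(Void... voids) {
        // TODO: fetch the page's upcoming-events section with Jsoup and collect
        // links that match the event url pattern into event_urls; on failure,
        // leave event_urls null and set an error resource id instead.
        this.error = R.string.error_unknown;
        return null;
    }

    @Override
    protected void onPostExecute(Void aVoid) {
        super.onPostExecute(aVoid);
        // hand the result back to the coordinating FbScraper, mirroring FbEventScraper
        scraper.scrapePageResultCallback(event_urls, error);
    }
}
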
MainActivity.java
@@ -158,7 +158,7 @@ public class MainActivity extends AppCompatActivity {
                     startScraping();
                 } catch (Exception e) {
                     e.printStackTrace();
-                    error(R.string.error_clipboard_empty);
+                    input_helper(R.string.error_clipboard_empty, true);
                 }
             }
         });
@@ -166,13 +166,16 @@ public class MainActivity extends AppCompatActivity {
         /*
          * Error in input: clear input on click
          */
-        layout_uri_input.setErrorIconOnClickListener(new View.OnClickListener() {
+        View.OnClickListener listener = new View.OnClickListener() {
             @Override
             public void onClick(View view) {
-                layout_uri_input.setError(null);
+                input_helper(R.string.helper_add_link, true);
                 edit_text_uri_input.setText(null);
+                input_helper(R.string.helper_add_link, false);
             }
-        });
+        };
+        layout_uri_input.setErrorIconOnClickListener(listener);
+        layout_uri_input.setEndIconOnClickListener(listener);
 
 
         /*
@@ -213,30 +216,35 @@ public class MainActivity extends AppCompatActivity {
      */
     public void startScraping() {
 
-        error(null);
+        input_helper(null, false);
 
         String url = Objects.requireNonNull(edit_text_uri_input.getText()).toString();
 
         scraper = new FbScraper(new WeakReference<>(this), url);
-        scraper.execute();
     }
 
-    public void error(Integer resId) {
-        if (resId != null) {
-            layout_uri_input.setError(getString(resId));
-        } else {
+    public void input_helper(Integer resId, boolean error) {
+
+        String str = (resId != null) ? getString(resId) : " ";
+
+        if (error) {
+            layout_uri_input.setError(str);
+        }
+        else {
             layout_uri_input.setError(null);
+            layout_uri_input.setHelperText(str);
         }
     }
 
     /**
      * Adds new events to the start of the events list.
      *
-     * @param new_events the list of events that was scraped by FbScraper
+     * @param new_event the event that was scraped by FbScraper
      */
-    public void addEvents(List<FbEvent> new_events) {
+    public void addEvent(FbEvent new_event) {
 
-        if (new_events != null) {
-            this.events.addAll(0, new_events);
+        if (new_event != null) {
+            this.events.add(0, new_event);
             this.adapter.notifyDataSetChanged();
         }
     }
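Taken together with the FbScraper changes, the control flow after this commit is worth spelling out once. This is an editorial summary of the code above, not new code:

// Happy path after this commit:
// MainActivity.startScraping()
//   -> new FbScraper(new WeakReference<>(this), url)   // constructor now calls run()
//                                                      // itself, hence the removed
//                                                      // scraper.execute()
//   -> FbScraper.run(): getEventUrl(input_url), url_type = EVENT, scrapeEvent(event_url)
//   -> FbEventScraper.execute() ... doInBackground(): fetch page, parse JSON-LD into a FbEvent
//   -> FbEventScraper.onPostExecute(): scraper.scrapeEventResultCallback(event, error)
//   -> FbScraper: main.get().addEvent(event) and main.get().input_helper(R.string.done, false)
//      (or input_helper(error, true) when the event could not be scraped)
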
strings.xml (German)
@@ -18,4 +18,5 @@
     <string name="preferences_events_header">Veranstaltungen</string>
     <string name="preferences_event_setting">Veranstaltungsliste löschen</string>
     <string name="preferences_event_snackbar">"Veranstaltungen gelöscht "</string>
+    <string name="done">Fertig</string>
 </resources>
strings.xml (English)
@@ -30,5 +30,6 @@
     <string name="preferences_event_setting">Clear event list</string>
     <string name="preferences_event_snackbar">Events list cleared</string>
     <string name="event_placeholder" translatable="false">Placeholder</string>
+    <string name="done">Done</string>
 
 </resources>