app-facebook-event-scraper/app/src/main/java/com/akdev/nofbeventscraper/FbScraper.java

286 lines
8.7 KiB
Java

package com.akdev.nofbeventscraper;
import android.content.SharedPreferences;
import android.os.AsyncTask;
import android.util.Log;
import androidx.preference.PreferenceManager;
import java.io.IOException;
import java.lang.ref.WeakReference;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Entry point for scraping facebook events.
 *
 * Classifies the input url (shortened link, single event, page, or bare page name), starts the
 * matching async task and forwards the results to the main activity.
 */
public class FbScraper {

    /** Tag used for all debug output of this class. */
    private static final String LOG_TAG = "scraperLog";

    // Patterns are compiled once; the dots of the host names are escaped so that they only
    // match a literal '.' and not an arbitrary character.
    private static final Pattern SHORT_URL_PATTERN = Pattern.compile(
            "(fb\\.me/)(e/)?([^/?]*)|(facebook\\.com/event_invite/[a-zA-Z0-9]*)");
    private static final Pattern PAGE_URL_PATTERN = Pattern.compile(
            "(facebook\\.com/)(pg/)?([^/?]*)");
    private static final Pattern EVENT_URL_PATTERN = Pattern.compile(
            "(facebook\\.com/events/[0-9]*)(/\\?event_time_id=[0-9]*)?");

    /** All async tasks started by this scraper that can be cancelled by {@link #killAllTasks()}. */
    protected List<AsyncTask> tasks;
    /** Main activity; held weakly to prevent a context leak. The reference itself may be null. */
    protected WeakReference<MainActivity> main;
    url_type_enum url_type = url_type_enum.EVENT;
    private String input_url;

    /**
     * Constructor with WeakReference to the main activity, to add events.
     *
     * @param main      WeakReference of main activity to prevent context leak
     * @param input_url Input url to scrape from
     */
    FbScraper(WeakReference<MainActivity> main, String input_url) {
        this.main = main;
        this.input_url = input_url;
        this.tasks = new ArrayList<>();
    }

    /**
     * Checks that the given string is a well-formed URL that can also be converted to a URI.
     *
     * @param url string to check
     * @throws MalformedURLException if the string cannot be parsed as URL
     * @throws URISyntaxException    if the URL cannot be converted to a URI
     */
    private static void checkUrlFormat(String url)
            throws MalformedURLException, URISyntaxException {
        new URL(url).toURI();
    }

    /**
     * Returns the main activity, or null if there is no reference or it was already cleared.
     *
     * @return the main activity or null
     */
    private MainActivity activityOrNull() {
        return (main == null) ? null : main.get();
    }

    /**
     * Shows a string resource through the input helper of the main activity.
     * Does nothing except logging if the activity is no longer available.
     *
     * @param res_id   resId of the message to show
     * @param is_error flag that is handed to the input helper together with the message
     */
    private void showMessage(int res_id, boolean is_error) {
        MainActivity activity = activityOrNull();
        if (activity == null) {
            Log.d(LOG_TAG, "showMessage: main activity is gone, message dropped");
            return;
        }
        activity.input_helper(activity.getString(res_id), is_error);
    }

    /**
     * Checks if valid URL and looks for a shortened link (fb.me or event_invite).
     *
     * @param url input URL
     * @return the matched part of the link, prefixed with "https://mbasic."
     * @throws URISyntaxException if no shortened link was found
     * @throws IOException        if the input is not a well-formed URL
     */
    protected String getShortened(String url) throws IOException, URISyntaxException {
        checkUrlFormat(url);

        Matcher matcher = SHORT_URL_PATTERN.matcher(url);
        if (!matcher.find()) {
            throw new URISyntaxException(url, "Does not contain shortened url.");
        }

        // only mbasic does have event ids displayed in HTML
        return "https://mbasic." + matcher.group();
    }

    /**
     * Checks if valid URL,
     * strips the facebook page id from the input link and creates a URL that can be scraped from.
     *
     * @param url input URL
     * @return new mbasic url that can be scraped for event id's
     * @throws URISyntaxException    if page not found
     * @throws MalformedURLException if the input is not a well-formed URL
     */
    protected String getPageUrl(String url) throws URISyntaxException, MalformedURLException {
        checkUrlFormat(url);

        Matcher matcher = PAGE_URL_PATTERN.matcher(url);
        if (!matcher.find()) {
            throw new URISyntaxException(url, "Does not contain page.");
        }

        // only mbasic does have event ids displayed in HTML; group 3 is the page name
        return "https://mbasic.facebook.com/" + matcher.group(3) + "?v=events";
    }

    /**
     * Strips the facebook event link from the input event url.
     *
     * @param url input url
     * @return facebook event url String if one was found
     * @throws URISyntaxException    if event not found
     * @throws MalformedURLException if the input is not a well-formed URL
     */
    protected String getEventUrl(String url) throws URISyntaxException, MalformedURLException {
        checkUrlFormat(url);

        Matcher matcher = EVENT_URL_PATTERN.matcher(url);
        if (!matcher.find()) {
            throw new URISyntaxException(url, "Does not contain event.");
        }

        String url_prefix = "https://m.";
        // the preference can only be read while the activity is alive, otherwise keep the default
        MainActivity activity = activityOrNull();
        if (activity != null) {
            SharedPreferences shared_prefs =
                    PreferenceManager.getDefaultSharedPreferences(activity);
            url_prefix = shared_prefs.getString("url_preference", url_prefix);
        }

        // rewrite url with the prefix and dismiss any query strings or referrals
        String event_url = url_prefix + matcher.group(1);

        String event_time = matcher.group(2);
        if (event_time != null) {
            // add event time identifier
            event_url += event_time;
        }
        return event_url;
    }

    /**
     * Cancel vestigial async tasks and forget about them.
     */
    void killAllTasks() {
        for (AsyncTask task : tasks) {
            if (task != null) {
                task.cancel(true);
            }
        }
        // drop the references so cancelled tasks are not kept around
        tasks.clear();
    }

    /**
     * Start an EventScraper async task and add it to the tasks list.
     *
     * @param event_url url of the event to scrape
     */
    void scrapeEvent(String event_url) {
        FbEventScraper scraper = new FbEventScraper(this, event_url);
        Log.d(LOG_TAG, "scrapeEvent: " + event_url);
        tasks.add(scraper);
        scraper.executeOnExecutor(AsyncTask.THREAD_POOL_EXECUTOR);
    }

    /**
     * Callback for finished EventScraper async task.
     * A failed event is only reported if the input itself was an event url.
     *
     * @param event Contains event information if scraping successful
     * @param error resId for error message
     */
    void scrapeEventResultCallback(FbEvent event, int error) {
        MainActivity activity = activityOrNull();
        if (activity == null) {
            Log.d(LOG_TAG, "scrapeEventResultCallback: main activity is gone, result dropped");
            return;
        }

        if (event != null) {
            Log.d(LOG_TAG, "scrapeEventResultCallback: " + event.url);
            activity.addEvent(event);
            activity.input_helper(activity.getString(R.string.done), false);
        } else if (url_type == url_type_enum.EVENT) {
            activity.input_helper(activity.getString(error), true);
        }
    }

    /**
     * Start a page scraper and add it to the list of tasks.
     *
     * @param page_url url of the page to scrape
     */
    void scrapePage(String page_url) {
        FbPageScraper scraper = new FbPageScraper(this, page_url);
        Log.d(LOG_TAG, "scrapePage: " + page_url);
        tasks.add(scraper);
        scraper.executeOnExecutor(AsyncTask.THREAD_POOL_EXECUTOR);
    }

    /**
     * Callback for page scraper async task.
     * Starts an event scraper for every event url that can be parsed.
     *
     * @param event_urls List of event urls scraped from the page
     * @param error      resId of error message that is shown if the list is null or empty
     */
    protected void scrapePageResultCallback(List<String> event_urls, int error) {
        if (event_urls == null || event_urls.isEmpty()) {
            showMessage(error, true);
            return;
        }

        Log.d(LOG_TAG, "scrapePageResultCallback: " + event_urls.toString());
        for (String event_url : event_urls) {
            try {
                String url = getEventUrl(event_url);
                Log.d(LOG_TAG, "scrapePageResultCallback: " + url);
                scrapeEvent(url);
            } catch (URISyntaxException | MalformedURLException ignored) {
                // a single unusable link must not stop the remaining events
                Log.d(LOG_TAG, "scrapePageResultCallback: ignoring " + event_url);
            }
        }
    }

    /**
     * Start a redirection resolver for a shortened url and add it to the list of tasks,
     * so that it is cancelled by {@link #killAllTasks()} like the other tasks.
     *
     * @param url shortened url to resolve
     */
    protected void redirectUrl(String url) {
        FbRedirectionResolver resolver = new FbRedirectionResolver(this, url);
        Log.d(LOG_TAG, "redirectUrl: " + url);
        tasks.add(resolver);
        resolver.executeOnExecutor(AsyncTask.THREAD_POOL_EXECUTOR);
    }

    /**
     * Callback for the redirection resolver: stores the url as new input and runs again.
     *
     * @param url url that replaces the current input url
     */
    protected void redirectionResultCallback(String url) {
        this.input_url = url;
        Log.d(LOG_TAG, "redirectUrlCb: " + url);
        // now try again with expanded url
        this.run();
    }

    /**
     * Start scraping input url.
     *
     * The input is tried, in this order, as shortened url, event url, page url and finally as
     * bare page name. If nothing fits, the invalid-url error is shown.
     */
    void run() {
        // without any input there is nothing to classify; do not fall through to the
        // bare page name attempt, which would otherwise build a url from nothing
        if (input_url == null || input_url.trim().isEmpty()) {
            url_type = url_type_enum.INVALID;
            showMessage(R.string.error_url, true);
            return;
        }

        // check if shortened url
        try {
            String shortened = getShortened(input_url);
            url_type = url_type_enum.SHORT;
            redirectUrl(shortened);
            return;
        } catch (IOException | URISyntaxException ignored) {
            url_type = url_type_enum.INVALID;
        }

        // check if input url is an event
        try {
            String event_url = getEventUrl(input_url);
            url_type = url_type_enum.EVENT;
            scrapeEvent(event_url);
            return;
        } catch (URISyntaxException | MalformedURLException ignored) {
            url_type = url_type_enum.INVALID;
        }

        // check if input url is a page
        try {
            String page_url = getPageUrl(input_url);
            url_type = url_type_enum.PAGE;
            scrapePage(page_url);
            return;
        } catch (URISyntaxException | MalformedURLException ignored) {
            url_type = url_type_enum.INVALID;
        }

        // check if only page name without prefix
        try {
            String page_url = getPageUrl("https://mbasic.facebook.com/" + input_url);
            url_type = url_type_enum.PAGE;
            scrapePage(page_url);
        } catch (URISyntaxException | MalformedURLException ignored) {
            url_type = url_type_enum.INVALID;
            showMessage(R.string.error_url, true);
        }
    }

    // enum for storing url type in this class
    enum url_type_enum {SHORT, EVENT, PAGE, INVALID}
}