app-facebook-event-scraper/app/src/main/java/com/akdev/nofbeventscraper/FbScraper.java

package com.akdev.nofbeventscraper;

import android.content.SharedPreferences;
import android.os.AsyncTask;
import android.util.Log;

import androidx.preference.PreferenceManager;

import java.io.IOException;
import java.lang.ref.WeakReference;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Classifies an input url (shortened link, event link, page link or bare page name)
 * and starts the matching async scraper task. Results are reported back to the main
 * activity, which is held through a WeakReference and may therefore be gone by the
 * time a callback fires.
 */
public class FbScraper {

    // tag used for all debug output of this class
    private static final String LOG_TAG = "scraperLog";

    // Patterns are compiled once; the dots are escaped so that only the literal host names match.
    private static final Pattern SHORTENED_PATTERN =
            Pattern.compile("(fb\\.me/)(e/)?([^/?]*)|(facebook\\.com/event_invite/[a-zA-Z0-9]*)");
    private static final Pattern PAGE_PATTERN =
            Pattern.compile("(facebook\\.com/)(pg/)?([^/?]*)");
    private static final Pattern EVENT_PATTERN =
            Pattern.compile("(facebook\\.com/events/[0-9]*)(/\\?event_time_id=[0-9]*)?");

    protected List<AsyncTask> tasks;
    protected WeakReference<MainActivity> main; // no context leak with WeakReference
    url_type_enum url_type = url_type_enum.EVENT;
    private String input_url;

    /**
     * Constructor with WeakReference to the main activity, to add events.
     *
     * @param main      WeakReference of main activity to prevent context leak
     * @param input_url Input url to scrape from
     */
    FbScraper(WeakReference<MainActivity> main, String input_url) {
        this.main = main;
        this.input_url = input_url;
        this.tasks = new ArrayList<>();
    }

    /**
     * Resolves the weak activity reference.
     *
     * @return the main activity, or null if no reference was given or it has been cleared
     */
    private MainActivity getActivity() {
        return (main != null) ? main.get() : null;
    }

    /**
     * Shows a message through the main activity's input helper, if the activity still exists.
     *
     * @param res_id   string resource id of the message
     * @param is_error flag forwarded unchanged to input_helper
     */
    private void showInputHelper(int res_id, boolean is_error) {
        MainActivity activity = getActivity();
        if (activity == null) {
            Log.d(LOG_TAG, "showInputHelper: activity no longer available, message dropped");
            return;
        }
        activity.input_helper(activity.getString(res_id), is_error);
    }

    /**
     * Checks if valid URL and whether it is a shortened link (fb.me) or an event invite link,
     * and creates the matching mbasic url from it.
     *
     * @param url input URL
     * @return mbasic url built from the matched part of the input
     * @throws IOException        if the url is malformed
     * @throws URISyntaxException if the url is no valid URI or is not a shortened/invite link
     */
    protected String getShortened(String url) throws IOException, URISyntaxException {
        // check for url format
        new URL(url).toURI();

        Matcher matcher = SHORTENED_PATTERN.matcher(url);

        if (matcher.find()) {
            //only mbasic does have event ids displayed in HTML
            String url_prefix = "https://mbasic.";

            // create URL
            return url_prefix + matcher.group();

        } else {
            throw new URISyntaxException(url, "Does not contain page.");
        }

    }

    /**
     * Checks if valid URL,
     * strips the facebook page id from the input link and create an URL that can be scraped from.
     *
     * @param url input URL
     * @return new mbasic url that can be scraped for event id's
     * @throws URISyntaxException    if page not found
     * @throws MalformedURLException if the url is malformed
     */
    protected String getPageUrl(String url) throws URISyntaxException, MalformedURLException {

        // check for url format
        new URL(url).toURI();

        Matcher matcher = PAGE_PATTERN.matcher(url);

        if (matcher.find()) {
            //only mbasic does have event ids displayed in HTML
            String url_prefix = "https://mbasic.facebook.com/";
            String url_suffix = "?v=events";

            // create URL
            return url_prefix + matcher.group(3) + url_suffix;

        } else {
            throw new URISyntaxException(url, "Does not contain page.");
        }
    }

    /**
     * Strips the facebook event link from the input event url.
     *
     * @param url input url
     * @return facebook event url String if one was found
     * @throws URISyntaxException    if event not found
     * @throws MalformedURLException if the url is malformed
     */
    protected String getEventUrl(String url) throws URISyntaxException, MalformedURLException {

        // check for url format
        new URL(url).toURI();

        Matcher matcher = EVENT_PATTERN.matcher(url);

        if (matcher.find()) {

            String url_prefix = "https://m.";
            // only consult the preferences while the activity is alive; a cleared
            // reference would otherwise hand a null context to PreferenceManager
            MainActivity activity = getActivity();
            if (activity != null) {
                SharedPreferences shared_prefs = PreferenceManager.getDefaultSharedPreferences(activity);
                url_prefix = shared_prefs.getString("url_preference", url_prefix);
            }

            // rewrite url to m.facebook and dismiss any query strings or referrals
            String ret = url_prefix + matcher.group(1);
            if (matcher.group(2) != null) {
                // add event time identifier
                ret += matcher.group(2);
            }
            return ret;
        } else {
            throw new URISyntaxException(url, "Does not contain event.");
        }

    }

    /**
     * cancel vestigial async tasks and forget them
     */
    void killAllTasks() {

        for (AsyncTask task : tasks) {
            try {
                task.cancel(true);
            } catch (Exception e) {
                // best effort: keep cancelling the remaining tasks
                Log.w(LOG_TAG, "killAllTasks: could not cancel task", e);
            }
        }
        // drop the references, otherwise the list keeps growing with every run
        tasks.clear();
    }

    /**
     * start an EventScraper async task and add to tasks list
     *
     * @param event_url url of the event to scrape
     */
    void scrapeEvent(String event_url) {
        FbEventScraper scraper = new FbEventScraper(this, event_url);

        Log.d(LOG_TAG, "scrapeEvent: "+event_url);
        tasks.add(scraper);
        scraper.executeOnExecutor(AsyncTask.THREAD_POOL_EXECUTOR);
    }

    /**
     * Callback for finished EventSCraper async task
     *
     * @param event Contains event information if scraping successful
     * @param error resId for error message
     */
    void scrapeEventResultCallback(FbEvent event, int error) {

        if (event != null) {
            Log.d(LOG_TAG, "scrapeEventResultCallback: "+event.url);
            MainActivity activity = getActivity();
            if (activity == null) {
                // activity is gone, nobody left to receive the event
                return;
            }
            activity.addEvent(event);
            activity.input_helper(activity.getString(R.string.done), false);
        } else if (url_type == url_type_enum.EVENT) {
            showInputHelper(error, true);
        }
    }

    /**
     * start a page scraper and add to list of tasks
     *
     * @param page_url url of the page to scrape
     */
    void scrapePage(String page_url) {
        FbPageScraper scraper = new FbPageScraper(this, page_url);

        Log.d(LOG_TAG, "scrapePage: "+page_url);

        tasks.add(scraper);
        scraper.executeOnExecutor(AsyncTask.THREAD_POOL_EXECUTOR);
    }

    /**
     * Callback for page scraper async task
     *
     * @param event_urls List of event urls scraped from the page
     * @param error      resId of error message shown if the list is null or empty
     */
    protected void scrapePageResultCallback(List<String> event_urls, int error) {

        if (event_urls != null && !event_urls.isEmpty()) {
            Log.d(LOG_TAG, "scrapePageResultCallback: "+event_urls.toString());
            for (String event_url : event_urls) {
                try {
                    String url = getEventUrl(event_url);
                    Log.d(LOG_TAG, "scrapePageResultCallback: "+url);
                    scrapeEvent(url);
                } catch (URISyntaxException | MalformedURLException e) {
                    // ignore this event, but leave a trace for debugging
                    Log.d(LOG_TAG, "scrapePageResultCallback: skipping "+event_url, e);
                }
            }
        } else {
            showInputHelper(error, true);
        }
    }

    /**
     * start a redirection resolver for a shortened url and add to list of tasks
     *
     * @param url shortened url to resolve
     */
    protected void redirectUrl (String url) {
        FbRedirectionResolver resolver = new FbRedirectionResolver(this, url);

        Log.d(LOG_TAG, "redirectUrl: "+url);

        // track the resolver like the scrapers so killAllTasks can cancel it as well
        tasks.add(resolver);
        resolver.executeOnExecutor(AsyncTask.THREAD_POOL_EXECUTOR);
    }

    /**
     * Callback for the redirection resolver, restarts scraping with the given url
     *
     * @param url url to continue with
     */
    protected void redirectionResultCallback(String url) {
        this.input_url = url;

        Log.d(LOG_TAG, "redirectUrlCb: "+url);

        // now try again with expanded url
        this.run();
    }

    /**
     * Start scraping input url
     */
    void run() {

        // without any input the page-name fallback below would scrape a bogus page
        if (input_url == null || input_url.isEmpty()) {
            url_type = url_type_enum.INVALID;
            showInputHelper(R.string.error_url, true);
            return;
        }

        // check if shortened url
        try {
            String shortened = getShortened(input_url);
            url_type = url_type_enum.SHORT;
            redirectUrl(shortened);

            return;

        } catch (IOException | URISyntaxException e) {
            url_type = url_type_enum.INVALID;
        }

        // check if input url is an event
        try {
            String event_url = getEventUrl(input_url);
            url_type = url_type_enum.EVENT;
            scrapeEvent(event_url);

            return;

        } catch (URISyntaxException | MalformedURLException e) {
            url_type = url_type_enum.INVALID;
        }
        // check if input url is a page
        try {
            String page_url = getPageUrl(input_url);
            url_type = url_type_enum.PAGE;
            scrapePage(page_url);

            return;

        } catch (URISyntaxException | MalformedURLException e) {
            url_type = url_type_enum.INVALID;
        }
        // check if only page name without prefix
        try {
            String page_url = getPageUrl("https://mbasic.facebook.com/"+input_url);
            url_type = url_type_enum.PAGE;
            scrapePage(page_url);

        } catch (URISyntaxException | MalformedURLException e) {
            url_type = url_type_enum.INVALID;
            showInputHelper(R.string.error_url, true);
        }
    }

    // enum for storing url type in this class
    enum url_type_enum {SHORT, EVENT, PAGE, INVALID}
}