package com.akdev.nofbeventscraper;

import android.content.SharedPreferences;
import android.os.AsyncTask;

import androidx.preference.PreferenceManager;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

/**
 * Asynchronously scrapes a public Facebook page for event links.
 *
 * <p>The page at the given URL is fetched, every {@code href} that looks like an event link is
 * collected, and "has_more" pagination links are followed until the configured maximum number of
 * events ({@code page_event_max} preference, default 5) has been reached or no further page is
 * offered. The collected links and an error code (0 when no error occurred) are handed to
 * {@link FbScraper#scrapePageResultCallback} once the task finishes.
 */
public class FbPageScraper extends AsyncTask<Void, Void, Void> {

    /** Matches event hrefs such as {@code /events/123} or {@code /events/123/?event_time_id=456}. */
    private static final Pattern EVENT_LINK_PATTERN =
            Pattern.compile("(/events/[0-9]*)(/\\?event_time_id=[0-9]*)?");

    private final FbScraper scraper;
    private final List<String> event_links = new ArrayList<>();
    private int error;
    private String url;

    /**
     * Constructor with reference to scraper to return results.
     *
     * @param scraper  Reference to FbScraper that receives the result callback
     * @param page_url Input url to scrape from
     */
    FbPageScraper(FbScraper scraper, String page_url) {
        this.scraper = scraper;
        this.url = page_url;
        this.error = 0;
    }

    /**
     * Started by execute().
     * Fetches the HTML document for the current url, collects the event links from it and
     * follows the "has_more" link to the next page while fewer than the configured maximum
     * number of links have been found.
     *
     * <p>On any failure the matching error resource id is stored and scraping stops; links
     * collected up to that point are kept.
     *
     * @param voids unused
     * @return always {@code null}; results are delivered in {@link #onPostExecute(Void)}
     */
    @Override
    protected Void doInBackground(Void... voids) {
        do {
            try {
                // generic desktop Linux user agent
                String user_agent = "Mozilla/5.0 (X11; Linux x86_64)";

                Document document = Jsoup.connect(url).userAgent(user_agent).get();
                if (document == null) {
                    throw new IOException();
                }

                List<String> event_links_href = document
                        .getElementsByAttributeValueMatching("href", EVENT_LINK_PATTERN)
                        .eachAttr("href");
                for (String link : event_links_href) {
                    this.event_links.add("https://www.facebook.com" + link);
                }

                SharedPreferences shared_prefs = PreferenceManager
                        .getDefaultSharedPreferences(scraper.main.get());
                int max = shared_prefs.getInt("page_event_max", 5);

                // Only continue when more events are wanted AND the page offers a next page.
                this.url = (event_links.size() < max) ? findNextPageUrl(document) : null;

                if (this.url == null) {
                    truncateEventLinks(max);
                }
            } catch (IOException e) {
                e.printStackTrace();
                this.error = R.string.error_connection;
                // Stop here: retrying the same url would loop forever while the error persists.
                this.url = null;
            } catch (Exception e) {
                e.printStackTrace();
                this.error = R.string.error_unknown;
                this.url = null;
            }
        } while (url != null);

        return null;
    }

    /**
     * Looks for the pagination link (an href containing {@code has_more=1}) in the document.
     *
     * @param document parsed page
     * @return absolute mbasic url of the next page, or {@code null} if the page has none
     */
    private static String findNextPageUrl(Document document) {
        Element next_link = document
                .getElementsByAttributeValueMatching("href", "has_more=1")
                .first();
        if (next_link == null) {
            return null;
        }
        return "https://mbasic.facebook.com" + next_link.attr("href");
    }

    /**
     * Drops all collected links beyond the first {@code max} entries. Does nothing when fewer
     * links than {@code max} were collected; a non-positive {@code max} empties the list.
     *
     * @param max maximum number of links to keep
     */
    private void truncateEventLinks(int max) {
        int limit = Math.max(0, max);
        if (event_links.size() > limit) {
            event_links.subList(limit, event_links.size()).clear();
        }
    }

    @Override
    protected void onPreExecute() {
        super.onPreExecute();
    }

    /**
     * When scraping is finished, the scraper callback will receive the link list
     * together with the error code (0 if no error was recorded).
     *
     * @param aVoid unused
     */
    @Override
    protected void onPostExecute(Void aVoid) {
        super.onPostExecute(aVoid);
        this.scraper.scrapePageResultCallback(this.event_links, this.error);
    }
}