124 lines
3.5 KiB
Java
124 lines
3.5 KiB
Java
package com.akdev.nofbeventscraper;
|
|
|
|
import android.content.SharedPreferences;
|
|
import android.os.AsyncTask;
|
|
|
|
import androidx.preference.PreferenceManager;
|
|
|
|
import org.jsoup.Jsoup;
|
|
import org.jsoup.nodes.Document;
|
|
|
|
import java.io.IOException;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
import java.util.regex.Pattern;
|
|
|
|
/**
|
|
* This class can asynchronously scrape public facebook pages for event ids
|
|
* It returns a String list of event urls
|
|
*/
|
|
public class FbPageScraper extends AsyncTask<Void, Void, Void> {
|
|
|
|
private FbScraper scraper;
|
|
private int error;
|
|
private String url;
|
|
private List<String> event_links = new ArrayList<String>();
|
|
|
|
/**
|
|
* Constructor with reference to scraper to return results.
|
|
*
|
|
* @param scraper Reference to FbScraper
|
|
* @param page_url Input url to scrape from
|
|
*/
|
|
FbPageScraper(FbScraper scraper, String page_url) {
|
|
|
|
this.scraper = scraper;
|
|
this.url = page_url;
|
|
this.error = 0;
|
|
}
|
|
|
|
/**
|
|
* Started by execute().
|
|
* Gets the HTML doc from the input string and scrapes the event links from it.
|
|
*
|
|
* @param voids
|
|
* @return
|
|
*/
|
|
@Override
|
|
protected Void doInBackground(Void... voids) {
|
|
|
|
|
|
do {
|
|
try {
|
|
// use default android user agent
|
|
String user_agent = "Mozilla/5.0 (X11; Linux x86_64)";
|
|
Document document = Jsoup.connect(url).userAgent(user_agent).get();
|
|
|
|
if (document == null) {
|
|
throw new IOException();
|
|
}
|
|
|
|
String regex = "(/events/[0-9]*)(/\\?event_time_id=[0-9]*)?";
|
|
|
|
List<String> event_links_href = document
|
|
.getElementsByAttributeValueMatching("href", Pattern.compile(regex))
|
|
.eachAttr("href");
|
|
|
|
for (String link : event_links_href) {
|
|
this.event_links.add("https://www.facebook.com" + link);
|
|
}
|
|
|
|
SharedPreferences shared_prefs = PreferenceManager
|
|
.getDefaultSharedPreferences(scraper.main.get());
|
|
|
|
int max = shared_prefs.getInt("page_event_max", 5);
|
|
|
|
if (event_links.size() < max) {
|
|
|
|
try {
|
|
String next_url = document
|
|
.getElementsByAttributeValueMatching("href", "has_more=1")
|
|
.first().attr("href");
|
|
|
|
this.url = "https://mbasic.facebook.com" + next_url;
|
|
} catch (NullPointerException e) {
|
|
url = null;
|
|
event_links = event_links.subList(0, max);
|
|
}
|
|
|
|
|
|
} else {
|
|
url = null;
|
|
event_links = event_links.subList(0, max);
|
|
}
|
|
|
|
} catch (IOException e) {
|
|
e.printStackTrace();
|
|
this.error = R.string.error_connection;
|
|
} catch (Exception e) {
|
|
e.printStackTrace();
|
|
this.error = R.string.error_unknown;
|
|
}
|
|
} while (url != null);
|
|
|
|
return null;
|
|
}
|
|
|
|
@Override
|
|
protected void onPreExecute() {
|
|
super.onPreExecute();
|
|
}
|
|
|
|
/**
|
|
* When scraping is finished, the scraper callback will receive the link list.
|
|
*
|
|
* @param aVoid
|
|
*/
|
|
protected void onPostExecute(Void aVoid) {
|
|
super.onPostExecute(aVoid);
|
|
|
|
this.scraper.scrapePageResultCallback(this.event_links, this.error);
|
|
}
|
|
}
|
|
|