scrape more events from pages

This commit is contained in:
akaessens 2020-09-27 14:23:45 +02:00
parent e549ca7676
commit 43913ccd21
5 changed files with 61 additions and 25 deletions

View File

@ -1,7 +1,10 @@
package com.akdev.nofbeventscraper;
import android.content.SharedPreferences;
import android.os.AsyncTask;
import androidx.preference.PreferenceManager;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
@ -44,32 +47,50 @@ public class FbPageScraper extends AsyncTask<Void, Void, Void> {
@Override
protected Void doInBackground(Void... voids) {
try {
// use default android user agent
String user_agent = "Mozilla/5.0 (X11; Linux x86_64)";
Document document = Jsoup.connect(url).userAgent(user_agent).get();
if (document == null) {
throw new IOException();
do {
try {
// use default android user agent
String user_agent = "Mozilla/5.0 (X11; Linux x86_64)";
Document document = Jsoup.connect(url).userAgent(user_agent).get();
if (document == null) {
throw new IOException();
}
String regex = "(/events/[0-9]*)(/\\?event_time_id=[0-9]*)?";
List<String> event_links_href = document
.getElementsByAttributeValueMatching("href", Pattern.compile(regex))
.eachAttr("href");
for (String link : event_links_href) {
this.event_links.add("https://www.facebook.com" + link);
}
SharedPreferences shared_prefs = PreferenceManager
.getDefaultSharedPreferences(scraper.main.get());
int max = shared_prefs.getInt("page_event_max", 5);
if (event_links.size() < max) {
String next_url = document
.getElementsByAttributeValueMatching("href", "has_more=1")
.first().attr("href");
this.url = "https://mbasic.facebook.com" + next_url;
} else {
url = null;
}
} catch (IOException e) {
e.printStackTrace();
this.error = R.string.error_connection;
} catch (Exception e) {
e.printStackTrace();
this.error = R.string.error_unknown;
}
String regex = "(/events/[0-9]*)(/\\?event_time_id=[0-9]*)?";
List<String> event_links_href = document
.getElementsByAttributeValueMatching("href", Pattern.compile(regex))
.eachAttr("href");
for (String link : event_links_href) {
this.event_links.add("https://www.facebook.com" + link);
}
} catch (IOException e) {
e.printStackTrace();
this.error = R.string.error_connection;
} catch (Exception e) {
e.printStackTrace();
this.error = R.string.error_unknown;
}
} while (url != null);
return null;
}

View File

@ -22,7 +22,7 @@ public class FbScraper {
protected List<AsyncTask> tasks;
url_type_enum url_type = url_type_enum.EVENT;
private String input_url;
private WeakReference<MainActivity> main; // no context leak with WeakReference
protected WeakReference<MainActivity> main; // no context leak with WeakReference
/**
* Constructor with WeakReference to the main activity, to add events.

View File

@ -20,4 +20,6 @@
<string name="preferences_event_snackbar">"Veranstaltungen gelöscht "</string>
<string name="done">Fertig</string>
<string name="undo">Rückgängig</string>
<string name="preferences_page_event_max_summary">Maximale Anzahl Events, die von einer einzelnen Seite geladen werden sollen.</string>
<string name="preferences_page_event_max">Veranstaltungslimit für Seiten</string>
</resources>

View File

@ -30,9 +30,13 @@
<string name="preferences_event_setting">Clear event list</string>
<string name="preferences_event_snackbar">Events list cleared</string>
<string name="preferences_page_event_max_summary">Maximum amount of events scraped from a single page link.</string>
<string name="preferences_page_event_max">Page event limit</string>
<!-- others -->
<string name="event_placeholder" translatable="false">Placeholder</string>
<string name="done">Done</string>
<string name="undo">Undo</string>
</resources>

View File

@ -16,6 +16,15 @@
<PreferenceCategory app:title="@string/preferences_events_header">
<SeekBarPreference
android:defaultValue="5"
app:showSeekBarValue="true"
app:min="5"
android:max="30"
android:summary="@string/preferences_page_event_max_summary"
android:key="page_event_max"
android:title="@string/preferences_page_event_max" />
<Preference
android:key="event_reset"
android:title="@string/preferences_event_setting" />