From 43913ccd2185da7f42572f709f7a052326499a1b Mon Sep 17 00:00:00 2001
From: akaessens <24660231+akaessens@users.noreply.github.com>
Date: Sun, 27 Sep 2020 14:23:45 +0200
Subject: [PATCH] scrape more events from pages

Follow the "has_more=1" pagination links of a page until the configured
event limit ("page_event_max") is reached. Paging stops as soon as a
request fails or the page links no further results, so doInBackground
cannot loop forever on the same URL.
---
 .../akdev/nofbeventscraper/FbPageScraper.java | 75 +++++++++++++------
 .../com/akdev/nofbeventscraper/FbScraper.java |  2 +-
 app/src/main/res/values-de/strings.xml        |  2 +
 app/src/main/res/values/strings.xml           |  4 +
 app/src/main/res/xml/root_preferences.xml     |  9 +++
 5 files changed, 67 insertions(+), 25 deletions(-)

diff --git a/app/src/main/java/com/akdev/nofbeventscraper/FbPageScraper.java b/app/src/main/java/com/akdev/nofbeventscraper/FbPageScraper.java
index 1417a6d..06b8b17 100644
--- a/app/src/main/java/com/akdev/nofbeventscraper/FbPageScraper.java
+++ b/app/src/main/java/com/akdev/nofbeventscraper/FbPageScraper.java
@@ -1,7 +1,10 @@
 package com.akdev.nofbeventscraper;
 
+import android.content.SharedPreferences;
 import android.os.AsyncTask;
 
+import androidx.preference.PreferenceManager;
+
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 
@@ -44,32 +47,56 @@ public class FbPageScraper extends AsyncTask {
 
     @Override
     protected Void doInBackground(Void... voids) {
-        try {
-            // use default android user agent
-            String user_agent = "Mozilla/5.0 (X11; Linux x86_64)";
-            Document document = Jsoup.connect(url).userAgent(user_agent).get();
 
-            if (document == null) {
-                throw new IOException();
+        // Follow the "has_more=1" links page by page. The loop ends when the
+        // event limit is reached, no further page is linked, or a request fails.
+        do {
+            try {
+                // use default android user agent
+                String user_agent = "Mozilla/5.0 (X11; Linux x86_64)";
+                Document document = Jsoup.connect(url).userAgent(user_agent).get();
+
+                if (document == null) {
+                    throw new IOException();
+                }
+
+                String regex = "(/events/[0-9]*)(/\\?event_time_id=[0-9]*)?";
+
+                List<String> event_links_href = document
+                        .getElementsByAttributeValueMatching("href", Pattern.compile(regex))
+                        .eachAttr("href");
+
+                for (String link : event_links_href) {
+                    this.event_links.add("https://www.facebook.com" + link);
+                }
+
+                SharedPreferences shared_prefs = PreferenceManager
+                        .getDefaultSharedPreferences(scraper.main.get());
+
+                int max = shared_prefs.getInt("page_event_max", 5);
+
+                // eachAttr() yields an empty list instead of the null that
+                // first() returns when the page has no "has_more=1" link
+                List<String> next_links = document
+                        .getElementsByAttributeValueMatching("href", "has_more=1")
+                        .eachAttr("href");
+
+                if (event_links.size() < max && !next_links.isEmpty()) {
+                    this.url = "https://mbasic.facebook.com" + next_links.get(0);
+                } else {
+                    this.url = null;
+                }
+
+            } catch (IOException e) {
+                e.printStackTrace();
+                this.error = R.string.error_connection;
+                this.url = null; // stop paging, otherwise the loop never ends
+            } catch (Exception e) {
+                e.printStackTrace();
+                this.error = R.string.error_unknown;
+                this.url = null; // stop paging, otherwise the loop never ends
             }
-
-            String regex = "(/events/[0-9]*)(/\\?event_time_id=[0-9]*)?";
-
-            List<String> event_links_href = document
-                    .getElementsByAttributeValueMatching("href", Pattern.compile(regex))
-                    .eachAttr("href");
-
-            for (String link : event_links_href) {
-                this.event_links.add("https://www.facebook.com" + link);
-            }
-
-        } catch (IOException e) {
-            e.printStackTrace();
-            this.error = R.string.error_connection;
-        } catch (Exception e) {
-            e.printStackTrace();
-            this.error = R.string.error_unknown;
-        }
+        } while (url != null);
 
         return null;
     }
diff --git a/app/src/main/java/com/akdev/nofbeventscraper/FbScraper.java
b/app/src/main/java/com/akdev/nofbeventscraper/FbScraper.java index 7a5639f..7b46c49 100644 --- a/app/src/main/java/com/akdev/nofbeventscraper/FbScraper.java +++ b/app/src/main/java/com/akdev/nofbeventscraper/FbScraper.java @@ -22,7 +22,7 @@ public class FbScraper { protected List tasks; url_type_enum url_type = url_type_enum.EVENT; private String input_url; - private WeakReference main; // no context leak with WeakReference + protected WeakReference main; // no context leak with WeakReference /** * Constructor with WeakReference to the main activity, to add events. diff --git a/app/src/main/res/values-de/strings.xml b/app/src/main/res/values-de/strings.xml index 5278d7c..754de86 100644 --- a/app/src/main/res/values-de/strings.xml +++ b/app/src/main/res/values-de/strings.xml @@ -20,4 +20,6 @@ "Veranstaltungen gelöscht " Fertig Rückgängig + Maximale Anzahl Events, die von einer einzelnen Seite geladen werden sollen. + Veranstaltungslimit für Seiten \ No newline at end of file diff --git a/app/src/main/res/values/strings.xml b/app/src/main/res/values/strings.xml index c804608..285e730 100644 --- a/app/src/main/res/values/strings.xml +++ b/app/src/main/res/values/strings.xml @@ -30,9 +30,13 @@ Clear event list Events list cleared + Maximum amount of events scraped from a single page link. + Page event limit + Placeholder Done Undo + diff --git a/app/src/main/res/xml/root_preferences.xml b/app/src/main/res/xml/root_preferences.xml index 4f0f632..d7d18e0 100644 --- a/app/src/main/res/xml/root_preferences.xml +++ b/app/src/main/res/xml/root_preferences.xml @@ -16,6 +16,15 @@ + +