Compare commits

...

7 Commits

Author SHA1 Message Date
akaessens 7fdfd38cdc Allow input of arbitrary page name
Document receiver will check for 404 error if page is invalid.
If page is valid just scrape it as if it was the full uri.

closes #34
2021-08-10 14:49:06 +02:00
akaessens b4d37fbc3f Correctly exit if error in scraping instead of endless loop 2021-08-10 14:25:43 +02:00
akaessens 2b035b6975 Update dependencies 2021-08-10 14:05:04 +02:00
akaessens 32da94c275 More logging
Signed-off-by: akaessens <24660231+akaessens@users.noreply.github.com>
2021-08-10 14:04:28 +02:00
akaessens 866889db27 Increase page max to 100 as it has turned out to be stable
Related #33
2021-08-10 12:48:17 +02:00
akaessens 6248e79021 Replace asyncTasks execute with actual parallel async execution
Using executeOnExecutor(AsyncTask.THREAD_POOL_EXECUTOR), all pages
are scraped in parallel. Related: #33

Note: as of Android 11, AsyncTask as a whole is deprecated and therefore
needs to be replaced in the future.
2021-08-10 12:43:43 +02:00
akaessens e8893fd712 Add some logging 2021-08-10 12:33:46 +02:00
6 changed files with 83 additions and 44 deletions

View File

@ -25,11 +25,11 @@ android {
dependencies {
// androidx
implementation 'androidx.coordinatorlayout:coordinatorlayout:1.1.0'
implementation 'androidx.recyclerview:recyclerview:1.2.0'
implementation 'androidx.recyclerview:recyclerview:1.2.1'
implementation 'androidx.cardview:cardview:1.0.0'
implementation 'androidx.navigation:navigation-fragment:2.3.5'
implementation 'androidx.navigation:navigation-ui:2.3.5'
implementation 'androidx.appcompat:appcompat:1.2.0'
implementation 'androidx.appcompat:appcompat:1.3.1'
implementation 'androidx.preference:preference:1.1.1'
implementation "androidx.webkit:webkit:1.4.0"
@ -37,10 +37,10 @@ dependencies {
implementation 'com.google.code.gson:gson:2.8.5'
// Theme
implementation 'com.google.android.material:material:1.3.0'
implementation 'com.google.android.material:material:1.4.0'
// Scraping
implementation 'org.jsoup:jsoup:1.13.1'
implementation 'org.jsoup:jsoup:1.14.1'
// Image loading and transforming
implementation 'com.squareup.picasso:picasso:2.71828'
@ -50,6 +50,6 @@ dependencies {
implementation 'jp.wasabeef:recyclerview-animators:3.0.0'
// tests
testImplementation 'junit:junit:4.12'
androidTestImplementation 'androidx.test.ext:junit:1.1.2'
androidTestImplementation 'androidx.test.espresso:espresso-core:3.3.0'
androidTestImplementation 'androidx.test.ext:junit:1.1.3'
androidTestImplementation 'androidx.test.espresso:espresso-core:3.4.0'
}

View File

@ -1,52 +1,58 @@
package com.akdev.nofbeventscraper;
import android.util.Log;
import org.jsoup.Connection;
import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class DocumentReceiver {
public static org.jsoup.nodes.Document getDocument(String url) {
public static org.jsoup.nodes.Document getDocument(String url) throws HttpStatusException, IOException {
org.jsoup.nodes.Document document;
// use default android user agent
String user_agent = "Mozilla/5.0 (X11; Linux x86_64)";
Log.d("scraperLog", "DocumentReceiver: " + url);
Connection connection = Jsoup.connect(url).userAgent(user_agent).followRedirects(true);
Connection.Response response = connection.execute();
document = response.parse();
Log.d("scraperLog", "Document title: " + document.title());
try {
// use default android user agent
String user_agent = "Mozilla/5.0 (X11; Linux x86_64)";
// accept cookies needed?
Element form = document.select("form[method=post]").first();
String action = form.attr("action");
Connection connection = Jsoup.connect(url).userAgent(user_agent).followRedirects(true);
List<String> names = form.select("input").eachAttr("name");
List<String> values = form.select("input").eachAttr("value");
Connection.Response response = connection.execute();
Map<String, String> data = new HashMap<String, String>();
document = response.parse();
try {
// accept cookies needed?
Element form = document.select("form[method=post]").first();
String action = form.attr("action");
List<String> names = form.select("input").eachAttr("name");
List<String> values = form.select("input").eachAttr("value");
Map<String, String> data = new HashMap<String, String>();
for (int i = 0; i < names.size(); i++) {
data.put(names.get(i), values.get(i));
}
document = connection.url("https://mbasic.facebook.com" + action)
.cookies(response.cookies())
.method(Connection.Method.POST)
.data(data)
.post();
} catch (Exception ignore) {
for (int i = 0; i < names.size(); i++) {
data.put(names.get(i), values.get(i));
}
} catch (Exception e) {
return null;
document = connection.url("https://mbasic.facebook.com" + action)
.cookies(response.cookies())
.method(Connection.Method.POST)
.data(data)
.post();
} catch (Exception ignore) {
}
return document;
}

View File

@ -1,9 +1,11 @@
package com.akdev.nofbeventscraper;
import android.os.AsyncTask;
import android.util.Log;
import org.json.JSONException;
import org.json.JSONObject;
import org.jsoup.HttpStatusException;
import org.jsoup.nodes.Document;
import java.io.IOException;
@ -143,9 +145,10 @@ public class FbEventScraper extends AsyncTask<Void, Void, Void> {
@Override
protected Void doInBackground(Void... voids) {
Document document = DocumentReceiver.getDocument(url);
Log.d("scraperLog", "doInBackground: "+url);
try {
Document document = DocumentReceiver.getDocument(url);
if (document == null) {
throw new IOException();
}
@ -188,7 +191,10 @@ public class FbEventScraper extends AsyncTask<Void, Void, Void> {
this.event = new FbEvent(url, name, start_date, end_date, description, location, image_url);
} catch (IOException e) {
} catch (HttpStatusException e) {
this.error = R.string.error_url;
}
catch (IOException e) {
e.printStackTrace();
this.error = R.string.error_connection;
} catch (Exception e) {

View File

@ -5,6 +5,7 @@ import android.os.AsyncTask;
import androidx.preference.PreferenceManager;
import org.jsoup.HttpStatusException;
import org.jsoup.nodes.Document;
import java.io.IOException;
@ -95,13 +96,17 @@ public class FbPageScraper extends AsyncTask<Void, Void, Void> {
url = null;
event_links = event_links.subList(0, max);
}
} catch (HttpStatusException e) {
this.error = R.string.error_url;
return null;
} catch (IOException e) {
e.printStackTrace();
this.error = R.string.error_connection;
return null;
} catch (Exception e) {
e.printStackTrace();
this.error = R.string.error_unknown;
return null;
}
} while (url != null);

View File

@ -2,6 +2,7 @@ package com.akdev.nofbeventscraper;
import android.content.SharedPreferences;
import android.os.AsyncTask;
import android.util.Log;
import androidx.preference.PreferenceManager;
@ -152,8 +153,10 @@ public class FbScraper {
*/
void scrapeEvent(String event_url) {
FbEventScraper scraper = new FbEventScraper(this, event_url);
Log.d("scraperLog", "scrapeEvent: "+event_url);
tasks.add(scraper);
scraper.execute();
scraper.executeOnExecutor(AsyncTask.THREAD_POOL_EXECUTOR);
}
/**
@ -165,6 +168,7 @@ public class FbScraper {
void scrapeEventResultCallback(FbEvent event, int error) {
if (event != null) {
Log.d("scraperLog", "scrapeEventResultCallback: "+event.url);
main.get().addEvent(event);
main.get().input_helper(main.get().getString(R.string.done), false);
} else if (url_type == url_type_enum.EVENT) {
@ -180,8 +184,10 @@ public class FbScraper {
void scrapePage(String page_url) {
FbPageScraper scraper = new FbPageScraper(this, page_url);
Log.d("scraperLog", "scrapePage: "+page_url);
tasks.add(scraper);
scraper.execute();
scraper.executeOnExecutor(AsyncTask.THREAD_POOL_EXECUTOR);
}
/**
@ -193,10 +199,11 @@ public class FbScraper {
protected void scrapePageResultCallback(List<String> event_urls, int error) {
if (event_urls.size() > 0) {
Log.d("scraperLog", "scrapePageResultCallback: "+event_urls.toString());
for (String event_url : event_urls) {
try {
String url = getEventUrl(event_url);
Log.d("scraperLog", "scrapePageResultCallback: "+url);
scrapeEvent(url);
} catch (URISyntaxException | MalformedURLException e) {
// ignore this event
@ -210,11 +217,15 @@ public class FbScraper {
protected void redirectUrl (String url) {
FbRedirectionResolver resolver = new FbRedirectionResolver(this, url);
resolver.execute();
Log.d("scraperLog", "redirectUrl: "+url);
resolver.executeOnExecutor(AsyncTask.THREAD_POOL_EXECUTOR);
}
protected void redirectionResultCallback(String url) {
this.input_url = url;
Log.d("scraperLog", "redirectUrlCb: "+url);
// now try again with expanded url
this.run();
}
@ -253,6 +264,17 @@ public class FbScraper {
url_type = url_type_enum.PAGE;
scrapePage(page_url);
return;
} catch (URISyntaxException | MalformedURLException e) {
url_type = url_type_enum.INVALID;
}
// check if only page name without prefix
try {
String page_url = getPageUrl("https://mbasic.facebook.com/"+input_url);
url_type = url_type_enum.PAGE;
scrapePage(page_url);
} catch (URISyntaxException | MalformedURLException e) {
url_type = url_type_enum.INVALID;
main.get().input_helper(main.get().getString(R.string.error_url), true);

View File

@ -20,7 +20,7 @@
android:defaultValue="5"
app:showSeekBarValue="true"
app:min="1"
android:max="30"
android:max="100"
android:summary="@string/preferences_page_event_max_summary"
android:key="page_event_max"
android:title="@string/preferences_page_event_max" />