Allow input of arbitrary page name

The document receiver will check for a 404 error in case the page is invalid.
If the page is valid, it is scraped as if the full URI had been entered.

closes #34
This commit is contained in:
akaessens 2021-08-10 14:46:44 +02:00
parent b4d37fbc3f
commit 7fdfd38cdc
4 changed files with 55 additions and 39 deletions

View File

@ -3,57 +3,56 @@ package com.akdev.nofbeventscraper;
import android.util.Log;
import org.jsoup.Connection;
import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class DocumentReceiver {
public static org.jsoup.nodes.Document getDocument(String url) {
public static org.jsoup.nodes.Document getDocument(String url) throws HttpStatusException, IOException {
org.jsoup.nodes.Document document;
// use default android user agent
String user_agent = "Mozilla/5.0 (X11; Linux x86_64)";
Log.d("scraperLog", "DocumentReceiver: " + url);
Connection connection = Jsoup.connect(url).userAgent(user_agent).followRedirects(true);
Connection.Response response = connection.execute();
document = response.parse();
Log.d("scraperLog", "Document title: " + document.title());
try {
// use default android user agent
String user_agent = "Mozilla/5.0 (X11; Linux x86_64)";
// accept cookies needed?
Element form = document.select("form[method=post]").first();
String action = form.attr("action");
Log.d("scraperLog", "DocumentReceiver: " + url);
List<String> names = form.select("input").eachAttr("name");
List<String> values = form.select("input").eachAttr("value");
Connection connection = Jsoup.connect(url).userAgent(user_agent).followRedirects(true);
Map<String, String> data = new HashMap<String, String>();
Connection.Response response = connection.execute();
document = response.parse();
Log.d("scraperLog", "Document title: " + document.title());
try {
// accept cookies needed?
Element form = document.select("form[method=post]").first();
String action = form.attr("action");
List<String> names = form.select("input").eachAttr("name");
List<String> values = form.select("input").eachAttr("value");
Map<String, String> data = new HashMap<String, String>();
for (int i = 0; i < names.size(); i++) {
data.put(names.get(i), values.get(i));
}
document = connection.url("https://mbasic.facebook.com" + action)
.cookies(response.cookies())
.method(Connection.Method.POST)
.data(data)
.post();
} catch (Exception ignore) {
for (int i = 0; i < names.size(); i++) {
data.put(names.get(i), values.get(i));
}
} catch (Exception e) {
e.printStackTrace();
return null;
document = connection.url("https://mbasic.facebook.com" + action)
.cookies(response.cookies())
.method(Connection.Method.POST)
.data(data)
.post();
} catch (Exception ignore) {
}
return document;
}

View File

@ -5,6 +5,7 @@ import android.util.Log;
import org.json.JSONException;
import org.json.JSONObject;
import org.jsoup.HttpStatusException;
import org.jsoup.nodes.Document;
import java.io.IOException;
@ -146,9 +147,8 @@ public class FbEventScraper extends AsyncTask<Void, Void, Void> {
Log.d("scraperLog", "doInBackground: "+url);
Document document = DocumentReceiver.getDocument(url);
try {
Document document = DocumentReceiver.getDocument(url);
if (document == null) {
throw new IOException();
}
@ -191,7 +191,10 @@ public class FbEventScraper extends AsyncTask<Void, Void, Void> {
this.event = new FbEvent(url, name, start_date, end_date, description, location, image_url);
} catch (IOException e) {
} catch (HttpStatusException e) {
this.error = R.string.error_url;
}
catch (IOException e) {
e.printStackTrace();
this.error = R.string.error_connection;
} catch (Exception e) {

View File

@ -5,6 +5,7 @@ import android.os.AsyncTask;
import androidx.preference.PreferenceManager;
import org.jsoup.HttpStatusException;
import org.jsoup.nodes.Document;
import java.io.IOException;
@ -95,7 +96,9 @@ public class FbPageScraper extends AsyncTask<Void, Void, Void> {
url = null;
event_links = event_links.subList(0, max);
}
} catch (HttpStatusException e) {
this.error = R.string.error_url;
return null;
} catch (IOException e) {
e.printStackTrace();
this.error = R.string.error_connection;

View File

@ -264,6 +264,17 @@ public class FbScraper {
url_type = url_type_enum.PAGE;
scrapePage(page_url);
return;
} catch (URISyntaxException | MalformedURLException e) {
url_type = url_type_enum.INVALID;
}
// check if only page name without prefix
try {
String page_url = getPageUrl("https://mbasic.facebook.com/"+input_url);
url_type = url_type_enum.PAGE;
scrapePage(page_url);
} catch (URISyntaxException | MalformedURLException e) {
url_type = url_type_enum.INVALID;
main.get().input_helper(main.get().getString(R.string.error_url), true);