Allow input of arbitrary page name
DocumentReceiver checks for a 404 error if the page is invalid; if the page is valid, it is scraped as if the full URI had been entered. closes #34
This commit is contained in:
parent b4d37fbc3f
commit 7fdfd38cdc
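In brief: whatever the user types is now also tried as a bare page name, prefixed with https://mbasic.facebook.com/. jsoup's default connection settings then turn an HTTP error status such as 404 into an HttpStatusException, which marks the name as invalid, while a valid page is scraped exactly as if the full URI had been entered. A minimal standalone sketch of that probe, assuming only jsoup on the classpath (class name and page name here are made up):

    import org.jsoup.HttpStatusException;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;

    import java.io.IOException;

    public class PageNameProbe {
        public static void main(String[] args) {
            String name = "somepage"; // hypothetical user input without any prefix
            try {
                // jsoup throws HttpStatusException for HTTP error statuses by default
                Document doc = Jsoup.connect("https://mbasic.facebook.com/" + name)
                        .userAgent("Mozilla/5.0 (X11; Linux x86_64)")
                        .followRedirects(true)
                        .execute()
                        .parse();
                System.out.println("valid page: " + doc.title()); // scrape as full URI
            } catch (HttpStatusException e) {
                System.out.println("invalid page, HTTP " + e.getStatusCode()); // e.g. 404
            } catch (IOException e) {
                System.out.println("connection problem");
            }
        }
    }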
DocumentReceiver.java
@@ -3,57 +3,56 @@ package com.akdev.nofbeventscraper;
 import android.util.Log;
 
 import org.jsoup.Connection;
+import org.jsoup.HttpStatusException;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 
+import java.io.IOException;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
 public class DocumentReceiver {
 
-    public static org.jsoup.nodes.Document getDocument(String url) {
+    public static org.jsoup.nodes.Document getDocument(String url) throws HttpStatusException, IOException {
 
         org.jsoup.nodes.Document document;
 
-        try {
-            // use default android user agent
-            String user_agent = "Mozilla/5.0 (X11; Linux x86_64)";
-
-            Log.d("scraperLog", "DocumentReceiver: " + url);
-
-            Connection connection = Jsoup.connect(url).userAgent(user_agent).followRedirects(true);
-
-            Connection.Response response = connection.execute();
-
-            document = response.parse();
-
-            Log.d("scraperLog", "Document title: " + document.title());
-
-            try {
-                // accept cookies needed?
-                Element form = document.select("form[method=post]").first();
-                String action = form.attr("action");
-
-                List<String> names = form.select("input").eachAttr("name");
-                List<String> values = form.select("input").eachAttr("value");
-
-                Map<String, String> data = new HashMap<String, String>();
-
-                for (int i = 0; i < names.size(); i++) {
-                    data.put(names.get(i), values.get(i));
-                }
-
-                document = connection.url("https://mbasic.facebook.com" + action)
-                        .cookies(response.cookies())
-                        .method(Connection.Method.POST)
-                        .data(data)
-                        .post();
-
-            } catch (Exception ignore) {
-            }
-        } catch (Exception e) {
-            e.printStackTrace();
-            return null;
-        }
+        // use default android user agent
+        String user_agent = "Mozilla/5.0 (X11; Linux x86_64)";
+
+        Log.d("scraperLog", "DocumentReceiver: " + url);
+
+        Connection connection = Jsoup.connect(url).userAgent(user_agent).followRedirects(true);
+
+        Connection.Response response = connection.execute();
+
+        document = response.parse();
+
+        Log.d("scraperLog", "Document title: " + document.title());
+
+        try {
+            // accept cookies needed?
+            Element form = document.select("form[method=post]").first();
+            String action = form.attr("action");
+
+            List<String> names = form.select("input").eachAttr("name");
+            List<String> values = form.select("input").eachAttr("value");
+
+            Map<String, String> data = new HashMap<String, String>();
+
+            for (int i = 0; i < names.size(); i++) {
+                data.put(names.get(i), values.get(i));
+            }
+
+            document = connection.url("https://mbasic.facebook.com" + action)
+                    .cookies(response.cookies())
+                    .method(Connection.Method.POST)
+                    .data(data)
+                    .post();
+
+        } catch (Exception ignore) {
+        }
         return document;
     }
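The important change above: getDocument() no longer catches every failure and returns null. HttpStatusException (a 404 for a nonexistent page) and other IOExceptions now propagate, so the callers patched below can map them to distinct error messages. A hedged sketch of the resulting caller contract; DocumentReceiver is the class from this diff, everything else is illustrative:

    import org.jsoup.HttpStatusException;
    import org.jsoup.nodes.Document;

    import java.io.IOException;

    public class GetDocumentDemo {
        public static void main(String[] args) {
            try {
                // hypothetical event URL; any HTTP error status propagates
                Document d = DocumentReceiver.getDocument("https://mbasic.facebook.com/events/123");
                System.out.println(d.title());
            } catch (HttpStatusException e) {
                System.out.println("invalid page, HTTP " + e.getStatusCode());
            } catch (IOException e) {
                System.out.println("connection failed");
            }
        }
    }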
FbEventScraper.java
@@ -5,6 +5,7 @@ import android.util.Log;
 import org.json.JSONException;
 import org.json.JSONObject;
+import org.jsoup.HttpStatusException;
 import org.jsoup.nodes.Document;
 
 import java.io.IOException;
@@ -146,9 +147,8 @@ public class FbEventScraper extends AsyncTask<Void, Void, Void> {
 
         Log.d("scraperLog", "doInBackground: "+url);
 
-        Document document = DocumentReceiver.getDocument(url);
-
         try {
+            Document document = DocumentReceiver.getDocument(url);
             if (document == null) {
                 throw new IOException();
             }
@@ -191,7 +191,10 @@ public class FbEventScraper extends AsyncTask<Void, Void, Void> {
 
             this.event = new FbEvent(url, name, start_date, end_date, description, location, image_url);
 
-        } catch (IOException e) {
+        } catch (HttpStatusException e) {
+            this.error = R.string.error_url;
+        }
+        catch (IOException e) {
             e.printStackTrace();
             this.error = R.string.error_connection;
         } catch (Exception e) {
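One detail in the new catch chain is load-bearing: org.jsoup.HttpStatusException extends java.io.IOException, so it must be caught before the plain IOException branch; with the order reversed, the HttpStatusException clause would be unreachable and the file would not compile. A self-contained illustration of the same mapping (class and strings hypothetical, mirroring error_url/error_connection):

    import org.jsoup.HttpStatusException;

    import java.io.IOException;

    public class CatchOrderDemo {
        // HTTP status failures mean an invalid page; any other I/O failure
        // is treated as a connection problem.
        static String classify(IOException e) {
            if (e instanceof HttpStatusException) {
                return "error_url";    // subclass checked first
            }
            return "error_connection";
        }

        public static void main(String[] args) {
            System.out.println(classify(new HttpStatusException(
                    "Not Found", 404, "https://mbasic.facebook.com/nosuchpage")));
            System.out.println(classify(new IOException("timeout")));
        }
    }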
FbPageScraper.java
@@ -5,6 +5,7 @@ import android.os.AsyncTask;
 
 import androidx.preference.PreferenceManager;
 
+import org.jsoup.HttpStatusException;
 import org.jsoup.nodes.Document;
 
 import java.io.IOException;
@@ -95,7 +96,9 @@ public class FbPageScraper extends AsyncTask<Void, Void, Void> {
                     url = null;
                     event_links = event_links.subList(0, max);
                 }
 
+            } catch (HttpStatusException e) {
+                this.error = R.string.error_url;
+                return null;
             } catch (IOException e) {
                 e.printStackTrace();
                 this.error = R.string.error_connection;
FbScraper.java
@@ -264,6 +264,17 @@ public class FbScraper {
             url_type = url_type_enum.PAGE;
             scrapePage(page_url);
 
+            return;
+
         } catch (URISyntaxException | MalformedURLException e) {
             url_type = url_type_enum.INVALID;
+        }
+
+        // check if only page name without prefix
+        try {
+            String page_url = getPageUrl("https://mbasic.facebook.com/"+input_url);
+            url_type = url_type_enum.PAGE;
+            scrapePage(page_url);
+
+        } catch (URISyntaxException | MalformedURLException e) {
+            url_type = url_type_enum.INVALID;
             main.get().input_helper(main.get().getString(R.string.error_url), true);
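This last hunk is the fallback promised in the commit message: input that parses as neither an event nor a page URL is retried as a bare page name with the mbasic host prepended, and only a later 404 from DocumentReceiver declares it invalid. A standalone sketch of just that resolution step; getPageUrl and scrapePage from the app are not reproduced, so the demo simply returns the URL it would hand to the scraper:

    import java.net.MalformedURLException;
    import java.net.URISyntaxException;
    import java.net.URL;

    public class InputFallbackDemo {
        // A well-formed URL is scraped as-is; anything else is treated as a
        // bare page name. Whether that name exists is decided later, when
        // DocumentReceiver gets a 404 for the built URL.
        static String resolve(String input_url) {
            try {
                new URL(input_url).toURI();
                return input_url;
            } catch (MalformedURLException | URISyntaxException e) {
                return "https://mbasic.facebook.com/" + input_url;
            }
        }

        public static void main(String[] args) {
            System.out.println(resolve("https://mbasic.facebook.com/somepage")); // unchanged
            System.out.println(resolve("somepage")); // hypothetical bare name gets the prefix
        }
    }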