Fix bug where cookies must be accepted before mbasic scraping works

This commit is contained in:
akaessens 2021-03-14 20:13:45 +01:00
parent 2efaafa38b
commit 08c1040679
3 changed files with 58 additions and 9 deletions

View File

@ -0,0 +1,53 @@
package com.akdev.nofbeventscraper;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class DocumentReceiver {

    /**
     * Fetches and parses the document at the given URL, transparently
     * submitting the cookie-consent form if one is presented
     * (required for scraping mbasic.facebook.com).
     *
     * @param url URL to fetch (expected to be an mbasic.facebook.com page)
     * @return the parsed document (post-consent page when a consent form was
     *         found and submitted), or null if the initial request fails
     */
    public static org.jsoup.nodes.Document getDocument(String url) {
        org.jsoup.nodes.Document document;
        try {
            // use default android user agent
            String user_agent = "Mozilla/5.0 (X11; Linux x86_64)";
            Connection connection = Jsoup.connect(url).userAgent(user_agent).followRedirects(true);
            Connection.Response response = connection.execute();
            document = response.parse();

            try {
                // If a cookie-consent form is present, submit it and use the
                // resulting page instead.
                Element form = document.select("form[method=post]").first();
                if (form != null) {
                    String action = form.attr("action");

                    // Pair each input's name with its value element-by-element.
                    // Using eachAttr("name") / eachAttr("value") is unsafe:
                    // eachAttr skips elements missing the attribute, so the two
                    // lists can misalign (wrong pairs / index out of bounds)
                    // when an <input> has no value attribute.
                    Map<String, String> data = new HashMap<String, String>();
                    for (Element input : form.select("input")) {
                        data.put(input.attr("name"), input.attr("value"));
                    }

                    document = connection.url("https://mbasic.facebook.com" + action)
                            .cookies(response.cookies())
                            .method(Connection.Method.POST)
                            .data(data)
                            .post();
                }
            } catch (Exception ignored) {
                // Best effort: if consent submission fails, fall back to the
                // originally fetched page rather than failing the whole request.
            }
        } catch (Exception e) {
            return null;
        }
        return document;
    }
}

View File

@ -4,7 +4,6 @@ import android.os.AsyncTask;
import org.json.JSONException; import org.json.JSONException;
import org.json.JSONObject; import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import java.io.IOException; import java.io.IOException;
@ -144,11 +143,9 @@ public class FbEventScraper extends AsyncTask<Void, Void, Void> {
@Override @Override
protected Void doInBackground(Void... voids) { protected Void doInBackground(Void... voids) {
try { Document document = DocumentReceiver.getDocument(url);
// use default android user agent
String user_agent = "Mozilla/5.0 (X11; Linux x86_64)";
Document document = Jsoup.connect(url).userAgent(user_agent).get();
try {
if (document == null) { if (document == null) {
throw new IOException(); throw new IOException();
} }

View File

@ -5,7 +5,6 @@ import android.os.AsyncTask;
import androidx.preference.PreferenceManager; import androidx.preference.PreferenceManager;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import java.io.IOException; import java.io.IOException;
@ -51,8 +50,8 @@ public class FbPageScraper extends AsyncTask<Void, Void, Void> {
do { do {
try { try {
// use default android user agent // use default android user agent
String user_agent = "Mozilla/5.0 (X11; Linux x86_64)";
Document document = Jsoup.connect(url).userAgent(user_agent).get(); Document document = DocumentReceiver.getDocument(url);
if (document == null) { if (document == null) {
throw new IOException(); throw new IOException();
@ -80,7 +79,7 @@ public class FbPageScraper extends AsyncTask<Void, Void, Void> {
int max = shared_prefs.getInt("page_event_max", 5); int max = shared_prefs.getInt("page_event_max", 5);
if (event_links.size() < max) { if (event_links.size() < max) {
// find "next page // find next page
try { try {
String next_url = document String next_url = document
.getElementsByAttributeValueMatching("href", "has_more=1") .getElementsByAttributeValueMatching("href", "has_more=1")