ESP8266: web scraping Google Maps

Hi there,

I tried to scrape Google Maps from my ESP8266, without success.
The same request works with Python on my PC, but on my NodeMCU there doesn't seem to be enough buffer space to hold such a big string.

Here is my code:


#include <ESP8266WiFi.h>


// Wi-Fi credentials passed to WiFi.begin() in setup(); placeholders to be replaced.
const char* ssid = "mySSID";
const char* password = "myPassword";

// Server name used both for the TLS connection and for the HTTP "Host:" header in loop().
const char* host = "www.google.com"; 

// Request target (path + query) placed on the GET request line in loop().
String url = "/maps/dir/Holsten's+Ice+Cream,+Chocolate+%26+Restaurant/Hilton+Meadowlands,+Two+Meadowlands+Plaza,+East+Rutherford,+NJ+07073,+United+States/@40.7980681,-74.1696393,14z/data=!4m14!4m13!1m5!1m1!1s0x89c2ff8eccccef2b:0x92a0fedfb95721e5!2m2!1d-74.1867349!2d40.8283166!1m5!1m1!1s0x89c257d534f587a9:0xc1913bca390e84f9!2m2!1d-74.0781257!2d40.8052015!3e3?entry=ttu";

// Brings up the serial console, joins the configured Wi-Fi network and
// reports the address that was assigned. Does not return until the
// station reports WL_CONNECTED.
void setup() {
  Serial.begin(115200);
  delay(10);

  // Two blank lines to separate our output from whatever was printed before.
  Serial.println();
  Serial.println();
  Serial.print("Connecting to ");
  Serial.println(ssid);

  WiFi.begin(ssid, password);

  // Poll twice a second, printing a progress dot for every poll that is
  // still not connected.
  for (;;) {
    if (WiFi.status() == WL_CONNECTED) {
      break;
    }
    delay(500);
    Serial.print(".");
  }

  Serial.println("");
  Serial.println("WiFi connected");
  Serial.println("IP address: ");
  Serial.println(WiFi.localIP());
}

// Run-once guard: stays 0 until loop() has started its single request.
int value = 0;

// Performs one HTTPS GET of `url` on `host` and dumps the response to the
// serial console, then does nothing on every later call.
//
// For each '\r'-terminated piece of the response it prints the first 200
// characters followed by the full length of that piece.
//
// The request is attempted only once: `value` is incremented before the
// connection attempt, so a failed connection is not retried.
void loop() {
  if (value != 0) {
    return;
  }
  ++value;

  Serial.print("connecting to ");
  Serial.println(host);

  // TLS client; setInsecure() disables certificate verification, so the
  // server's identity is NOT checked.
  WiFiClientSecure client;
  client.setInsecure();
  const int httpPort = 443;
  if (!client.connect(host, httpPort)) {
    Serial.println("connection failed");
    delay(5000);
    return;
  }

  // give the esp a breather
  yield();

  Serial.print("Requesting URL: ");
  Serial.println(url);

  // Send the request. On the wire it looks like:
  //   GET <url> HTTP/1.1
  //   Host: <host>
  //   User-Agent: <browser-like string>
  //   Connection: close
  //   <blank line>
  client.print(
              String("GET ") + url + " HTTP/1.1\r\n" +  // Request line
              "Host: " + host + "\r\n" +
              "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:100.0) Gecko/20100101 Firefox/100.0"+ "\r\n" +
              "Connection: close\r\n" +  // Ask the server to close the socket when it is done.
              "\r\n"                     // Blank line ends the header.
              );

  // Read until the server closes the connection and the receive buffer is
  // drained. Checking only client.available() would end the loop the first
  // time the buffer happens to be empty, even though more data is still in
  // flight, which truncates large pages.
  const int idleStepMs = 10;
  const int idleTimeoutMs = 10000;  // Give up after this long without any data.
  int idleMs = 0;
  while (client.connected() || client.available()) {
    if (!client.available()) {
      if (idleMs >= idleTimeoutMs) {
        Serial.println("response timed out");
        break;
      }
      delay(idleStepMs);  // Waiting for more data to arrive.
      idleMs += idleStepMs;
      continue;
    }
    idleMs = 0;

    // NOTE(review): readStringUntil() collects the whole piece into a String
    // before returning; pages with very long lines may exhaust the heap.
    // Reading fixed-size chunks would bound memory use - confirm on target.
    String line = client.readStringUntil('\r');  // carriage return as delimiter
    Serial.println(line.substring(0, 200));      // at most the first 200 characters
    Serial.println(line.length());
    yield();  // Let the ESP core service Wi-Fi between reads.
  }
  client.stop();

  Serial.println();
  Serial.println("closing connection");

  // Kept from the original sketch; the request itself is never repeated
  // because of the run-once guard above.
  delay(10000);
}

Serial log:

HTTP/1.1 200 OK
03:26:04.935 -> 15
03:26:04.935 -> 
03:26:04.935 -> Date: Sun, 24 Dec 2023 10:26:00 GMT
03:26:04.935 -> 36
...
03:26:28.684 -> 5
03:26:28.844 -> 
03:26:28.844 -> line-height:29px;min-width:54px;padding:0 8px;text-align:center;text-decoration:none!important;border-radius:2px;-moz-user-select:-moz-none}.gbqfba:focus{border:1px solid #4d90fe;outline:none;box-sha
03:26:28.844 -> 6175
03:26:28.844 -> 
03:26:28.844 -> 1413
03:26:28.844 -> 5
03:26:29.005 -> 
03:26:29.005 -> tant;padding-top:0;position:relative;top:310px}.gbqfh #gbqf{margin:auto;min-width:534px;padding:0 !important}.gbqfh #gbqfbw{display:none}.gbqfh #gbqfbwa{display:block}.gbqfh #gbqf{max-width:${elastic
03:26:29.005 -> 5140
03:26:29.005 -> 
03:26:29.005 -> 0
03:26:29.005 -> 2
03:26:29.005 -> 
03:26:29.005 -> 
03:26:29.005 -> 1
03:26:33.995 -> 
03:26:33.995 -> 
03:26:33.995 -> 1
03:26:33.995 -> 
03:26:33.995 -> closing connection

I'm just trying to extract the next coming bus/train info:
image

I'm sure there's an API for that, but in the past I've often received emails from Google saying I need to update this or that, months or years after I've finished a project. It's tiring, so I try to avoid Google APIs if possible.

Is it possible to scrape a heavy HTML page like Google Maps from my ESP?

Thanks

I quickly tried your link with Postman, and I suppose the problem is due to cookies.
Once you have accepted the consent disclaimer in your browser, the cookies remain stored on your PC, and therefore the link works as you expect.
With the ESP this doesn't happen, because it never accepts the disclaimer and stores no cookies.

I think you have no choice: you have to use the Google APIs.

Okay, I'll check the Google API. Thanks for testing.
Another reason I wanted to have a working script is that I sometimes need to extract data from a website that doesn't have an API. Some lightweight websites work, but heavy websites don't.

Maybe there is a way to use Google Maps with a cookieless approach, or to scrape another website that embeds Google Maps?

This topic was automatically closed 180 days after the last reply. New replies are no longer allowed.