How to Scrape Google Reviews With Requests and BeautifulSoup
Scraping Google Reviews is notoriously difficult. To start, Google Reviews are only accessible via Google Maps. On top of that, Google uses dynamic CSS selectors, the data gets loaded dynamically, and all of it is incredibly nested.
Today, we'll learn how to crawl Google Maps and then scrape the reviews for each business found in our crawl. This information is incredibly useful, especially when you want to collect aggregate data on different businesses.
- TLDR - How to Scrape Google Reviews
- How To Architect Our Google Reviews Scraper
- Understanding How To Scrape Google Reviews
- Setting Up Our Google Reviews Scraper
- Build A Google Reviews Search Crawler
- Build A Google Reviews Scraper
- Legal and Ethical Considerations
- Conclusion
- More Cool Articles
TLDR - How to Scrape Google Reviews
To scrape Google Reviews, we need to crawl Google Maps and create a list of businesses. Then, we need to look up each business from the list and save its reviews. The code below does exactly this. To get started:
- Create a new project folder with a `config.json` file.
- Then add your ScrapeOps API key to the file: `{"api_key": "your-super-secret-api-key"}`.
- Copy/paste the code below into a new Python file and you're good to go!
- Run it with `python name_of_your_python_file.py`.
import os
import re
import csv
import requests
import json
import logging
import time
from urllib.parse import urlencode
from bs4 import BeautifulSoup
import concurrent.futures
from dataclasses import dataclass, field, fields, asdict
API_KEY = ""
with open("config.json", "r") as config_file:
config = json.load(config_file)
API_KEY = config["api_key"]
def get_scrapeops_url(url, location="us"):
payload = {
"api_key": API_KEY,
"url": url,
"country": location,
"wait": 5000,
}
proxy_url = "https://proxy.scrapeops.io/v1/?" + urlencode(payload)
return proxy_url
## Logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
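# Holds one business result from the Google Maps crawl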
@dataclass
class SearchData:
name: str = ""
stars: float = 0
url: str = ""
    rating_count: int = 0
def __post_init__(self):
self.check_string_fields()
def check_string_fields(self):
for field in fields(self):
# Check string fields
if isinstance(getattr(self, field.name), str):
# If empty set default text
if getattr(self, field.name) == "":
setattr(self, field.name, f"No {field.name}")
continue
# Strip any trailing spaces, etc.
value = getattr(self, field.name)
setattr(self, field.name, value.strip())
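# Holds one review scraped from an individual business page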
@dataclass
class ReviewData:
name: str = ""
stars: int = 0
time_left: str = ""
review_shortened: str = ""
def __post_init__(self):
self.check_string_fields()
def check_string_fields(self):
for field in fields(self):
# Check string fields
if isinstance(getattr(self, field.name), str):
# If empty set default text
if getattr(self, field.name) == "":
setattr(self, field.name, f"No {field.name}")
continue
# Strip any trailing spaces, etc.
value = getattr(self, field.name)
setattr(self, field.name, value.strip())
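# Deduplicates records by name and writes them to a CSV file in batches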
class DataPipeline:
def __init__(self, csv_filename="", storage_queue_limit=50):
self.names_seen = []
self.storage_queue = []
self.storage_queue_limit = storage_queue_limit
self.csv_filename = csv_filename
self.csv_file_open = False
def save_to_csv(self):
self.csv_file_open = True
data_to_save = []
data_to_save.extend(self.storage_queue)
self.storage_queue.clear()
if not data_to_save:
return
keys = [field.name for field in fields(data_to_save[0])]
file_exists = os.path.isfile(self.csv_filename) and os.path.getsize(self.csv_filename) > 0
with open(self.csv_filename, mode="a", newline="", encoding="utf-8") as output_file:
writer = csv.DictWriter(output_file, fieldnames=keys)
if not file_exists:
writer.writeheader()
for item in data_to_save:
writer.writerow(asdict(item))
self.csv_file_open = False
def is_duplicate(self, input_data):
if input_data.name in self.names_seen:
logger.warning(f"Duplicate item found: {input_data.name}. Item dropped.")
return True
self.names_seen.append(input_data.name)
return False
def add_data(self, scraped_data):
if self.is_duplicate(scraped_data) == False:
self.storage_queue.append(scraped_data)
if len(self.storage_queue) >= self.storage_queue_limit and self.csv_file_open == False:
self.save_to_csv()
def close_pipeline(self):
if self.csv_file_open:
time.sleep(3)
if len(self.storage_queue) > 0:
self.save_to_csv()
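# Crawl a Google Maps search page for one locality and parse each business card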
def scrape_search_results(keyword, location, locality, data_pipeline=None, retries=3):
formatted_keyword = keyword.replace(" ", "+")
url = f"https://www.google.com/maps/search/{formatted_keyword}/@{locality},14z/data=!3m1!4b1?entry=ttu"
tries = 0
success = False
while tries <= retries and not success:
try:
scrapeops_proxy_url = get_scrapeops_url(url, location=location)
response = requests.get(scrapeops_proxy_url)
logger.info(f"Recieved [{response.status_code}] from: {url}")
if response.status_code != 200:
raise Exception(f"Failed request, Status Code {response.status_code}")
soup = BeautifulSoup(response.text, "html.parser")
business_links = soup.select("div div a")
excluded_words = ["Sign in"]
for business_link in business_links:
name = business_link.get("aria-label")
if not name or name in excluded_words or "Visit" in name:
continue
maps_link = business_link.get("href")
full_card = business_link.parent
rating_holder = full_card.select_one("span[role='img']")
rating = 0.0
rating_count = 0
if rating_holder:
rating_array = rating_holder.text.split("(")
rating = rating_array[0]
rating_count = int(rating_array[1].replace(")", "").replace(",", ""))
search_data = SearchData(
name=name,
stars=rating,
url=maps_link,
rating_count=rating_count
)
data_pipeline.add_data(search_data)
success = True
logger.info(f"Successfully parsed data from: {url}")
except Exception as e:
logger.error(f"An error occurred while processing page {url}: {e}")
logger.info(f"Retrying request for page: {url}, retries left {retries-tries}")
tries += 1
if not success:
raise Exception(f"Max Retries exceeded: {retries}")
def start_scrape(keyword, location, localities, data_pipeline=None, max_threads=5, retries=3):
with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
executor.map(
scrape_search_results,
[keyword] * len(localities),
[location] * len(localities),
localities,
[data_pipeline] * len(localities),
[retries] * len(localities)
)
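# Fetch one business page and save its reviews to a separate CSV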
def process_business(row, location, retries=3):
url = row["url"]
tries = 0
success = False
    while tries <= retries and not success:
        try:
            response = requests.get(get_scrapeops_url(url, location=location))
            if response.status_code == 200:
logger.info(f"Status: {response.status_code}")
soup = BeautifulSoup(response.text, "html.parser")
main_card = soup.select_one("div[role='main']")
info_cards = soup.find_all("div", class_="MyEned")
review_pipeline = DataPipeline(csv_filename=f"{row['name']}.csv")
for card in info_cards:
review = card.text
full_card = card.parent.parent.parent.parent
reviewer_button = full_card.find("button")
name = reviewer_button.get("aria-label").replace("Photo of ", "")
rating_tag = full_card.select_one("span[role='img']")
stars = int(rating_tag.get("aria-label").replace(" stars", "").replace(" star", ""))
review_date = rating_tag.parent.find_all("span")[-1].text
review_data = ReviewData(
name=name,
stars=stars,
time_left=review_date,
review_shortened=review
)
review_pipeline.add_data(review_data)
review_pipeline.close_pipeline()
success = True
else:
logger.warning(f"Failed Response: {response.status_code}")
raise Exception(f"Failed Request, status code: {response.status_code}")
except Exception as e:
logger.error(f"Exception thrown: {e}")
logger.warning(f"Failed to process page: {row['url']}")
logger.warning(f"Retries left: {retries-tries}")
tries += 1
if not success:
raise Exception(f"Max Retries exceeded: {retries}")
else:
logger.info(f"Successfully parsed: {row['url']}")
def process_results(csv_file, location, max_threads=5, retries=3):
logger.info(f"processing {csv_file}")
with open(csv_file, newline="") as file:
reader = list(csv.DictReader(file))
with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
executor.map(
process_business,
reader,
[location] * len(reader),
[retries] * len(reader)
)
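# Runtime configuration and job execution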
if __name__ == "__main__":
MAX_RETRIES = 3
MAX_THREADS = 5
LOCATION = "us"
LOCALITIES = ["42.3,-83.5"]
logger.info(f"Crawl starting...")
## INPUT ---> List of keywords to scrape
keyword_list = ["restaurant"]
aggregate_files = []
## Job Processes
for keyword in keyword_list:
filename = keyword.replace(" ", "-")
crawl_pipeline = DataPipeline(csv_filename=f"{filename}.csv")
start_scrape(keyword, LOCATION, LOCALITIES, data_pipeline=crawl_pipeline, max_threads=MAX_THREADS, retries=MAX_RETRIES)
crawl_pipeline.close_pipeline()
aggregate_files.append(f"{filename}.csv")
logger.info(f"Crawl complete.")
for file in aggregate_files:
process_results(file, LOCATION, max_threads=MAX_THREADS, retries=MAX_RETRIES)
To customize your results, change any of the following:
- `MAX_RETRIES`: the maximum number of retries for a parse.
- `MAX_THREADS`: how many threads you'd like to use when parsing pages simultaneously.
- `LOCATION`: the location you'd like to appear from.
- `LOCALITIES`: the areas of the map you'd like to scrape, added as latitude and longitude pairs.
- `keyword_list`: the keywords you'd like to search the map for.
When you change your localities, you need to use latitude and longitude pairs.
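For example, a quick tweak of these settings might look like the sketch below. The keywords and coordinates here are purely illustrative assumptions, not values taken from the original script:

```python
# Illustrative configuration only -- swap in your own values.
MAX_RETRIES = 3
MAX_THREADS = 4
LOCATION = "us"                                   # country to appear from
LOCALITIES = ["40.73,-73.93", "34.05,-118.24"]    # latitude,longitude pairs
keyword_list = ["coffee shop", "bakery"]          # one crawl (and one CSV) per keyword
```

Each keyword produces its own crawl CSV (spaces become dashes in the filename), and each locality is searched in its own thread.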
How To Architect Our Google Reviews Scraper
As mentioned above, to get Google Reviews, we need to crawl Google Maps. Maps itself is really tricky to scrape. We need a locality (latitude and longitude), and we need to wait for dynamic content to load on the screen before getting our result.
If our proxy server properly loads our page, we can extract the following information for each business in the search: `name`, `stars`, `url`, and `rating_count`. We'll save these to a CSV file, and our review scraper will then go through and find reviews for each of these businesses.
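As a rough illustration, each parsed business becomes a `SearchData` object, and `asdict()` is what turns it into the row the pipeline writes to the CSV. This assumes the `SearchData` dataclass from the script above, and the values are made up:

```python
from dataclasses import asdict

# Hypothetical values purely for illustration; SearchData comes from the script above.
business = SearchData(
    name="Example Diner",
    stars=4.5,
    url="https://www.google.com/maps/place/example",
    rating_count=1280
)
print(asdict(business))
# {'name': 'Example Diner', 'stars': 4.5, 'url': 'https://www.google.com/maps/place/example', 'rating_count': 1280}
```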
Our Maps crawler will need to do the following:
- Look up businesses in a certain locality and parse the results (see the URL sketch after this list).
- Store the parsed data inside a CSV file.
- Concurrently parse multiple localities at once.
- Integrate with a proxy to get past Google's anti-bot systems.
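To make the first step concrete, here's a minimal sketch of how the crawler builds its Google Maps search URL from a keyword and a locality. The keyword, coordinates, and `14z` zoom level simply mirror the example configuration above; other values work too:

```python
keyword = "restaurant"       # search term; spaces get encoded as '+'
locality = "42.3,-83.5"      # latitude,longitude pair from LOCALITIES

formatted_keyword = keyword.replace(" ", "+")
# Same URL shape used by scrape_search_results(); 14z sets the map zoom level.
url = (
    f"https://www.google.com/maps/search/{formatted_keyword}"
    f"/@{locality},14z/data=!3m1!4b1?entry=ttu"
)
print(url)
```

This URL then gets routed through the ScrapeOps proxy (with `wait` set so the dynamic content has time to render) before we parse the response with BeautifulSoup.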
After our crawl, the Reviews scraper needs to perform these tasks:
- Read the CSV from the crawl into an array (see the sketch after this list).
- Parse reviews from each business extracted during the crawl.
- Store review data for each business.
- Concurrently parse and store this data.
- Use proxy integration to get past anything that might block us.
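Here's a minimal sketch of that flow, reusing `process_business()` and the `restaurant.csv` output from the script above; the thread count and retry values are just the defaults used earlier:

```python
import csv
import concurrent.futures

# Step 1: read the crawl results into an array of dicts.
with open("restaurant.csv", newline="") as file:
    rows = list(csv.DictReader(file))

# Steps 2-5: each thread fetches one business page through the proxy
# and writes that business's reviews to its own CSV.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    executor.map(process_business, rows, ["us"] * len(rows), [3] * len(rows))
```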