How to Scrape Amazon With Puppeteer
Amazon is the largest online retailer in the world and one of the largest retailers overall. If you're looking for something online, you'll probably check Amazon first.
Amazon offers a wealth of information ranging from product details and pricing to customer reviews and market trends.
In this guide, we'll take you through how to scrape Amazon using Puppeteer.
- [TLDR - How to Scrape Amazon](#tldr---how-to-scrape-amazon)
- How To Architect Our Scraper
- Understanding How To Scrape Amazon
- Setting Up Our Amazon Scraper
- Build a Search Results Crawler
- Build a Product Parser
- Legal and Ethical Considerations
- Conclusion
- More Cool Articles
TLDR - How to Scrape Amazon
If you are looking for a production-ready Amazon scraper, follow the script below:
const puppeteer = require("puppeteer");
const createCsvWriter = require("csv-writer").createObjectCsvWriter;
const csvParse = require("csv-parse");
const fs = require("fs");
const API_KEY = "YOUR-SUPER-SECRET-API-KEY";
async function writeToCsv(data, outputFile) {
if (!data || data.length === 0) {
throw new Error("No data to write!");
}
const fileExists = fs.existsSync(outputFile);
const headers = Object.keys(data[0]).map(key => ({id: key, title: key}))
const csvWriter = createCsvWriter({
path: outputFile,
header: headers,
append: fileExists
});
try {
await csvWriter.writeRecords(data);
} catch (e) {
throw new Error("Failed to write to csv");
}
}
async function readCsv(inputFile) {
const results = [];
const parser = fs.createReadStream(inputFile).pipe(csvParse.parse({
columns: true,
delimiter: ",",
trim: true,
skip_empty_lines: true
}));
for await (const record of parser) {
results.push(record);
}
return results;
}
function getScrapeOpsUrl(url, location="us") {
const params = new URLSearchParams({
api_key: API_KEY,
url: url,
country: location
});
return `https://proxy.scrapeops.io/v1/?${params.toString()}`;
}
async function resultCrawl(browser, productName, pageNumber, location="us", retries=3) {
let tries = 0;
let success = false;
while (tries < retries && !success) {
const page = await browser.newPage();
try {
const url = `https://www.amazon.com/s?k=${productName}&page=${pageNumber}`;
const proxyUrl = getScrapeOpsUrl(url, location);
console.log(proxyUrl);
await page.goto(proxyUrl);
console.log(`Successfully fetched page: ${pageNumber}`);
const badDivs = await page.$$("div.AdHolder");
for (const div of badDivs) {
await page.evaluate(element => {
element.parentNode.removeChild(element);
}, div);
}
const divs = await page.$$("div > span > div");
console.log(`Div count: ${divs.length}`);
let lastTitle = "";
for (const div of divs) {
const h2 = await div.$("h2");
if (h2 === null) {
continue;
}
const a = await h2.$("a");
const parsable = h2 !== null && a !== null;
if (parsable) {
const title = await page.evaluate(element => element.textContent, h2);
if (title === lastTitle) {
continue;
}
console.log(`Title: ${title}`);
const productUrl = await page.evaluate(a => {
const url = a.getAttribute("href");
if (url.includes("https")) {
return url;
} else {
return `https://www.amazon.com${url}`;
}
}, a);
console.log(`Product url: ${productUrl}`);
const adStatus = productUrl.includes("sspa");
console.log(`Ad Status: ${adStatus}`);
const urlArray = productUrl.split("/");
const asin = urlArray[urlArray.length-2];
console.log(`Asin: ${asin}`);
const pricingUnit = await div.$("span.a-price-symbol");
const wholePrice = await div.$("span.a-price-whole");
const decimalPrice = await div.$("span.a-price-fraction");
if (pricingUnit === null || wholePrice === null || decimalPrice === null) {
console.log("Failed to find price!");
continue;
}
const priceSymbol = await page.evaluate(pricingUnit => pricingUnit.textContent, pricingUnit);
const wholeNumber = await page.evaluate(wholePrice => wholePrice.textContent, wholePrice);
const decimalNumber = await page.evaluate(decimalPrice => decimalPrice.textContent, decimalPrice)
const formattedWholeNumber = wholeNumber.replace(",", "").replace(".", "");
const price = Number(`${formattedWholeNumber}.${decimalNumber}`);
const realPricePresence = await div.$("span.a-price.a-text-price span");
let realPrice = 0.0;
if (realPricePresence !== null) {
const realPriceStr = await page.evaluate(realPricePresence => realPricePresence.textContent, realPricePresence);
realPrice = Number(realPriceStr.replace(priceSymbol, ""));
} else {
realPrice = price;
}
let rating = "n/a";
const ratingPresence = await div.$("span.a-icon-alt");
if (ratingPresence !== null) {
rating = await page.evaluate(ratingPresence => ratingPresence.textContent, ratingPresence);
}
const item = {
asin: asin,
title: title,
url: productUrl,
is_ad: adStatus,
pricing_unit: priceSymbol,
price: price,
real_price: realPrice,
rating: rating
}
await writeToCsv([item], `${productName}.csv`);
console.log("Item:", item);
lastTitle = title;
}
}
success = true;
} catch (err) {
console.log(`ERROR: ${err}, PAGE ${pageNumber}`);
tries++;
} finally {
await page.close();
if (success) {
console.log(`Finished scraping page: ${pageNumber}`);
}
}
}
}
function range(start, end) {
const array = [];
for (let i=start; i<end; i++) {
array.push(i);
}
return array;
}
async function concurrentCrawl(browser, query, pages, concurrencyLimit, location="us", retries=3) {
console.log("Concurrent crawl started");
const pageList = range(1, pages+1);
while (pageList.length > 0) {
const currentBatch = pageList.splice(0, concurrencyLimit);
const tasks = currentBatch.map(page => resultCrawl(browser, query, page, location, retries));
try {
await Promise.all(tasks);
} catch (e) {
console.log(`Failed to process batch: ${e}`);
}
}
console.log("Concurrent crawl finished");
}
async function parseProduct(browser, productObject, location="us", retries=3) {
const productUrl = productObject.url;
const proxyUrl = getScrapeOpsUrl(productUrl, location);
console.log("Proxy url:", proxyUrl);
let tries = 0;
let success = false;
const urlArray = productUrl.split("/");
const title = urlArray[urlArray.length-4];
const asin = urlArray[urlArray.length-2];
while (tries <= retries && !success) {
const page = await browser.newPage();
try {
await page.goto(proxyUrl, {timeout: 60000});
const imagesToSave = [];
const features = [];
const images = await page.$$("li img");
for (const image of images) {
const imageLink = await page.evaluate(element => element.getAttribute("src"), image);
if (imageLink.includes("https://m.media-amazon.com/images/I/")) {
imagesToSave.push(imageLink);
}
}
const featureBullets = await page.$$("li.a-spacing-mini");
for (const feature of featureBullets) {
const span = await feature.$("span");
const text = await page.evaluate(span => span.textContent, span);
if (!features.includes(text)) {
features.push(text);
}
}
const priceSymbolElement = await page.$("span.a-price-symbol");
const priceWholeElement = await page.$("span.a-price-whole");
const priceDecimalElement = await page.$("span.a-price-fraction");
const priceSymbol = await page.evaluate(element => element.textContent, priceSymbolElement);
const priceWhole = (await page.evaluate(element => element.textContent, priceWholeElement)).replace(",", "").replace(".", "");
const priceDecimal = await page.evaluate(element => element.textContent, priceDecimalElement);
const price = Number(`${priceWhole}.${priceDecimal}`);
if (imagesToSave.length > 0) {
const item = {
asin: asin,
title: title,
url: productUrl,
pricing_unit: priceSymbol,
price: price,
feature_1: features[0],
feature_2: features[1],
feature_3: features[2],
feature_4: features[3],
images_1: imagesToSave[0],
images_2: imagesToSave[1],
images_3: imagesToSave[2],
images_4: imagesToSave[3]
}
await writeToCsv([item], `${item.title}.csv`);
console.log("Wrote to csv");
success = true;
} else {
await page.screenshot({path: `ERROR-${title}.png`});
throw new Error("Failed to find item details!");
}
} catch (e) {
console.log("ERROR:", e);
await page.screenshot({path: "error.png", fullPage: true});
console.log(`Failed page, Tries left: ${retries-tries}`);
tries++;
} finally {
await page.close();
}
}
return;
}
async function concurrentProductScrape(browser, inputFile, concurrencyLimit, location="us", retries=3) {
const productObjects = await readCsv(inputFile);
while (productObjects.length > 0) {
const currentBatch = productObjects.splice(0, concurrencyLimit);
const tasks = currentBatch.map(productObject => parseProduct(browser, productObject, location, retries));
try {
await Promise.all(tasks);
} catch (e) {
console.log("Failed to process batch");
}
}
}
async function main() {
const PRODUCTS = ["phone"];
const MAX_RETRIES = 4;
const PAGES = 5;
const CONCURRENCY_LIMIT = 4;
const LOCATION = "us";
for (const product of PRODUCTS) {
const browser = await puppeteer.launch();
const fileName = `./${product}.csv`;
await concurrentCrawl(browser, product, PAGES, CONCURRENCY_LIMIT, LOCATION, MAX_RETRIES);
await concurrentProductScrape(browser, fileName, CONCURRENCY_LIMIT, LOCATION, MAX_RETRIES);
await browser.close();
}
}
main();
The code above gives you a production ready Selenium scraper for Amazon... fully integrated with the ScrapeOps Proxy API!
To change your results, simply change your constants. If you only want detailed results from one page of a search, change `PAGES` to 1. If you wish to run with 10 concurrent pages, change `CONCURRENCY_LIMIT` to 10... but use caution with this one: each concurrent task opens up another page through the proxy, and the ScrapeOps proxy does have a concurrency limit.
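For example, a configuration like the sketch below (the values here are purely illustrative) would crawl the first page of results for two different search terms with 10 pages in flight at once:

```javascript
// Illustrative configuration only -- tune these to your machine and your ScrapeOps plan
const PRODUCTS = ["phone", "laptop"]; // one crawl + product scrape per search term
const MAX_RETRIES = 4;                // retry a failed page up to 4 times
const PAGES = 1;                      // only the first page of search results
const CONCURRENCY_LIMIT = 10;         // 10 pages at once -- stay within your plan's concurrency limit
const LOCATION = "us";                // country the ScrapeOps proxy routes you through
```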
How To Architect Our Amazon Scraper
When we scrape Amazon, we need to pull valuable data from both our search results and individual item pages. When we search on Amazon, we get a bunch of pages and each page has a bunch of results.
Each item in our search also has its own page containing specific details about the item. You can get a better feel for these things if you take a look at the images below.
Results Page
Our Results page holds most of the information we want to scrape such as the product name, sale price, real price, and rating.
Product Page
The Product page holds much of the information we already found on the Results page, and more. In our case specifically, we need the Product page because it holds the bullet points and images specific to the item we're looking at.
When we review our results, we find phones we're interested in. When we want to look at details for a specific phone, we look at the page for that phone.
Understanding How To Scrape Amazon
Before plunging head first into code, we're going to talk about how our scraper works on a high level. In this section, we're going over the required steps in greater detail. If you've got some experience in web scraping already, feel free to skip this section.
Step 1: How To Request Amazon Pages
Let's take a better look at the URL from the page we looked at earlier.
`https://www.amazon.com/s?k=phone` is the portion you really need to pay attention to.
- `https://www.amazon.com/` is our base URL.
- `s?` shows that we're performing a search query.
- `k=phone` tells the Amazon server that we want to look at phones.
Their server takes all this information from the URL and sends us back a page of phones.
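As a quick sketch, here's how that search URL can be assembled in JavaScript. The `buildSearchUrl()` helper is just for illustration; `encodeURIComponent()` guards against spaces or special characters in the search term:

```javascript
// Build an Amazon search URL for a given search term (illustrative helper)
function buildSearchUrl(productName) {
  return `https://www.amazon.com/s?k=${encodeURIComponent(productName)}`;
}

console.log(buildSearchUrl("phone"));
// https://www.amazon.com/s?k=phone
```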
Step 2: How To Extract Data From Amazon Pages
While some sites store their data conveniently in a JSON blob, Amazon does not. Amazon nests their data deeply within divs and spans. To extract our data, we need to pull it from these elements nested within the HTML.
Let's first take a look at the Results page. Below, you can see an item title with the inspect window open. If you look closely, you'll see the title text is nested within a `span` element.
Now, let's take a look at the Product page. Look closely here as well. Our feature bullets are actually `span` elements nested within `li` (list) elements.
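As a minimal sketch of what that extraction looks like with Puppeteer (assuming this runs inside an async function and `page` already has an Amazon page loaded; the selectors match the ones we use later in this guide):

```javascript
// Grab a result title: the text lives inside an h2 element
const h2 = await page.$("h2");
if (h2 !== null) {
  const title = await page.evaluate(element => element.textContent, h2);
  console.log(`Title: ${title}`);
}

// Grab feature bullets: span elements nested inside li elements
const bullets = await page.$$("li.a-spacing-mini span");
for (const bullet of bullets) {
  console.log(await page.evaluate(element => element.textContent, bullet));
}
```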
Step 3: How To Control Pagination
Controlling pagination is a pretty simple task. It just requires an additional parameter in our URL. When pagination is added in, our URL will look like this:
https://www.amazon.com/s?k={product_name}&page={page_number}
So if we want to search page 1 of phones, this would be our URL:
https://www.amazon.com/s?k=phone&page=1
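Here's a small sketch showing how those paginated URLs can be generated; the `paginatedUrls()` helper below is just for illustration:

```javascript
// Build search URLs for pages 1 through `pages` (illustrative helper)
function paginatedUrls(productName, pages) {
  const urls = [];
  for (let page = 1; page <= pages; page++) {
    urls.push(`https://www.amazon.com/s?k=${productName}&page=${page}`);
  }
  return urls;
}

console.log(paginatedUrls("phone", 3));
// [ '...&page=1', '...&page=2', '...&page=3' ]
```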
Step 4: Geolocated Data
Amazon does serve different content based on our location. If we're in the US, prices will be denoted in dollars ($). If we're in the UK, Amazon will give us our prices in pounds (GBP).
To control our location effectively, we'll be using the ScrapeOps Proxy API. The ScrapeOps API will route our traffic through servers in whichever country we ask for.
If we want to be in the UK, ScrapeOps will put us in the UK. If we want to be from the US, ScrapeOps will route us through servers in the US.
The ScrapeOps API is a perfect way to control your location because our requests are actually routed through the location we want.
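We'll build the full `getScrapeOpsUrl()` helper later in this guide. As a preview, the only thing that changes between locations is the `country` parameter we pass to the proxy (the API key below is a placeholder, and you should check the ScrapeOps docs for the exact country codes available on your plan):

```javascript
// Preview: the same Amazon URL routed through the ScrapeOps proxy with a chosen country
const params = new URLSearchParams({
  api_key: "YOUR-SUPER-SECRET-API-KEY",      // placeholder -- use your own key
  url: "https://www.amazon.com/s?k=phone",   // the page we actually want
  country: "us"                              // swap this value to change where traffic exits
});
console.log(`https://proxy.scrapeops.io/v1/?${params.toString()}`);
```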
Setting Up Our Amazon Scraper Project
Now that we know what we want to do, let's start building our scraper. First, we'll make a new project folder, then we'll initialize a new JavaScript project and install our dependencies.
Create a New Folder
mkdir amazon-scraper
From inside your new folder, initialize a new JavaScript project.
Create a JavaScript Project
npm init -y
Install Dependencies
npm install puppeteer
npm install csv-writer
npm install csv-parse
The fs module is built into Node.js, so there's nothing extra to install for it.
Build An Amazon Search Crawler
The first portion of our project will be spent building a crawler to scrape Amazon search results. This crawler will actually be grabbing the bulk of our data. This crawler needs to:
- parse results
- manage result batches using pagination
- store results from those pages
- search multiple pages concurrently
- integrate with a proxy for both location support and anti-bot resistance
Our ideal crawler will fetch a page. It will parse the information from the page to give us good results. Then it'll store those results in files for us to look at later. On top of all these things, it needs to use concurrency for speed and efficiency and it also needs to use a proxy so we don't get blocked.
Step 1: Create Simple Search Data Parser
Let's get started by creating a crawler that simply parses a Results page. Here is a scraper with a simple parsing function.
- The parsing function below first finds all the `div` elements on the page.
- Then it checks if each `div` is `parsable`. If the `div` is parsable, we find its `h2` and use its `textContent` as our `title`.
- We then extract the following from each listing:
  - `asin`
  - `title`
  - `url`
  - `is_ad`
  - `pricing_unit`
  - `price`
  - `real_price`
  - `rating`
const puppeteer = require("puppeteer");
const createCsvWriter = require("csv-writer").createObjectCsvWriter;
const csvParse = require("csv-parse");
const fs = require("fs");
const API_KEY = "YOUR-SUPER-SECRET-API-KEY";
async function resultCrawl(browser, productName, retries=3) {
let tries = 0;
let success = false;
while (tries < retries && !success) {
const page = await browser.newPage();
try {
const url = `https://www.amazon.com/s?k=${productName}`;
await page.goto(url);
console.log("Successfully fetched the page");
const badDivs = await page.$$("div.AdHolder");
for (const div of badDivs) {
await page.evaluate(element => {
element.parentNode.removeChild(element);
}, div);
}
const divs = await page.$$("div > span > div");
console.log(`Div count: ${divs.length}`);
let lastTitle = "";
for (const div of divs) {
const h2 = await div.$("h2");
if (h2 === null) {
continue;
}
const a = await h2.$("a");
const parsable = h2 !== null && a !== null;
if (parsable) {
const title = await page.evaluate(element => element.textContent, h2);
if (title === lastTitle) {
continue;
}
console.log(`Title: ${title}`);
const productUrl = await page.evaluate(a => {
const url = a.getAttribute("href");
if (url.includes("https")) {
return url;
} else {
return `https://www.amazon.com${url}`;
}
}, a);
console.log(`Product url: ${productUrl}`);
const adStatus = productUrl.includes("sspa");
console.log(`Ad Status: ${adStatus}`);
const urlArray = productUrl.split("/");
const asin = urlArray[urlArray.length-2];
console.log(`Asin: ${asin}`);
const pricingUnit = await div.$("span.a-price-symbol");
const wholePrice = await div.$("span.a-price-whole");
const decimalPrice = await div.$("span.a-price-fraction");
if (pricingUnit === null || wholePrice === null || decimalPrice === null) {
console.log("Failed to find price!");
continue;
}
const priceSymbol = await page.evaluate(pricingUnit => pricingUnit.textContent, pricingUnit);
const wholeNumber = await page.evaluate(wholePrice => wholePrice.textContent, wholePrice);
const decimalNumber = await page.evaluate(decimalPrice => decimalPrice.textContent, decimalPrice)
const formattedWholeNumber = wholeNumber.replace(",", "").replace(".", "");
const price = Number(`${formattedWholeNumber}.${decimalNumber}`);
const realPricePresence = await div.$("span.a-price.a-text-price span");
let realPrice = 0.0;
if (realPricePresence !== null) {
const realPriceStr = await page.evaluate(realPricePresence => realPricePresence.textContent, realPricePresence);
realPrice = Number(realPriceStr.replace(priceSymbol, ""));
} else {
realPrice = price;
}
let rating = "n/a";
const ratingPresence = await div.$("span.a-icon-alt");
if (ratingPresence !== null) {
rating = await page.evaluate(ratingPresence => ratingPresence.textContent, ratingPresence);
}
const item = {
asin: asin,
title: title,
url: productUrl,
is_ad: adStatus,
pricing_unit: priceSymbol,
price: price,
real_price: realPrice,
rating: rating
}
console.log("Item:", item);
lastTitle = title;
}
}
success = true;
} catch (err) {
console.log(`ERROR: ${err}`);
tries++;
} finally {
await page.close();
if (success) {
console.log("Finished scraping the page");
}
}
}
}
async function main() {
const PRODUCTS = ["phone"];
const MAX_RETRIES = 4;
for (const product of PRODUCTS) {
const browser = await puppeteer.launch();
await resultCrawl(browser, product, MAX_RETRIES)
await browser.close();
}
}
main();
If you run this example, you'll probably get blocked.
Amazon will likely continue to block us because our traffic looks abnormal to their servers. We'll address this later on when we add proxy support to our scraper.
Step 2: Add Pagination
Now that we can parse a page, let's add pagination into our parsing function. Pagination gives us the ability to control our result batches. If we want page 1, fetch page 1. If we want page 2, fetch page 2... and so on and so forth.
The code example below is almost exactly the same as before. The major difference: we have a `pageNumber` added to both our function arguments and our URL.
const puppeteer = require("puppeteer");
const createCsvWriter = require("csv-writer").createObjectCsvWriter;
const csvParse = require("csv-parse");
const fs = require("fs");
const API_KEY = "YOUR-SUPER-SECRET-API-KEY";
async function resultCrawl(browser, productName, pageNumber, retries=3) {
let tries = 0;
let success = false;
while (tries < retries && !success) {
const page = await browser.newPage();
try {
const url = `https://www.amazon.com/s?k=${productName}&page=${pageNumber}`;
await page.goto(url);
console.log(`Successfully fetched page: ${pageNumber}`);
const badDivs = await page.$$("div.AdHolder");
for (const div of badDivs) {
await page.evaluate(element => {
element.parentNode.removeChild(element);
}, div);
}
const divs = await page.$$("div > span > div");
console.log(`Div count: ${divs.length}`);
let lastTitle = "";
for (const div of divs) {
const h2 = await div.$("h2");
if (h2 === null) {
continue;
}
const a = await h2.$("a");
const parsable = h2 !== null && a !== null;
if (parsable) {
const title = await page.evaluate(element => element.textContent, h2);
if (title === lastTitle) {
continue;
}
console.log(`Title: ${title}`);
const productUrl = await page.evaluate(a => {
const url = a.getAttribute("href");
if (url.includes("https")) {
return url;
} else {
return `https://www.amazon.com${url}`;
}
}, a);
console.log(`Product url: ${productUrl}`);
const adStatus = productUrl.includes("sspa");
console.log(`Ad Status: ${adStatus}`);
const urlArray = productUrl.split("/");
const asin = urlArray[urlArray.length-2];
console.log(`Asin: ${asin}`);
const pricingUnit = await div.$("span.a-price-symbol");
const wholePrice = await div.$("span.a-price-whole");
const decimalPrice = await div.$("span.a-price-fraction");
if (pricingUnit === null || wholePrice === null || decimalPrice === null) {
console.log("Failed to find price!");
continue;
}
const priceSymbol = await page.evaluate(pricingUnit => pricingUnit.textContent, pricingUnit);
const wholeNumber = await page.evaluate(wholePrice => wholePrice.textContent, wholePrice);
const decimalNumber = await page.evaluate(decimalPrice => decimalPrice.textContent, decimalPrice)
const formattedWholeNumber = wholeNumber.replace(",", "").replace(".", "");
const price = Number(`${formattedWholeNumber}.${decimalNumber}`);
const realPricePresence = await div.$("span.a-price.a-text-price span");
let realPrice = 0.0;
if (realPricePresence !== null) {
const realPriceStr = await page.evaluate(realPricePresence => realPricePresence.textContent, realPricePresence);
realPrice = Number(realPriceStr.replace(priceSymbol, ""));
} else {
realPrice = price;
}
let rating = "n/a";
const ratingPresence = await div.$("span.a-icon-alt");
if (ratingPresence !== null) {
rating = await page.evaluate(ratingPresence => ratingPresence.textContent, ratingPresence);
}
const item = {
asin: asin,
title: title,
url: productUrl,
is_ad: adStatus,
pricing_unit: priceSymbol,
price: price,
real_price: realPrice,
rating: rating
}
console.log("Item:", item);
lastTitle = title;
}
}
success = true;
} catch (err) {
console.log(`ERROR: ${err}, PAGE ${pageNumber}`);
tries++;
} finally {
await page.close();
if (success) {
console.log(`Finished scraping page: ${pageNumber}`);
}
}
}
}
function range(start, end) {
const array = [];
for (let i=start; i<end; i++) {
array.push(i);
}
return array;
}
async function concurrentCrawl(browser, query, pages, location="us", retries=3) {
console.log("Concurrent crawl started");
const pageList = range(1, pages+1);
for (const page of pageList) {
await resultCrawl(browser, query, page, retries);
}
console.log("Concurrent crawl finished");
}
async function main() {
const PRODUCTS = ["phone"];
const MAX_RETRIES = 4;
const PAGES = 1;
const LOCATION = "us";
for (const product of PRODUCTS) {
const browser = await puppeteer.launch();
await concurrentCrawl(browser, product, PAGES, LOCATION, MAX_RETRIES);
await browser.close();
}
}
main();
As you can see above, not much has changed at all in our code.
- Our function now takes a `pageNumber` and inserts it into our URL.
- We also have a `concurrentCrawl()` function which acts as a placeholder... we'll add real concurrency later on.
- `range()` simply allows us to create a range of page numbers, similar to Python's built-in `range()` (see the short example below).
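For instance, with the `range()` function defined above:

```javascript
console.log(range(1, 6));
// [ 1, 2, 3, 4, 5 ] -- start is inclusive, end is exclusive, like Python's range()
```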
Step 3: Storing the Scraped Data
Now that our crawler can choose a page to scrape, it's time to give it the ability to store our data. In this section, we'll create a function for writing objects to a CSV file. We'll call this function on every item we parse.
By doing this, our CSV file is always up to date and we don't lose a batch just from one bad result.
Here is our updated code example.
const puppeteer = require("puppeteer");
const createCsvWriter = require("csv-writer").createObjectCsvWriter;
const csvParse = require("csv-parse");
const fs = require("fs");
const API_KEY = "YOUR-SUPER-SECRET-API-KEY";
async function writeToCsv(data, outputFile) {
if (!data || data.length === 0) {
throw new Error("No data to write!");
}
const fileExists = fs.existsSync(outputFile);
const headers = Object.keys(data[0]).map(key => ({id: key, title: key}))
const csvWriter = createCsvWriter({
path: outputFile,
header: headers,
append: fileExists
});
try {
await csvWriter.writeRecords(data);
} catch (e) {
throw new Error("Failed to write to csv");
}
}
async function resultCrawl(browser, productName, pageNumber, location="us", retries=3) {
let tries = 0;
let success = false;
while (tries < retries && !success) {
const page = await browser.newPage();
try {
const url = `https://www.amazon.com/s?k=${productName}&page=${pageNumber}`;
await page.goto(url);
console.log(`Successfully fetched page: ${pageNumber}`);
const badDivs = await page.$$("div.AdHolder");
for (const div of badDivs) {
await page.evaluate(element => {
element.parentNode.removeChild(element);
}, div);
}
const divs = await page.$$("div > span > div");
console.log(`Div count: ${divs.length}`);
let lastTitle = "";
for (const div of divs) {
const h2 = await div.$("h2");
if (h2 === null) {
continue;
}
const a = await h2.$("a");
const parsable = h2 !== null && a !== null;
if (parsable) {
const title = await page.evaluate(element => element.textContent, h2);
if (title === lastTitle) {
continue;
}
console.log(`Title: ${title}`);
const productUrl = await page.evaluate(a => {
const url = a.getAttribute("href");
if (url.includes("https")) {
return url;
} else {
return `https://www.amazon.com${url}`;
}
}, a);
console.log(`Product url: ${productUrl}`);
const adStatus = productUrl.includes("sspa");
console.log(`Ad Status: ${adStatus}`);
const urlArray = productUrl.split("/");
const asin = urlArray[urlArray.length-2];
console.log(`Asin: ${asin}`);
const pricingUnit = await div.$("span.a-price-symbol");
const wholePrice = await div.$("span.a-price-whole");
const decimalPrice = await div.$("span.a-price-fraction");
if (pricingUnit === null || wholePrice === null || decimalPrice === null) {
console.log("Failed to find price!");
continue;
}
const priceSymbol = await page.evaluate(pricingUnit => pricingUnit.textContent, pricingUnit);
const wholeNumber = await page.evaluate(wholePrice => wholePrice.textContent, wholePrice);
const decimalNumber = await page.evaluate(decimalPrice => decimalPrice.textContent, decimalPrice)
const formattedWholeNumber = wholeNumber.replace(",", "").replace(".", "");
const price = Number(`${formattedWholeNumber}.${decimalNumber}`);
const realPricePresence = await div.$("span.a-price.a-text-price span");
let realPrice = 0.0;
if (realPricePresence !== null) {
const realPriceStr = await page.evaluate(realPricePresence => realPricePresence.textContent, realPricePresence);
realPrice = Number(realPriceStr.replace(priceSymbol, ""));
} else {
realPrice = price;
}
let rating = "n/a";
const ratingPresence = await div.$("span.a-icon-alt");
if (ratingPresence !== null) {
rating = await page.evaluate(ratingPresence => ratingPresence.textContent, ratingPresence);
}
const item = {
asin: asin,
title: title,
url: productUrl,
is_ad: adStatus,
pricing_unit: priceSymbol,
price: price,
real_price: realPrice,
rating: rating
}
await writeToCsv([item], `${productName}.csv`);
console.log("Item:", item);
lastTitle = title;
}
}
success = true;
} catch (err) {
console.log(`ERROR: ${err}, PAGE ${pageNumber}`);
tries++;
} finally {
await page.close();
if (success) {
console.log(`Finished scraping page: ${pageNumber}`);
}
}
}
}
function range(start, end) {
const array = [];
for (let i=start; i<end; i++) {
array.push(i);
}
return array;
}
async function concurrentCrawl(browser, query, pages, location="us", retries=3) {
console.log("Concurrent crawl started");
const pageList = range(1, pages+1);
for (const page of pageList) {
await resultCrawl(browser, query, page, location, retries);
}
console.log("Concurrent crawl finished");
}
async function main() {
const PRODUCTS = ["phone"];
const MAX_RETRIES = 4;
const PAGES = 1;
const LOCATION = "us";
for (const product of PRODUCTS) {
const browser = await puppeteer.launch();
await concurrentCrawl(browser, product, PAGES, LOCATION, MAX_RETRIES);
await browser.close();
}
}
main();
In the example above, we use our `item` object to hold individual product data. We add a `writeToCsv()` function as well. We call the write function on each object individually for the safest and best results. If the CSV file doesn't exist, a new one is created. If it does exist, we append to the CSV file so we don't overwrite important data.
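As a quick usage sketch (inside an async function, with made-up rows), calling `writeToCsv()` twice against the same file appends rather than overwrites:

```javascript
// First call creates phone.csv with a header row; the second call appends to it.
// The ASINs and prices here are made up for illustration.
await writeToCsv([{ asin: "B000000000", title: "Example Phone", price: 99.99 }], "phone.csv");
await writeToCsv([{ asin: "B000000001", title: "Another Phone", price: 149.99 }], "phone.csv");
```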
Step 4: Adding Concurrency
When we added pagination earlier, we gave our crawler the ability to scrape different pages. Now that we can scrape a specific page and store its data, it's time to give our crawler the power to scrape a bunch of pages at once. With concurrency, we can do exactly that.
Here is our `concurrentCrawl()` function.
async function concurrentCrawl(browser, query, pages, concurrencyLimit, location="us", retries=3) {
console.log("Concurrent crawl started");
const pageList = range(1, pages+1);
while (pageList.length > 0) {
const currentBatch = pageList.splice(0, concurrencyLimit);
const tasks = currentBatch.map(page => resultCrawl(browser, query, page, location, retries));
try {
await Promise.all(tasks);
} catch (e) {
console.log(`Failed to process batch: ${e}`);
}
}
console.log("Concurrent crawl finished");
}
We use `concurrencyLimit` to manage our pages. If our `concurrencyLimit` is set to 5, we'll scrape 5 pages at once. Be mindful of this limit.
Not only does your machine have limits, but your ScrapeOps API key will likely also have a concurrency limit. You don't want to run past your limit... you'd just be wasting resources and you'll probably crash Puppeteer!
Here is our updated code. We also added a `location` argument to `resultCrawl()`. While we don't use the location in this example, we'll be using it in the next section when we add proxy support.
const puppeteer = require("puppeteer");
const createCsvWriter = require("csv-writer").createObjectCsvWriter;
const csvParse = require("csv-parse");
const fs = require("fs");
const API_KEY = "YOUR-SUPER-SECRET-API-KEY";
async function writeToCsv(data, outputFile) {
if (!data || data.length === 0) {
throw new Error("No data to write!");
}
const fileExists = fs.existsSync(outputFile);
const headers = Object.keys(data[0]).map(key => ({id: key, title: key}))
const csvWriter = createCsvWriter({
path: outputFile,
header: headers,
append: fileExists
});
try {
await csvWriter.writeRecords(data);
} catch (e) {
throw new Error("Failed to write to csv");
}
}
async function resultCrawl(browser, productName, pageNumber, location="us", retries=3) {
let tries = 0;
let success = false;
while (tries < retries && !success) {
const page = await browser.newPage();
try {
const url = `https://www.amazon.com/s?k=${productName}&page=${pageNumber}`;
await page.goto(url);
console.log(`Successfully fetched page: ${pageNumber}`);
const badDivs = await page.$$("div.AdHolder");
for (const div of badDivs) {
await page.evaluate(element => {
element.parentNode.removeChild(element);
}, div);
}
const divs = await page.$$("div > span > div");
console.log(`Div count: ${divs.length}`);
let lastTitle = "";
for (const div of divs) {
const h2 = await div.$("h2");
if (h2 === null) {
continue;
}
const a = await h2.$("a");
const parsable = h2 !== null && a !== null;
if (parsable) {
const title = await page.evaluate(element => element.textContent, h2);
if (title === lastTitle) {
continue;
}
console.log(`Title: ${title}`);
const productUrl = await page.evaluate(a => {
const url = a.getAttribute("href");
if (url.includes("https")) {
return url;
} else {
return `https://www.amazon.com${url}`;
}
}, a);
console.log(`Product url: ${productUrl}`);
const adStatus = productUrl.includes("sspa");
console.log(`Ad Status: ${adStatus}`);
const urlArray = productUrl.split("/");
const asin = urlArray[urlArray.length-2];
console.log(`Asin: ${asin}`);
const pricingUnit = await div.$("span.a-price-symbol");
const wholePrice = await div.$("span.a-price-whole");
const decimalPrice = await div.$("span.a-price-fraction");
if (pricingUnit === null || wholePrice === null || decimalPrice === null) {
console.log("Failed to find price!");
continue;
}
const priceSymbol = await page.evaluate(pricingUnit => pricingUnit.textContent, pricingUnit);
const wholeNumber = await page.evaluate(wholePrice => wholePrice.textContent, wholePrice);
const decimalNumber = await page.evaluate(decimalPrice => decimalPrice.textContent, decimalPrice)
const formattedWholeNumber = wholeNumber.replace(",", "").replace(".", "");
const price = Number(`${formattedWholeNumber}.${decimalNumber}`);
const realPricePresence = await div.$("span.a-price.a-text-price span");
let realPrice = 0.0;
if (realPricePresence !== null) {
const realPriceStr = await page.evaluate(realPricePresence => realPricePresence.textContent, realPricePresence);
realPrice = Number(realPriceStr.replace(priceSymbol, ""));
} else {
realPrice = price;
}
let rating = "n/a";
const ratingPresence = await div.$("span.a-icon-alt");
if (ratingPresence !== null) {
rating = await page.evaluate(ratingPresence => ratingPresence.textContent, ratingPresence);
}
const item = {
asin: asin,
title: title,
url: productUrl,
is_ad: adStatus,
pricing_unit: priceSymbol,
price: price,
real_price: realPrice,
rating: rating
}
await writeToCsv([item], `${productName}.csv`);
console.log("Item:", item);
lastTitle = title;
}
}
success = true;
} catch (err) {
console.log(`ERROR: ${err}, PAGE ${pageNumber}`);
tries++;
} finally {
await page.close();
if (success) {
console.log(`Finished scraping page: ${pageNumber}`);
}
}
}
}
function range(start, end) {
const array = [];
for (let i=start; i<end; i++) {
array.push(i);
}
return array;
}
async function concurrentCrawl(browser, query, pages, concurrencyLimit, location="us", retries=3) {
console.log("Concurrent crawl started");
const pageList = range(1, pages+1);
while (pageList.length > 0) {
const currentBatch = pageList.splice(0, concurrencyLimit);
const tasks = currentBatch.map(page => resultCrawl(browser, query, page, location, retries));
try {
await Promise.all(tasks);
} catch (e) {
console.log(`Failed to process batch: ${e}`);
}
}
console.log("Concurrent crawl finished");
}
async function main() {
const PRODUCTS = ["phone"];
const MAX_RETRIES = 4;
const PAGES = 1;
const CONCURRENCY_LIMIT = 4;
const LOCATION = "us";
for (const product of PRODUCTS) {
const browser = await puppeteer.launch();
await concurrentCrawl(browser, product, PAGES, CONCURRENCY_LIMIT, LOCATION, MAX_RETRIES);
await browser.close();
}
}
main();
We're almost ready, but not quite. If you run the code above, you'll still most likely get blocked. To an Amazon server, our scraper already looked a bit abnormal.
Now it's not only abnormal, it's exponentially faster than it was before. Let's add proxy support in the next section so we can avoid getting blocked.
Step 5: Bypassing Anti-Bots
We're almost ready for our production run. It's time to add proxy support so Amazon stops blocking our crawler. We really only need to add one function here: `getScrapeOpsUrl()`.
This function takes in a regular URL and uses basic string formatting to convert it into a URL that uses the ScrapeOps API. Take a look below:
function getScrapeOpsUrl(url, location="us") {
const params = new URLSearchParams({
api_key: API_KEY,
url: url,
country: location
});
return `https://proxy.scrapeops.io/v1/?${params.toString()}`;
}
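For example, running a search URL through this helper produces something roughly like the following (with the placeholder API key from above):

```javascript
console.log(getScrapeOpsUrl("https://www.amazon.com/s?k=phone&page=1", "us"));
// https://proxy.scrapeops.io/v1/?api_key=YOUR-SUPER-SECRET-API-KEY&url=https%3A%2F%2Fwww.amazon.com%2Fs%3Fk%3Dphone%26page%3D1&country=us
```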
With this function in place, here is our updated code below.
const puppeteer = require("puppeteer");
const createCsvWriter = require("csv-writer").createObjectCsvWriter;
const csvParse = require("csv-parse");
const fs = require("fs");
const API_KEY = "YOUR-SUPER-SECRET-API-KEY";
async function writeToCsv(data, outputFile) {
if (!data || data.length === 0) {
throw new Error("No data to write!");
}
const fileExists = fs.existsSync(outputFile);
const headers = Object.keys(data[0]).map(key => ({id: key, title: key}))
const csvWriter = createCsvWriter({
path: outputFile,
header: headers,
append: fileExists
});
try {
await csvWriter.writeRecords(data);
} catch (e) {
throw new Error("Failed to write to csv");
}
}
function getScrapeOpsUrl(url, location="us") {
const params = new URLSearchParams({
api_key: API_KEY,
url: url,
country: location
});
return `https://proxy.scrapeops.io/v1/?${params.toString()}`;
}
async function resultCrawl(browser, productName, pageNumber, location="us", retries=3) {
let tries = 0;
let success = false;
while (tries < retries && !success) {
const page = await browser.newPage();
try {
const url = `https://www.amazon.com/s?k=${productName}&page=${pageNumber}`;
const proxyUrl = getScrapeOpsUrl(url, location);
console.log(proxyUrl);
await page.goto(proxyUrl);
console.log(`Successfully fetched page: ${pageNumber}`);
const badDivs = await page.$$("div.AdHolder");
for (const div of badDivs) {
await page.evaluate(element => {
element.parentNode.removeChild(element);
}, div);
}
const divs = await page.$$("div > span > div");
console.log(`Div count: ${divs.length}`);
let lastTitle = "";
for (const div of divs) {
const h2 = await div.$("h2");
if (h2 === null) {
continue;
}
const a = await h2.$("a");
const parsable = h2 !== null && a !== null;
if (parsable) {
const title = await page.evaluate(element => element.textContent, h2);
if (title === lastTitle) {
continue;
}
console.log(`Title: ${title}`);
const productUrl = await page.evaluate(a => {
const url = a.getAttribute("href");
if (url.includes("https")) {
return url;
} else {
return `https://www.amazon.com${url}`;
}
}, a);
console.log(`Product url: ${productUrl}`);
const adStatus = productUrl.includes("sspa");
console.log(`Ad Status: ${adStatus}`);
const urlArray = productUrl.split("/");
const asin = urlArray[urlArray.length-2];
console.log(`Asin: ${asin}`);
const pricingUnit = await div.$("span.a-price-symbol");
const wholePrice = await div.$("span.a-price-whole");
const decimalPrice = await div.$("span.a-price-fraction");
if (pricingUnit === null || wholePrice === null || decimalPrice === null) {
console.log("Failed to find price!");
continue;
}
const priceSymbol = await page.evaluate(pricingUnit => pricingUnit.textContent, pricingUnit);
const wholeNumber = await page.evaluate(wholePrice => wholePrice.textContent, wholePrice);
const decimalNumber = await page.evaluate(decimalPrice => decimalPrice.textContent, decimalPrice)
const formattedWholeNumber = wholeNumber.replace(",", "").replace(".", "");
const price = Number(`${formattedWholeNumber}.${decimalNumber}`);
const realPricePresence = await div.$("span.a-price.a-text-price span");
let realPrice = 0.0;
if (realPricePresence !== null) {
const realPriceStr = await page.evaluate(realPricePresence => realPricePresence.textContent, realPricePresence);
realPrice = Number(realPriceStr.replace(priceSymbol, ""));
} else {
realPrice = price;
}
let rating = "n/a";
const ratingPresence = await div.$("span.a-icon-alt");
if (ratingPresence !== null) {
rating = await page.evaluate(ratingPresence => ratingPresence.textContent, ratingPresence);
}
const item = {
asin: asin,
title: title,
url: productUrl,
is_ad: adStatus,
pricing_unit: priceSymbol,
price: price,
real_price: realPrice,
rating: rating
}
await writeToCsv([item], `${productName}.csv`);
console.log("Item:", item);
lastTitle = title;
}
}
success = true;
} catch (err) {
console.log(`ERROR: ${err}, PAGE ${pageNumber}`);
tries++;
} finally {
await page.close();
if (success) {
console.log(`Finished scraping page: ${pageNumber}`);
}
}
}
}
function range(start, end) {
const array = [];
for (let i=start; i<end; i++) {
array.push(i);
}
return array;
}
async function concurrentCrawl(browser, query, pages, concurrencyLimit, location="us", retries=3) {
console.log("Concurrent crawl started");
const pageList = range(1, pages+1);
while (pageList.length > 0) {
const currentBatch = pageList.splice(0, concurrencyLimit);
const tasks = currentBatch.map(page => resultCrawl(browser, query, page, location, retries));
try {
await Promise.all(tasks);
} catch (e) {
console.log(`Failed to process batch: ${e}`);
}
}
console.log("Concurrent crawl finished");
}
async function main() {
const PRODUCTS = ["phone"];
const MAX_RETRIES = 4;
const PAGES = 1;
const CONCURRENCY_LIMIT = 4;
const LOCATION = "us";
for (const product of PRODUCTS) {
const browser = await puppeteer.launch();
await concurrentCrawl(browser, product, PAGES, CONCURRENCY_LIMIT, LOCATION, MAX_RETRIES);
await browser.close();
}
}
main();
Now that we can get past anti-bots, we're ready to move on to our production run.
Step 6: Production Run
Time for our production run. Take a look at our main function below.
async function main() {
const PRODUCTS = ["phone"];
const MAX_RETRIES = 4;
const PAGES = 5;
const CONCURRENCY_LIMIT = 4;
const LOCATION = "us";
for (const product of PRODUCTS) {
const browser = await puppeteer.launch();
await concurrentCrawl(browser, product, PAGES, CONCURRENCY_LIMIT, LOCATION, MAX_RETRIES);
await browser.close();
}
}
You can change any of the following constants to change your results:
- `PRODUCTS`
- `MAX_RETRIES`
- `PAGES`
- `CONCURRENCY_LIMIT`
- `LOCATION`
To run this scraper, replace the filename below with whatever you chose to name yours.
node crawler-proxy
Our final crawler generated a report on 5 pages full of phones in 16.3 seconds. When running in production, be cautious with your `CONCURRENCY_LIMIT`. If you begin running into errors, decrease your concurrency limit. The higher your limit is, the more resources you're using, both on your machine and on your API key.
Here is the report it created:
Build An Amazon Product Scraper
Now it's time to build a scraper that looks up individual products. From these individual product pages, we need to extract feature bullets, prices, and images. This way, if you're interested in a product, simply pull up your report for that product!
Step 1: Create Simple Amazon Product Page Data Parser
Here's a parsing function that retrieves data from a product page. We're not ready to add it into our scraper yet because we first need the ability to read the CSV file we created earlier.
async function parseProduct(browser, productObject, location="us", retries=3) {
const productUrl = productObject.url;
let tries = 0;
let success = false;
const urlArray = productUrl.split("/");
const title = urlArray[urlArray.length-4];
const asin = urlArray[urlArray.length-2];
while (tries <= retries && !success) {
const page = await browser.newPage();
try {
await page.goto(productUrl, {timeout: 60000});
const imagesToSave = [];
const features = [];
const images = await page.$$("li img");
for (const image of images) {
const imageLink = await page.evaluate(element => element.getAttribute("src"), image);
if (imageLink.includes("https://m.media-amazon.com/images/I/")) {
imagesToSave.push(imageLink);
}
}
const featureBullets = await page.$$("li.a-spacing-mini");
for (const feature of featureBullets) {
const span = await feature.$("span");
const text = await page.evaluate(span => span.textContent, span);
if (!features.includes(text)) {
features.push(text);
}
}
const priceSymbolElement = await page.$("span.a-price-symbol");
const priceWholeElement = await page.$("span.a-price-whole");
const priceDecimalElement = await page.$("span.a-price-fraction");
const priceSymbol = await page.evaluate(element => element.textContent, priceSymbolElement);
const priceWhole = (await page.evaluate(element => element.textContent, priceWholeElement)).replace(",", "").replace(".", "");
const priceDecimal = await page.evaluate(element => element.textContent, priceDecimalElement);
const price = Number(`${priceWhole}.${priceDecimal}`);
if (imagesToSave.length > 0) {
const item = {
asin: asin,
title: title,
url: productUrl,
pricing_unit: priceSymbol,
price: price,
feature_1: features[0],
feature_2: features[1],
feature_3: features[2],
feature_4: features[3],
images_1: imagesToSave[0],
images_2: imagesToSave[1],
images_3: imagesToSave[2],
images_4: imagesToSave[3]
}
console.log(`Item: ${JSON.stringify(item)}`);
success = true;
} else {
await page.screenshot({path: `ERROR-${title}.png`});
throw new Error("Failed to find item details!");
}
} catch (e) {
console.log("ERROR:", e);
await page.screenshot({path: "error.png", fullPage: true});
console.log(`Failed page, Tries left: ${retries-tries}`);
tries++;
} finally {
await page.close();
}
}
return;
}
In the above function, we pull the features and item images from the product page. These will be used in the individual report we generate for each product.
Step 2: Loading URLs To Scrape
Now it's time to give our code the ability to run. In order to parse these items, we need to read them from a CSV file and then pass them into our parse function.
The code example below adds a `concurrentProductScrape()` function. At the moment, this function does not use concurrency; we just have a `for` loop as a placeholder.
This function reads the CSV file and then passes each object from the file into `parseProduct()`.
const puppeteer = require("puppeteer");
const createCsvWriter = require("csv-writer").createObjectCsvWriter;
const csvParse = require("csv-parse");
const fs = require("fs");
const API_KEY = "YOUR-SUPER-SECRET-API-KEY";
async function writeToCsv(data, outputFile) {
if (!data || data.length === 0) {
throw new Error("No data to write!");
}
const fileExists = fs.existsSync(outputFile);
const headers = Object.keys(data[0]).map(key => ({id: key, title: key}))
const csvWriter = createCsvWriter({
path: outputFile,
header: headers,
append: fileExists
});
try {
await csvWriter.writeRecords(data);
} catch (e) {
throw new Error("Failed to write to csv");
}
}
async function readCsv(inputFile) {
const results = [];
const parser = fs.createReadStream(inputFile).pipe(csvParse.parse({
columns: true,
delimiter: ",",
trim: true,
skip_empty_lines: true
}));
for await (const record of parser) {
results.push(record);
}
return results;
}
function getScrapeOpsUrl(url, location="us") {
const params = new URLSearchParams({
api_key: API_KEY,
url: url,
country: location
});
return `https://proxy.scrapeops.io/v1/?${params.toString()}`;
}
async function resultCrawl(browser, productName, pageNumber, location="us", retries=3) {
let tries = 0;
let success = false;
while (tries < retries && !success) {
const page = await browser.newPage();
try {
const url = `https://www.amazon.com/s?k=${productName}&page=${pageNumber}`;
const proxyUrl = getScrapeOpsUrl(url, location);
console.log(proxyUrl);
await page.goto(proxyUrl);
console.log(`Successfully fetched page: ${pageNumber}`);
const badDivs = await page.$$("div.AdHolder");
for (const div of badDivs) {
await page.evaluate(element => {
element.parentNode.removeChild(element);
}, div);
}
const divs = await page.$$("div > span > div");
console.log(`Div count: ${divs.length}`);
let lastTitle = "";
for (const div of divs) {
const h2 = await div.$("h2");
if (h2 === null) {
continue;
}
const a = await h2.$("a");
const parsable = h2 !== null && a !== null;
if (parsable) {
const title = await page.evaluate(element => element.textContent, h2);
if (title === lastTitle) {
continue;
}
console.log(`Title: ${title}`);
const productUrl = await page.evaluate(a => {
const url = a.getAttribute("href");
if (url.includes("https")) {
return url;
} else {
return `https://www.amazon.com${url}`;
}
}, a);
console.log(`Product url: ${productUrl}`);
const adStatus = productUrl.includes("sspa");
console.log(`Ad Status: ${adStatus}`);
const urlArray = productUrl.split("/");
const asin = urlArray[urlArray.length-2];
console.log(`Asin: ${asin}`);
const pricingUnit = await div.$("span.a-price-symbol");
const wholePrice = await div.$("span.a-price-whole");
const decimalPrice = await div.$("span.a-price-fraction");
if (pricingUnit === null || wholePrice === null || decimalPrice === null) {
console.log("Failed to find price!");
continue;
}
const priceSymbol = await page.evaluate(pricingUnit => pricingUnit.textContent, pricingUnit);
const wholeNumber = await page.evaluate(wholePrice => wholePrice.textContent, wholePrice);
const decimalNumber = await page.evaluate(decimalPrice => decimalPrice.textContent, decimalPrice)
const formattedWholeNumber = wholeNumber.replace(",", "").replace(".", "");
const price = Number(`${formattedWholeNumber}.${decimalNumber}`);
const realPricePresence = await div.$("span.a-price.a-text-price span");
let realPrice = 0.0;
if (realPricePresence !== null) {
const realPriceStr = await page.evaluate(realPricePresence => realPricePresence.textContent, realPricePresence);
realPrice = Number(realPriceStr.replace(priceSymbol, ""));
} else {
realPrice = price;
}
let rating = "n/a";
const ratingPresence = await div.$("span.a-icon-alt");
if (ratingPresence !== null) {
rating = await page.evaluate(ratingPresence => ratingPresence.textContent, ratingPresence);
}
const item = {
asin: asin,
title: title,
url: productUrl,
is_ad: adStatus,
pricing_unit: priceSymbol,
price: price,
real_price: realPrice,
rating: rating
}
await writeToCsv([item], `${productName}.csv`);
console.log("Item:", item);
lastTitle = title;
}
}
success = true;
} catch (err) {
console.log(`ERROR: ${err}, PAGE ${pageNumber}`);
tries++;
} finally {
await page.close();
if (success) {
console.log(`Finished scraping page: ${pageNumber}`);
}
}
}
}
function range(start, end) {
const array = [];
for (let i=start; i<end; i++) {
array.push(i);
}
return array;
}
async function concurrentCrawl(browser, query, pages, concurrencyLimit, location="us", retries=3) {
console.log("Concurrent crawl started");
const pageList = range(1, pages+1);
while (pageList.length > 0) {
const currentBatch = pageList.splice(0, concurrencyLimit);
const tasks = currentBatch.map(page => resultCrawl(browser, query, page, location, retries));
try {
await Promise.all(tasks);
} catch (e) {
console.log(`Failed to process batch: ${e}`);
}
}
console.log("Concurrent crawl finished");
}
async function parseProduct(browser, productObject, location="us", retries=3) {
const productUrl = productObject.url;
let tries = 0;
let success = false;
const urlArray = productUrl.split("/");
const title = urlArray[urlArray.length-4];
const asin = urlArray[urlArray.length-2];
while (tries <= retries && !success) {
const page = await browser.newPage();
try {
await page.goto(productUrl, {timeout: 60000});
const imagesToSave = [];
const features = [];
const images = await page.$$("li img");
for (const image of images) {
const imageLink = await page.evaluate(element => element.getAttribute("src"), image);
if (imageLink.includes("https://m.media-amazon.com/images/I/")) {
imagesToSave.push(imageLink);
}
}
const featureBullets = await page.$$("li.a-spacing-mini");
for (const feature of featureBullets) {
const span = await feature.$("span");
const text = await page.evaluate(span => span.textContent, span);
if (!features.includes(text)) {
features.push(text);
}
}
const priceSymbolElement = await page.$("span.a-price-symbol");
const priceWholeElement = await page.$("span.a-price-whole");
const priceDecimalElement = await page.$("span.a-price-fraction");
const priceSymbol = await page.evaluate(element => element.textContent, priceSymbolElement);
const priceWhole = (await page.evaluate(element => element.textContent, priceWholeElement)).replace(",", "").replace(".", "");
const priceDecimal = await page.evaluate(element => element.textContent, priceDecimalElement);
const price = Number(`${priceWhole}.${priceDecimal}`);
if (imagesToSave.length > 0) {
const item = {
asin: asin,
title: title,
url: productUrl,
pricing_unit: priceSymbol,
price: price,
feature_1: features[0],
feature_2: features[1],
feature_3: features[2],
feature_4: features[3],
images_1: imagesToSave[0],
images_2: imagesToSave[1],
images_3: imagesToSave[2],
images_4: imagesToSave[3]
}
console.log(`Item: ${JSON.stringify(item)}`);
success = true;
} else {
await page.screenshot({path: `ERROR-${title}.png`});
throw new Error("Failed to find item details!");
}
} catch (e) {
console.log("ERROR:", e);
await page.screenshot({path: "error.png", fullPage: true});
console.log(`Failed page, Tries left: ${retries-tries}`);
tries++;
} finally {
await page.close();
}
}
return;
}
async function concurrentProductScrape(browser, inputFile, concurrencyLimit, location="us", retries=3) {
const productObjects = await readCsv(inputFile);
for (const productObject of productObjects) {
await parseProduct(browser, productObject, location, retries);
}
}
async function main() {
const PRODUCTS = ["phone"];
const MAX_RETRIES = 4;
const PAGES = 1;
const CONCURRENCY_LIMIT = 4;
const LOCATION = "us";
for (const product of PRODUCTS) {
const browser = await puppeteer.launch();
const fileName = `./${product}.csv`;
await concurrentCrawl(browser, product, PAGES, CONCURRENCY_LIMIT, LOCATION, MAX_RETRIES);
await concurrentProductScrape(browser, fileName, CONCURRENCY_LIMIT, LOCATION, MAX_RETRIES);
await browser.close();
}
}
main();
You might be wondering why we use a separate field for each bullet point and image. The reason is actually pretty simple: our objects need a fixed number of fields. Saving the first four features and images gives us uniform reporting and a set format for each CSV file.
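If you wanted to make that fixed shape explicit, a small helper like the hypothetical `toFixedColumns()` below pads the arrays so every row always has exactly four feature columns and four image columns:

```javascript
// Hypothetical helper: map the first `count` entries of an array onto fixed CSV columns,
// filling any missing slots with "n/a" so every row has the same shape.
function toFixedColumns(prefix, values, count = 4) {
  const columns = {};
  for (let i = 0; i < count; i++) {
    columns[`${prefix}_${i + 1}`] = values[i] !== undefined ? values[i] : "n/a";
  }
  return columns;
}

// Usage sketch: { asin, title, ...toFixedColumns("feature", features), ...toFixedColumns("images", imagesToSave) }
```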
Step 3: Storing the Scraped Data
Similar to how we stored our data with the results crawler, we're going to use an `item` object to hold our data. We'll then pass this object into our `writeToCsv()` function to store our data.
We can safely pass each object into the write function just like we did earlier.
const puppeteer = require("puppeteer");
const createCsvWriter = require("csv-writer").createObjectCsvWriter;
const csvParse = require("csv-parse");
const fs = require("fs");
const API_KEY = "YOUR-SUPER-SECRET-API-KEY";
async function writeToCsv(data, outputFile) {
if (!data || data.length === 0) {
throw new Error("No data to write!");
}
const fileExists = fs.existsSync(outputFile);
const headers = Object.keys(data[0]).map(key => ({id: key, title: key}))
const csvWriter = createCsvWriter({
path: outputFile,
header: headers,
append: fileExists
});
try {
await csvWriter.writeRecords(data);
} catch (e) {
throw new Error("Failed to write to csv");
}
}
async function readCsv(inputFile) {
const results = [];
const parser = fs.createReadStream(inputFile).pipe(csvParse.parse({
columns: true,
delimiter: ",",
trim: true,
skip_empty_lines: true
}));
for await (const record of parser) {
results.push(record);
}
return results;
}
function getScrapeOpsUrl(url, location="us") {
const params = new URLSearchParams({
api_key: API_KEY,
url: url,
country: location
});
return `https://proxy.scrapeops.io/v1/?${params.toString()}`;
}
async function resultCrawl(browser, productName, pageNumber, location="us", retries=3) {
let tries = 0;
let success = false;
while (tries < retries && !success) {
const page = await browser.newPage();
try {
const url = `https://www.amazon.com/s?k=${productName}&page=${pageNumber}`;
const proxyUrl = getScrapeOpsUrl(url, location);
console.log(proxyUrl);
await page.goto(proxyUrl);
console.log(`Successfully fetched page: ${pageNumber}`);
const badDivs = await page.$$("div.AdHolder");
for (const div of badDivs) {
await page.evaluate(element => {
element.parentNode.removeChild(element);
}, div);
}
const divs = await page.$$("div > span > div");
console.log(`Div count: ${divs.length}`);
let lastTitle = "";
for (const div of divs) {
const h2 = await div.$("h2");
if (h2 === null) {
continue;
}
const a = await h2.$("a");
const parsable = h2 !== null && a !== null;
if (parsable) {
const title = await page.evaluate(element => element.textContent, h2);
if (title === lastTitle) {
continue;
}
console.log(`Title: ${title}`);
const productUrl = await page.evaluate(a => {
const url = a.getAttribute("href");
if (url.includes("https")) {
return url;
} else {
return `https://www.amazon.com${url}`;
}
}, a);
console.log(`Product url: ${productUrl}`);
const adStatus = productUrl.includes("sspa");
console.log(`Ad Status: ${adStatus}`);
const urlArray = productUrl.split("/");
const asin = urlArray[urlArray.length-2];
console.log(`Asin: ${asin}`);
const pricingUnit = await div.$("span.a-price-symbol");
const wholePrice = await div.$("span.a-price-whole");
const decimalPrice = await div.$("span.a-price-fraction");
if (pricingUnit === null || wholePrice === null || decimalPrice === null) {
console.log("Failed to find price!");
continue;
}
const priceSymbol = await page.evaluate(pricingUnit => pricingUnit.textContent, pricingUnit);
const wholeNumber = await page.evaluate(wholePrice => wholePrice.textContent, wholePrice);
const decimalNumber = await page.evaluate(decimalPrice => decimalPrice.textContent, decimalPrice)
const formattedWholeNumber = wholeNumber.replace(",", "").replace(".", "");
const price = Number(`${formattedWholeNumber}.${decimalNumber}`);
const realPricePresence = await div.$("span.a-price.a-text-price span");
let realPrice = 0.0;
if (realPricePresence !== null) {
const realPriceStr = await page.evaluate(realPricePresence => realPricePresence.textContent, realPricePresence);
realPrice = Number(realPriceStr.replace(priceSymbol, ""));
} else {
realPrice = price;
}
let rating = "n/a";
const ratingPresence = await div.$("span.a-icon-alt");
if (ratingPresence !== null) {
rating = await page.evaluate(ratingPresence => ratingPresence.textContent, ratingPresence);
}
const item = {
asin: asin,
title: title,
url: productUrl,
is_ad: adStatus,
pricing_unit: priceSymbol,
price: price,
real_price: realPrice,
rating: rating
}
await writeToCsv([item], `${productName}.csv`);
console.log("Item:", item);
lastTitle = title;
}
}
success = true;
} catch (err) {
console.log(`ERROR: ${err}, PAGE ${pageNumber}`);
tries++;
} finally {
await page.close();
if (success) {
console.log(`Finished scraping page: ${pageNumber}`);
}
}
}
}
function range(start, end) {
const array = [];
for (let i=start; i<end; i++) {
array.push(i);
}
return array;
}
async function concurrentCrawl(browser, query, pages, concurrencyLimit, location="us", retries=3) {
console.log("Concurrent crawl started");
const pageList = range(1, pages+1);
while (pageList.length > 0) {
const currentBatch = pageList.splice(0, concurrencyLimit);
const tasks = currentBatch.map(page => resultCrawl(browser, query, page, location, retries));
try {
await Promise.all(tasks);
} catch (e) {
console.log(`Failed to process batch: ${e}`);
}
}
console.log("Concurrent crawl finished");
}
async function parseProduct(browser, productObject, location="us", retries=3) {
const productUrl = productObject.url;
let tries = 0;
let success = false;
const urlArray = productUrl.split("/");
const title = urlArray[urlArray.length-4];
const asin = urlArray[urlArray.length-2];
while (tries <= retries && !success) {
const page = await browser.newPage();
try {
await page.goto(productUrl, {timeout: 60000});
const imagesToSave = [];
const features = [];
const images = await page.$$("li img");
for (const image of images) {
const imageLink = await page.evaluate(element => element.getAttribute("src"), image);
if (imageLink.includes("https://m.media-amazon.com/images/I/")) {
imagesToSave.push(imageLink);
}
}
const featureBullets = await page.$$("li.a-spacing-mini");
for (const feature of featureBullets) {
const span = await feature.$("span");
const text = await page.evaluate(span => span.textContent, span);
if (!features.includes(text)) {
features.push(text);
}
}
const priceSymbolElement = await page.$("span.a-price-symbol");
const priceWholeElement = await page.$("span.a-price-whole");
const priceDecimalElement = await page.$("span.a-price-fraction");
const priceSymbol = await page.evaluate(element => element.textContent, priceSymbolElement);
const priceWhole = (await page.evaluate(element => element.textContent, priceWholeElement)).replace(",", "").replace(".", "");
const priceDecimal = await page.evaluate(element => element.textContent, priceDecimalElement);
const price = Number(`${priceWhole}.${priceDecimal}`);
if (imagesToSave.length > 0) {
const item = {
asin: asin,
title: title,
url: productUrl,
pricing_unit: priceSymbol,
price: price,
feature_1: features[0],
feature_2: features[1],
feature_3: features[2],
feature_4: features[3],
images_1: imagesToSave[0],
images_2: imagesToSave[1],
images_3: imagesToSave[2],
images_4: imagesToSave[3]
}
await writeToCsv([item], `${item.title}.csv`);
console.log("Wrote to csv");
success = true;
} else {
await page.screenshot({path: `ERROR-${title}.png`});
throw new Error("Failed to find item details!");
}
} catch (e) {
console.log("ERROR:", e);
await page.screenshot({path: "error.png", fullPage: true});
console.log(`Failed page, Tries left: ${retries-tries}`);
tries++;
} finally {
await page.close();
}
}
return;
}
async function concurrentProductScrape(browser, inputFile, concurrencyLimit, location="us", retries=3) {
const productObjects = await readCsv(inputFile);
for (const productObject of productObjects) {
await parseProduct(browser, productObject, location, retries);
}
}
async function main() {
const PRODUCTS = ["phone"];
const MAX_RETRIES = 4;
const PAGES = 5;
const CONCURRENCY_LIMIT = 4;
const LOCATION = "us";
for (const product of PRODUCTS) {
const browser = await puppeteer.launch();
const fileName = `./${product}.csv`;
await concurrentCrawl(browser, product, PAGES, CONCURRENCY_LIMIT, LOCATION, MAX_RETRIES);
await concurrentProductScrape(browser, fileName, CONCURRENCY_LIMIT, LOCATION, MAX_RETRIES);
await browser.close();
}
}
main();
You may have also noticed that from inside parseProduct(), we open an individual CSV file for each product. This way, we generate a separate report for every product we scraped earlier with the crawler. If you want to see the details of a specific item, just open the report for that item.
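As a quick illustration, here is a minimal standalone sketch of how that per-product filename is derived inside parseProduct(), using a made-up product URL (the URL and ASIN below are hypothetical, not real data):
// Minimal sketch: deriving the per-product report filename the same way
// parseProduct() does. The product URL and ASIN here are made up.
const productUrl = "https://www.amazon.com/Example-Phone-128GB/dp/B000000000/ref=sr_1_1";
const urlArray = productUrl.split("/");
const title = urlArray[urlArray.length - 4]; // "Example-Phone-128GB"
const asin = urlArray[urlArray.length - 2];  // "B000000000"
console.log(`${title}.csv`);                 // the report file for this product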
Step 4: Adding Concurrency
Now, we're going to add concurrency so we can parse multiple products at once. This is very similar to when we added concurrency to the crawler earlier.
Take a look at the function below; it's the finished version of concurrentProductScrape(). In this function, we read our CSV file into an array.
We then repeatedly shrink the array with splice(). Each call removes a batch of up to concurrencyLimit products, and every product in the batch is parsed simultaneously with Promise.all(). As batches finish, the array (and the memory it occupies) keeps shrinking until nothing is left to scrape.
async function concurrentProductScrape(browser, inputFile, concurrencyLimit, location="us", retries=3) {
const productObjects = await readCsv(inputFile);
while (productObjects.length > 0) {
const currentBatch = productObjects.splice(0, concurrencyLimit);
const tasks = currentBatch.map(productObject => parseProduct(browser, productObject, location, retries));
try {
await Promise.all(tasks);
} catch (e) {
console.log("Failed to process batch");
}
}
}
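If the splice() batching pattern is new to you, here is a minimal standalone sketch of the same idea, with a dummy task standing in for parseProduct() (the item values are just placeholders):
async function demoBatching() {
  // Stand-ins for the product objects read from the CSV.
  const items = ["a", "b", "c", "d", "e"];
  const concurrencyLimit = 2;
  while (items.length > 0) {
    // splice() removes up to concurrencyLimit items from the front of the array.
    const batch = items.splice(0, concurrencyLimit);
    // Run every task in the batch at the same time, just like parseProduct() above.
    await Promise.all(batch.map(async (item) => console.log("processing", item)));
  }
  // Batches processed: ["a", "b"], then ["c", "d"], then ["e"].
}
demoBatching();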
Aside from the small changes in this function, everything else remains the same. In the next example, we'll add proxy support so we can avoid getting blocked.
Step 5: Bypassing Anti-Bots
As you learned earlier in this article, Amazon will definitely block you if your scraper seems suspicious. Our scraper already looked abnormal, and after adding concurrency it looks even more abnormal. In this example, we're going to change one line of code and route our requests through a proxy so the whole thing keeps working.
const proxyUrl = getScrapeOpsUrl(productUrl, location);
From within parseProduct(), we call page.goto() on the proxied URL instead of the normal one.
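In context, the relevant lines of parseProduct() from the full code below now look like this:
// Inside parseProduct(): build the proxied URL once...
const proxyUrl = getScrapeOpsUrl(productUrl, location);
console.log("Proxy url:", proxyUrl);
// ...and navigate to the proxied URL instead of the raw product URL.
await page.goto(proxyUrl, {timeout: 60000});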
Here is the full code:
const puppeteer = require("puppeteer");
const createCsvWriter = require("csv-writer").createObjectCsvWriter;
const csvParse = require("csv-parse");
const fs = require("fs");
const API_KEY = "YOUR-SUPER-SECRET-API-KEY";
async function writeToCsv(data, outputFile) {
if (!data || data.length === 0) {
throw new Error("No data to write!");
}
const fileExists = fs.existsSync(outputFile);
const headers = Object.keys(data[0]).map(key => ({id: key, title: key}))
const csvWriter = createCsvWriter({
path: outputFile,
header: headers,
append: fileExists
});
try {
await csvWriter.writeRecords(data);
} catch (e) {
throw new Error("Failed to write to csv");
}
}
async function readCsv(inputFile) {
const results = [];
const parser = fs.createReadStream(inputFile).pipe(csvParse.parse({
columns: true,
delimiter: ",",
trim: true,
skip_empty_lines: true
}));
for await (const record of parser) {
results.push(record);
}
return results;
}
function getScrapeOpsUrl(url, location="us") {
const params = new URLSearchParams({
api_key: API_KEY,
url: url,
country: location
});
return `https://proxy.scrapeops.io/v1/?${params.toString()}`;
}
async function resultCrawl(browser, productName, pageNumber, location="us", retries=3) {
let tries = 0;
let success = false;
while (tries < retries && !success) {
const page = await browser.newPage();
try {
const url = `https://www.amazon.com/s?k=${productName}&page=${pageNumber}`;
const proxyUrl = getScrapeOpsUrl(url, location);
console.log(proxyUrl);
await page.goto(proxyUrl);
console.log(`Successfully fetched page: ${pageNumber}`);
const badDivs = await page.$$("div.AdHolder");
for (const div of badDivs) {
await page.evaluate(element => {
element.parentNode.removeChild(element);
}, div);
}
const divs = await page.$$("div > span > div");
console.log(`Div count: ${divs.length}`);
let lastTitle = "";
for (const div of divs) {
const h2 = await div.$("h2");
if (h2 === null) {
continue;
}
const a = await h2.$("a");
const parsable = h2 !== null && a !== null;
if (parsable) {
const title = await page.evaluate(element => element.textContent, h2);
if (title === lastTitle) {
continue;
}
console.log(`Title: ${title}`);
const productUrl = await page.evaluate(a => {
const url = a.getAttribute("href");
if (url.includes("https")) {
return url;
} else {
return `https://www.amazon.com${url}`;
}
}, a);
console.log(`Product url: ${productUrl}`);
const adStatus = productUrl.includes("sspa");
console.log(`Ad Status: ${adStatus}`);
const urlArray = productUrl.split("/");
const asin = urlArray[urlArray.length-2];
console.log(`Asin: ${asin}`);
const pricingUnit = await div.$("span.a-price-symbol");
const wholePrice = await div.$("span.a-price-whole");
const decimalPrice = await div.$("span.a-price-fraction");
if (pricingUnit === null || wholePrice === null || decimalPrice === null) {
console.log("Failed to find price!");
continue;
}
const priceSymbol = await page.evaluate(pricingUnit => pricingUnit.textContent, pricingUnit);
const wholeNumber = await page.evaluate(wholePrice => wholePrice.textContent, wholePrice);
const decimalNumber = await page.evaluate(decimalPrice => decimalPrice.textContent, decimalPrice);
const formattedWholeNumber = wholeNumber.replace(",", "").replace(".", "");
const price = Number(`${formattedWholeNumber}.${decimalNumber}`);
const realPricePresence = await div.$("span.a-price.a-text-price span");
let realPrice = 0.0;
if (realPricePresence !== null) {
const realPriceStr = await page.evaluate(realPricePresence => realPricePresence.textContent, realPricePresence);
realPrice = Number(realPriceStr.replace(priceSymbol, ""));
} else {
realPrice = price;
}
let rating = "n/a";
const ratingPresence = await div.$("span.a-icon-alt");
if (ratingPresence !== null) {
rating = await page.evaluate(ratingPresence => ratingPresence.textContent, ratingPresence);
}
const item = {
asin: asin,
title: title,
url: productUrl,
is_ad: adStatus,
pricing_unit: priceSymbol,
price: price,
real_price: realPrice,
rating: rating
}
await writeToCsv([item], `${productName}.csv`);
console.log("Item:", item);
lastTitle = title;
}
}
success = true;
} catch (err) {
console.log(`ERROR: ${err}, PAGE ${pageNumber}`);
tries++;
} finally {
await page.close();
if (success) {
console.log(`Finished scraping page: ${pageNumber}`);
}
}
}
}
function range(start, end) {
const array = [];
for (let i=start; i<end; i++) {
array.push(i);
}
return array;
}
async function concurrentCrawl(browser, query, pages, concurrencyLimit, location="us", retries=3) {
console.log("Concurrent crawl started");
const pageList = range(1, pages+1);
while (pageList.length > 0) {
const currentBatch = pageList.splice(0, concurrencyLimit);
const tasks = currentBatch.map(page => resultCrawl(browser, query, page, location, retries));
try {
await Promise.all(tasks);
} catch (e) {
console.log(`Failed to process batch: ${e}`);
}
}
console.log("Concurrent crawl finished");
}
async function parseProduct(browser, productObject, location="us", retries=3) {
const productUrl = productObject.url;
const proxyUrl = getScrapeOpsUrl(productUrl, location);
console.log("Proxy url:", proxyUrl);
let tries = 0;
let success = false;
const urlArray = productUrl.split("/");
const title = urlArray[urlArray.length-4];
const asin = urlArray[urlArray.length-2];
while (tries <= retries && !success) {
const page = await browser.newPage();
try {
await page.goto(proxyUrl, {timeout: 60000});
const imagesToSave = [];
const features = [];
const images = await page.$$("li img");
for (const image of images) {
const imageLink = await page.evaluate(element => element.getAttribute("src"), image);
if (imageLink.includes("https://m.media-amazon.com/images/I/")) {
imagesToSave.push(imageLink);
}
}
const featureBullets = await page.$$("li.a-spacing-mini");
for (const feature of featureBullets) {
const span = await feature.$("span");
const text = await page.evaluate(span => span.textContent, span);
if (!features.includes(text)) {
features.push(text);
}
}
const priceSymbolElement = await page.$("span.a-price-symbol");
const priceWholeElement = await page.$("span.a-price-whole");
const priceDecimalElement = await page.$("span.a-price-fraction");
const priceSymbol = await page.evaluate(element => element.textContent, priceSymbolElement);
const priceWhole = (await page.evaluate(element => element.textContent, priceWholeElement)).replace(",", "").replace(".", "");
const priceDecimal = await page.evaluate(element => element.textContent, priceDecimalElement);
const price = Number(`${priceWhole}.${priceDecimal}`);
if (imagesToSave.length > 0) {
const item = {
asin: asin,
title: title,
url: productUrl,
pricing_unit: priceSymbol,
price: price,
feature_1: features[0],
feature_2: features[1],
feature_3: features[2],
feature_4: features[3],
images_1: imagesToSave[0],
images_2: imagesToSave[1],
images_3: imagesToSave[2],
images_4: imagesToSave[3]
}
await writeToCsv([item], `${item.title}.csv`);
console.log("Wrote to csv");
success = true;
} else {
await page.screenshot({path: `ERROR-${title}.png`});
throw new Error("Failed to find item details!");
}
} catch (e) {
console.log("ERROR:", e);
await page.screenshot({path: "error.png", fullPage: true});
console.log(`Failed page, Tries left: ${retries-tries}`);
tries++;
} finally {
await page.close();
}
}
return;
}
async function concurrentProductScrape(browser, inputFile, concurrencyLimit, location="us", retries=3) {
const productObjects = await readCsv(inputFile);
while (productObjects.length > 0) {
const currentBatch = productObjects.splice(0, concurrencyLimit);
const tasks = currentBatch.map(productObject => parseProduct(browser, productObject, location, retries));
try {
await Promise.all(tasks);
} catch (e) {
console.log("Failed to process batch");
}
}
}
async function main() {
const PRODUCTS = ["phone"];
const MAX_RETRIES = 4;
const PAGES = 5;
const CONCURRENCY_LIMIT = 4;
const LOCATION = "us";
for (const product of PRODUCTS) {
const browser = await puppeteer.launch();
const fileName = `./${product}.csv`;
await concurrentCrawl(browser, product, PAGES, CONCURRENCY_LIMIT, LOCATION, MAX_RETRIES);
await concurrentProductScrape(browser, fileName, CONCURRENCY_LIMIT, LOCATION, MAX_RETRIES);
await browser.close();
}
}
main();
Step 6: Production Run
Time for the production run. I'm going to set PAGES to 5 and time the operation from start to finish. If you'd like different results, feel free to change any of the constants in main().
async function main() {
const PRODUCTS = ["phone"];
const MAX_RETRIES = 4;
const PAGES = 5;
const CONCURRENCY_LIMIT = 4;
const LOCATION = "us";
for (const product of PRODUCTS) {
const browser = await puppeteer.launch();
const fileName = `./${product}.csv`;
await concurrentCrawl(browser, product, PAGES, CONCURRENCY_LIMIT, LOCATION, MAX_RETRIES);
await concurrentProductScrape(browser, fileName, CONCURRENCY_LIMIT, LOCATION, MAX_RETRIES);
await browser.close();
}
}
If you look at the image below, our full run took 4 minutes and 54 seconds. That covers generating the initial report and parsing 85 items individually. As the quick math below shows, our scraper handled just over 17 items per minute, far faster than a typical human could manage!
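For reference, here is the throughput calculation (the run time and item count come from this production run; your numbers will vary):
// Throughput for this run: 85 products parsed in 4 minutes and 54 seconds.
const itemsParsed = 85;
const totalSeconds = 4 * 60 + 54;
const itemsPerMinute = itemsParsed / (totalSeconds / 60);
console.log(itemsPerMinute.toFixed(1)); // ≈ 17.3 items per minute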
Legal and Ethical Considerations
When scraping anything on the web, it's always important to respect the robots.txt and Terms of Service of the site you're scraping. For Amazon, you can check the links below.
Keep in mind that if you violate a site's terms, it can suspend or even delete your account, and you should never expose private data that you come across while scraping.
Public data (data that is not gated behind a login) is generally fair game to scrape. If the data you want sits behind a login or is otherwise private, always check the site's terms before building your scraper to avoid legal trouble.
If you are concerned about the legality of your scraping project, it is best to consult an attorney.
Conclusion
You now know how to build both a results crawler and a product scraper. You also have a decent understanding of how to use Puppeteer, work with a proxy, and harness async/await and Promise.all() for powerful concurrency. Take all these new skills and go build something cool!
Check the links below to learn more about the tech stack used in this article.
More Web Scraping Guides
Wanna level up your scraping skills? Here at ScrapeOps, we have a ton of learning resources. Check out our Puppeteer Web Scraping Playbook or take a look at some of the articles below: