Why Scrape Amazon?
Amazon is the world's largest e-commerce platform with millions of products across every category. Businesses scrape Amazon for:
- Price monitoring — Track competitor prices in real-time
- Product research — Analyze market trends and demand
- Review analysis — Understand customer sentiment
- Inventory tracking — Monitor stock availability
- Lead generation — Find sellers and manufacturers
⚠️ Important
Always scrape responsibly and respect Amazon's Terms of Service. This guide is for educational purposes and legitimate business use cases.
Step 1: Setting Up Your Environment
Install Required Libraries
# Create virtual environment
python -m venv amazon-scraper
source amazon-scraper/bin/activate # Linux/Mac
# amazon-scraper\Scripts\activate # Windows
# Install dependencies
pip install requests beautifulsoup4 lxml pandas fake-useragent
Project Structure
amazon-scraper/
├── scraper.py # Main scraper script
├── config.py # Proxy configuration
├── parsers.py # HTML parsing functions
├── data/ # Output folder
│ └── products.csv
└── requirements.txt
Step 2: Configure Proxies (Critical!)
Why Proxies Are Essential
Amazon blocks scrapers aggressively. Without proxies, you'll be blocked within 10-20 requests. Residential proxies have the highest success rate (95%+).
Proxy Configuration with Dexodata
# config.py
from urllib.parse import quote

# Dexodata gateway credentials. Replace the placeholders with your account values.
PROXY_CONFIG = {
    'host': 'proxy.dexodata.com',
    'port': 10000,
    'username': 'your_username',
    'password': 'your_password',
}


def get_proxy():
    """Return a requests-style ``proxies`` dict for the configured gateway.

    Returns:
        dict: ``{'http': url, 'https': url}`` where ``url`` embeds the
        credentials as ``http://user:pass@host:port``.
    """
    # URL-encode the credentials: a password containing '@', ':' or '/'
    # would otherwise corrupt the proxy URL.
    user = quote(PROXY_CONFIG['username'], safe='')
    password = quote(PROXY_CONFIG['password'], safe='')
    proxy_url = f"http://{user}:{password}@{PROXY_CONFIG['host']}:{PROXY_CONFIG['port']}"
    return {
        'http': proxy_url,
        'https': proxy_url,
    }
Rotating User Agents
from fake_useragent import UserAgent

# Single shared generator; each call to .random draws a fresh browser signature.
ua = UserAgent()


def get_headers():
    """Build browser-like request headers with a randomized User-Agent.

    Returns:
        dict: Header set mimicking a real browser; only the User-Agent
        varies between calls.
    """
    headers = {
        'User-Agent': ua.random,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
    }
    return headers
Step 3: Build the Scraper
Basic Product Scraper
# scraper.py
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
from config import get_proxy, get_headers
class AmazonScraper:
    """Scrapes Amazon product pages by ASIN and accumulates results.

    Relies on module-level helpers ``get_headers()``/``get_proxy()`` (from
    config.py) for rotation and on ``requests``/``BeautifulSoup`` for
    fetching and parsing.
    """

    def __init__(self):
        self.base_url = 'https://www.amazon.com'
        # Accumulates product dicts produced by parse_product().
        self.products = []

    def scrape_product(self, asin):
        """Scrape a single product by ASIN.

        Args:
            asin: 10-character Amazon product identifier.

        Returns:
            dict of parsed fields, or None on any HTTP/network failure.
        """
        url = f'{self.base_url}/dp/{asin}'
        try:
            response = requests.get(
                url,
                headers=get_headers(),
                proxies=get_proxy(),
                timeout=30
            )
            if response.status_code == 200:
                return self.parse_product(response.text, asin)
            else:
                print(f'Error {response.status_code} for {asin}')
                return None
        except Exception as e:
            # Best-effort scraping: log and move on rather than abort the run.
            print(f'Exception for {asin}: {e}')
            return None

    def parse_product(self, html, asin):
        """Extract product data from a product-page HTML document."""
        soup = BeautifulSoup(html, 'lxml')
        product = {
            'asin': asin,
            'title': self.get_title(soup),
            'price': self.get_price(soup),
            'rating': self.get_rating(soup),
            'reviews_count': self.get_reviews_count(soup),
            'availability': self.get_availability(soup),
        }
        return product

    def get_title(self, soup):
        """Return the product title, or None if the element is absent."""
        title = soup.select_one('#productTitle')
        return title.text.strip() if title else None

    def get_price(self, soup):
        """Return the displayed price string (e.g. '$19.99'), or None."""
        price = soup.select_one('.a-price .a-offscreen')
        return price.text.strip() if price else None

    def get_rating(self, soup):
        """Return the numeric rating as a string (e.g. '4.5'), or None."""
        rating = soup.select_one('#acrPopover')
        if rating:
            # Guard against a missing/empty title attribute: ''.split() is []
            # and indexing [0] on it would raise IndexError.
            parts = rating.get('title', '').split()
            return parts[0] if parts else None
        return None

    def get_reviews_count(self, soup):
        """Return the review count with thousands separators removed, or None."""
        count = soup.select_one('#acrCustomerReviewText')
        if count:
            # Same guard as get_rating: whitespace-only text would make
            # split()[0] raise.
            parts = count.text.split()
            return parts[0].replace(',', '') if parts else None
        return None

    def get_availability(self, soup):
        """Return the availability text (e.g. 'In Stock'), or None."""
        avail = soup.select_one('#availability span')
        return avail.text.strip() if avail else None
Step 4: Scrape Search Results
def scrape_search(self, keyword, max_pages=5):
    """Scrape product ASINs from search result pages.

    Args:
        keyword: Search term (URL-encoded before being placed in the query).
        max_pages: Number of result pages to walk, starting at page 1.

    Returns:
        list[str]: All ASINs found across the requested pages.
    """
    from urllib.parse import quote_plus

    all_asins = []
    for page in range(1, max_pages + 1):
        # quote_plus makes multi-word keywords safe in the query string.
        url = f'{self.base_url}/s?k={quote_plus(keyword)}&page={page}'
        try:
            response = requests.get(
                url,
                headers=get_headers(),
                proxies=get_proxy(),
                timeout=30
            )
        except Exception as e:
            # Match scrape_product's best-effort style: log and continue
            # instead of letting one bad proxy abort the whole search.
            print(f'Exception on page {page}: {e}')
            continue
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'lxml')
            asins = self.extract_asins(soup)
            all_asins.extend(asins)
            print(f'Page {page}: Found {len(asins)} products')
        # Random delay to avoid detection
        time.sleep(random.uniform(2, 5))
    return all_asins
def extract_asins(self, soup):
    """Collect the product ASINs present on a search results page.

    Args:
        soup: Parsed search-results document.

    Returns:
        list[str]: Every non-empty, 10-character ``data-asin`` value found.
    """
    # Amazon tags each result card with a data-asin attribute; the length
    # check drops blanks and placeholder values.
    candidates = (tag.get('data-asin') for tag in soup.select('[data-asin]'))
    return [code for code in candidates if code and len(code) == 10]
Step 5: Anti-Detection Techniques
🔄 Rotate Proxies
Use a pool of residential IPs and rotate them every few requests.
⏱️ Random Delays
Add 2-10 second delays between requests to mimic human behavior.
🎭 Rotate User Agents
Use different browser signatures for each request.
🍪 Handle Cookies
Maintain session cookies like a real browser would.
import random
import time
def smart_delay():
    """Sleep for a human-like randomized interval between requests.

    Draws 2-5 seconds, and roughly once in ten calls adds a longer
    5-15 second pause to break up the request rhythm.
    """
    delay = random.uniform(2, 5)
    if random.random() < 0.1:  # occasional long break, like a distracted shopper
        delay += random.uniform(5, 15)
    time.sleep(delay)
def handle_captcha(response):
    """Report whether a response looks like an Amazon CAPTCHA challenge.

    Args:
        response: Object exposing the page body as ``.text``.

    Returns:
        bool: True when the body mentions a CAPTCHA (caller should rotate
        the proxy), False otherwise.
    """
    is_captcha = 'captcha' in response.text.lower()
    if is_captcha:
        print('CAPTCHA detected! Switching proxy...')
    return is_captcha
Step 6: Save and Export Data
def save_to_csv(self, filename='data/products.csv'):
    """Save scraped products to CSV.

    Args:
        filename: Output path; parent directory is created if missing.
    """
    import os

    # The default path lives under data/, which may not exist on a fresh
    # checkout — create it instead of letting to_csv fail.
    os.makedirs(os.path.dirname(filename) or '.', exist_ok=True)
    df = pd.DataFrame(self.products)
    df.to_csv(filename, index=False)
    # Fixed: the message previously printed a literal placeholder instead
    # of the actual output path.
    print(f'Saved {len(self.products)} products to {filename}')
# Main execution
def main():
    """Run the full pipeline: search, scrape product details, export CSV."""
    scraper = AmazonScraper()
    # Collect ASINs from the first few search result pages.
    keyword = 'wireless headphones'
    asins = scraper.scrape_search(keyword, max_pages=3)
    # Fetch details for at most 50 products, pausing between requests.
    for asin in asins[:50]:
        product = scraper.scrape_product(asin)
        if product:
            scraper.products.append(product)
        smart_delay()
    # Persist everything that was collected.
    scraper.save_to_csv()


if __name__ == '__main__':
    main()
Best Practices Summary
Frequently Asked Questions
Is it legal to scrape Amazon?
Scraping publicly available data from Amazon is generally legal, but you must respect their Terms of Service and robots.txt. Use the data for legitimate purposes like price monitoring and market research, and avoid collecting personal data without consent.
Why do I need proxies to scrape Amazon?
Amazon has sophisticated anti-bot protection that detects and blocks scrapers. Without proxies, your IP will be blocked after just a few requests. Residential proxies are most effective as they appear as regular shoppers.
What data can I scrape from Amazon?
You can scrape product titles, prices, descriptions, images, reviews, ratings, seller information, Best Seller Rank (BSR), stock availability, and more. This data is valuable for competitive analysis and pricing optimization.
How often should I scrape Amazon prices?
For dynamic pricing, daily scraping is common. For competitive monitoring, 2-3 times per week may be sufficient. During sales events like Prime Day or Black Friday, hourly monitoring is recommended.
Need Reliable Proxies for Amazon Scraping?
Dexodata offers ethically-sourced residential proxies with 95%+ success rate on Amazon.