Have you ever had to analyze a competitor to sharpen your own competitive edge? Where do you even start? One good place to start is web scraping!
Below is a hypothetical code example that uses our [web scraping API](https://proxyscrape.com/products/web-scraping-api/) to analyze a competitor's landing pages.
```python
import os
import time
import json
import aiohttp
import asyncio
from typing import List, Optional
import base64
import requests
from bs4 import BeautifulSoup

# Replace with your ProxyScrape API key
api_key = "Your API key"

# Function to handle relative URLs by adding the base URL
def handle_relative_url(base_url, url):
    if url.startswith("http"):
        return url
    else:
        return base_url + url

async def send_request_ASYNC(data) -> Optional[dict]:
    webscraping_api_endpoint = "https://api.proxyscrape.com/v3/accounts/freebies/scraperapi/request"
    headers = {
        "Content-Type": "application/json",
        "X-Api-Key": api_key
    }
    retries = 3
    for i in range(retries):
        try:
            async with aiohttp.ClientSession() as session:
                async with session.post(webscraping_api_endpoint, headers=headers, json=data[0]) as response:
                    if response.status == 200:
                        print(
                            'URL:', data[0]['url'],
                            'STATUS:', response.status,
                        )
                        response_output = {
                            'json': await response.json(),
                            'status_code': response.status,
                            'index': data[1],
                            'url': data[0]['url']  # store the page URL itself, not the whole request config
                        }
                        return response_output
                    else:
                        print(f"Failed with status code {response.status}, retrying...")
        except aiohttp.ClientError as e:
            print(f"Request failed: {e}, attempt {i + 1}/{retries}")
            if i < retries - 1:
                await asyncio.sleep(2 ** i)  # exponential back-off
            else:
                raise
    return None

def send_request(url, data, headers):
    retries = 3
    for i in range(retries):
        try:
            response = requests.post(url, headers=headers, json=data)
            if response.status_code == 200:
                return response
            else:
                print(f"Failed with status code {response.status_code}, retrying...")
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}, attempt {i + 1}/{retries}")
            if i < retries - 1:
                time.sleep(2 ** i)  # exponential back-off
            else:
                raise
    return None

# Function to scrape a single page using the ProxyScrape API
def scrape_page(url):
    print("Scraping and collecting urls from:", url)
    data = {
        "url": url,
        "httpResponseBody": True  # raw HTML, base64-encoded; switch to browserHtml when JavaScript rendering is needed
    }
    headers = {
        "Content-Type": "application/json",
        "X-Api-Key": api_key
    }
    response = send_request(
        "https://api.proxyscrape.com/v3/accounts/freebies/scraperapi/request",
        data,
        headers
    )
    if response is not None and response.status_code == 200:
        json_response = response.json()
        # Base64-decode json_response["data"]["httpResponseBody"] to get the HTML
        html = base64.b64decode(json_response["data"]["httpResponseBody"]).decode("utf-8")
        # Parse HTML content using BeautifulSoup
        soup = BeautifulSoup(html, "html.parser")
        # Extract URLs from the page (handling relative URLs)
        base_url = "https://iproyal.com"  # Base URL for iproyal
        urls_elements = soup.select(
            "a[class=\"group flex items-center gap-4 mt-auto astro-zgb246lv outlined-button\"]"
        )
        for url_element in urls_elements:
            yield handle_relative_url(base_url, url_element["href"])
        # Check for "Next" page button (handling relative URL)
        try:
            next_page_button = soup.find(
                "a",
                class_="outlined-button w-full text-size-[14px] px-8 sm:px-16 pagination-link",
                string="Next",
            )
            if next_page_button:
                yield handle_relative_url(base_url, next_page_button["href"])
        except AttributeError:
            pass
    else:
        print("Error:", response.status_code if response is not None else "no response")

# Function to extract data from a specific landing page and save to JSON
def extract_and_save_data(response: Optional[dict]):
    # A failed request returns None, so bail out before reading the payload
    if response is None:
        print("Error: no response received")
        return
    if response['status_code'] == 200:
        html = response['json']["data"]["browserHtml"]
        # Parse HTML content using BeautifulSoup
        soup = BeautifulSoup(html, "html.parser")
        meta_title = soup.find("meta", property="og:title")
        if meta_title:
            meta_title = meta_title["content"]
        else:
            meta_title = ""
        print(f"Meta title: {meta_title}")
        meta_description = soup.find("meta", property="og:description")
        if meta_description:
            meta_description = meta_description["content"]
        else:
            meta_description = ""
        print(f"Meta description: {meta_description}")
        brand = soup.find("a", class_="!text-brand-400")
        if brand:
            brand = brand.text
        else:
            brand = ""
        print(f"Brand: {brand}")
        h1 = soup.find("h1", class_="tp-headline-m lg:tp-headline-l lg:max-w-[600px]")
        if h1:
            h1 = h1.text
        else:
            h1 = ""
        print(f"H1: {h1}")
        jumbotron_text = soup.find("div", class_="lg:tp-subheadline")
        if jumbotron_text:
            jumbotron_text = jumbotron_text.text
        else:
            jumbotron_text = ""
        print(f"Jumbotron text: {jumbotron_text}")
        jumbotron_image = soup.find("img", class_="w-full h-full hidden lg:block")
        if not jumbotron_image:
            jumbotron_image = soup.find("img", class_="w-full h-full object-contain")
        if jumbotron_image:
            jumbotron_image = jumbotron_image['src']
        else:
            jumbotron_image = ""
        print(f"Jumbotron image: {jumbotron_image}")
        most_imports_question_element = soup.find(
            "fieldset",
            class_="flex flex-col gap-16 w-full lg:max-w-[522px] flex-shrink-0 astro-jipml36k"
        )
        if most_imports_question_element:
            most_imports_questions = most_imports_question_element.find_all("label")
            most_import_ques = [question.text for question in most_imports_questions]
        else:
            most_import_ques = []
        print(most_import_ques)
        most_imports_answer_element = soup.find("div", class_="contents astro-jipml36k")
        if most_imports_answer_element:
            most_imports_answers = most_imports_answer_element.find_all("section")
            most_import_ans = [answer.text.strip() for answer in most_imports_answers]
        else:
            most_import_ans = []
        print(most_import_ans)
        faq_question_elements = soup.find_all(
            "summary",
            class_="group-[&:not(:first-child)]:pt-8 pb-22 tp-headline-s flex justify-between appearance-none cursor-pointer astro-vqbstbga"
        )
        faq_ques = [question.text for question in faq_question_elements]
        print(faq_ques)
        faq_answer_elements = soup.find_all("div", class_="pb-16 astro-vqbstbga")
        faq_ans = [answer.text.strip() for answer in faq_answer_elements]
        print(faq_ans)
        json_data = {}
        json_data['meta_title'] = meta_title
        json_data['meta_description'] = meta_description
        json_data['url'] = response['url']
        json_data['h1'] = h1
        json_data['jumbotron_text'] = jumbotron_text
        json_data['jumbotron_image'] = jumbotron_image
        most_import_json = []
        for i in range(len(most_import_ans)):
            most_import_json.append({"title": most_import_ques[i], "description": most_import_ans[i]})
        faq_json = []
        for i in range(len(faq_ans)):
            faq_json.append({"question": faq_ques[i], "answer": faq_ans[i]})
        json_data['most_import'] = most_import_json
        json_data['faq'] = faq_json
        data = {}
        data[f'page{response["index"]}'] = json_data
        # Create a directory for JSON files if it doesn't exist
        os.makedirs("data", exist_ok=True)
        # Save data to a JSON file
        with open(f"data/page_{response['index']}.json", "w") as f:
            json.dump(data, f, indent=4)
    else:
        print("Error:", response['status_code'])

async def get_page_contents(properties) -> List[Optional[dict]]:
    tasks = [send_request_ASYNC(p) for p in properties]
    responses = await asyncio.gather(*tasks)
    return responses

if __name__ == '__main__':
    # Starting URL
    start_url = "https://iproyal.com/other-proxies/"
    scraped_urls = [start_url]
    pagination_url = start_url
    index = 1
    while pagination_url != '':
        urls_to_scrape_inside_page = []
        for extracted_url in scrape_page(pagination_url):
            if extracted_url not in scraped_urls:
                scraped_urls.append(extracted_url)  # WE USE THIS LIST TO MAKE SURE WE DON'T SCRAPE DUPLICATES
                # Check if it's a landing page and extract data
                if "/other-proxies/" in extracted_url:
                    if 'iproyal.com/other-proxies/?page=' in extracted_url:
                        pagination_url = extracted_url
                    else:
                        pagination_url = ''
                        PROXYSCRAPE_webscraping_api_config: dict = {
                            "url": extracted_url,
                            "browserHtml": True
                        }
                        urls_to_scrape_inside_page.append([PROXYSCRAPE_webscraping_api_config, index])
                        index += 1
        print('Extracting content of each link')
        content = asyncio.run(get_page_contents(urls_to_scrape_inside_page))
        print("Writing output files")
        for c in content:
            extract_and_save_data(c)
    print("Scraping and data extraction completed!")
```
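Once the script finishes, every landing page ends up as its own JSON file in the `data/` folder. If you want a quick sanity check of what was collected, here is a minimal sketch, assuming the script above has already run and written its files, that loads each JSON file and prints a few of the extracted fields:
```python
import json
from pathlib import Path

# Skim the JSON files written by the scraper above (assumes the "data" folder exists).
for path in sorted(Path("data").glob("page_*.json")):
    with open(path) as f:
        page = json.load(f)
    for page_key, details in page.items():
        print(f"{path.name} -> {page_key}")
        print("  Meta title:", details.get("meta_title", ""))
        print("  H1:", details.get("h1", ""))
        print("  FAQ entries:", len(details.get("faq", [])))
```
From there, you can pull the files into a spreadsheet or compare them against your own landing pages to see where your competitor's messaging and FAQ coverage differ from yours.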