This Python script searches Wikipedia for an Apple product by name, extracts key specifications like dimensions, weight, and camera details from the infobox, and formats the data for easy reading, with special handling for model-specific variations like “Pro” and “Pro Max.”
This script relies on four Python libraries: requests fetches data from Wikipedia’s API, BeautifulSoup (from bs4) parses the HTML content to extract infobox details, sys handles command-line arguments for the product name input, and re provides regular expression support to clean up text by removing citation markers and extra whitespace. To install the external libraries, use pip: run pip install requests beautifulsoup4 in your terminal or command prompt, while sys and re are part of Python’s standard library and require no additional installation.
The full script follows below.
import requests
from bs4 import BeautifulSoup
import sys
import re
# Infobox row labels (the <th> text) to display, printed in this order.
desired_fields = [
    "First released",
    "Dimensions",
    "Weight",
    "Operating system",
    "System-on-chip",
    "Memory",
    "Storage",
    "Battery",
    "Rear camera",
    "Front camera",
    "Display",
]
# Fields whose single infobox cell combines several models (e.g. "Pro" vs.
# "Pro Max") and is therefore split into one line per model before printing.
model_specific_fields = ["Dimensions", "Weight", "Battery", "Display"]
def search_wikipedia(query):
    """Search Wikipedia for *query* and return the title of the first hit.

    Args:
        query: Free-text search string passed to the MediaWiki search API.

    Returns:
        The title of the best-matching article, or None when the search
        yields no results.

    Raises:
        requests.HTTPError: If the API responds with an error status.
        requests.Timeout: If the server does not answer within 10 seconds.
    """
    search_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "list": "search",
        "srsearch": query,
        "format": "json",
    }
    # A timeout keeps the script from hanging forever on a stalled
    # connection; raise_for_status surfaces HTTP-level failures early.
    response = requests.get(search_url, params=params, timeout=10)
    response.raise_for_status()
    data = response.json()
    # Use .get() chaining so an unexpected response shape degrades to
    # "not found" instead of a KeyError.
    results = data.get("query", {}).get("search", [])
    if results:
        return results[0]["title"]
    return None
def get_page_html(title):
    """Fetch the rendered HTML body of the Wikipedia article *title*.

    Args:
        title: Exact article title (as returned by search_wikipedia).

    Returns:
        The article body HTML as a string.

    Raises:
        requests.HTTPError: If the API responds with an error status.
        requests.Timeout: If the server does not answer within 10 seconds.
        KeyError: If the API reply contains no parsed page (e.g. the
            title does not exist).
    """
    parse_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        # Follow redirects so variant titles resolve to the real article.
        "redirects": 1,
    }
    response = requests.get(parse_url, params=params, timeout=10)
    response.raise_for_status()
    data = response.json()
    # "*" is the MediaWiki key holding the rendered HTML payload.
    return data["parse"]["text"]["*"]
def extract_infobox(html):
    """Extract label/value pairs from the first infobox table in *html*.

    Args:
        html: Rendered article HTML (as returned by get_page_html).

    Returns:
        A dict mapping each row's header text to its cell text, or None
        when the page has no infobox table.
    """
    soup = BeautifulSoup(html, "html.parser")
    infobox = soup.find("table", {"class": "infobox"})
    if infobox is None:
        return None
    info = {}
    for row in infobox.find_all("tr"):
        header = row.find("th")
        cell = row.find("td")
        if header and cell:
            key = header.text.strip()
            # get_text with a separator keeps values readable: <br>- and
            # <li>-separated entries would otherwise be glued together
            # without any spaces between them.
            info[key] = cell.get_text(" ", strip=True)
    return info
def clean_text(text):
    """Remove citation markers like "[1]" and normalize whitespace.

    Args:
        text: Raw text scraped from a Wikipedia infobox cell.

    Returns:
        The cleaned string with citations dropped and all runs of
        whitespace (spaces, tabs, newlines) collapsed to single spaces.
    """
    # Drop numeric citation footnotes such as "[12]".
    text = re.sub(r'\[\d+\]', '', text)
    # The original only stripped the ends; scraped cells frequently contain
    # internal newlines and doubled spaces, so collapse those too.
    return re.sub(r'\s+', ' ', text).strip()
def format_model_values(value):
    """Split a combined model-specific value onto one line per model.

    Args:
        value: Raw infobox cell text that may contain values for both a
            "Pro" and a "Pro Max" model (or "16 Pro" / "16 Pro Max").

    Returns:
        A multi-line bullet string when both model markers are present,
        otherwise the cleaned value unchanged.
    """
    # Check the more specific "16 Pro Max:" marker FIRST: "Pro Max:" is a
    # substring of "16 Pro Max:", so testing the generic marker first made
    # the "16 Pro" branch unreachable in the original code.
    if "16 Pro Max:" in value:
        pro_part, pro_max_part = value.split("16 Pro Max:", 1)
        pro_value = clean_text(pro_part.replace("16 Pro: ", "", 1).strip())
        pro_max_value = clean_text(pro_max_part.strip())
        return f"- 16 Pro: {pro_value}\n- 16 Pro Max: {pro_max_value}"
    if "Pro Max:" in value:
        pro_part, pro_max_part = value.split("Pro Max:", 1)
        pro_value = clean_text(pro_part.replace("Pro: ", "", 1).strip())
        pro_max_value = clean_text(pro_max_part.strip())
        return f"- Pro: {pro_value}\n- Pro Max: {pro_max_value}"
    return clean_text(value)
def main():
    """CLI entry point: look up an Apple product and print key specs."""
    if len(sys.argv) < 2:
        print("Usage: python3 apple_product_finder.py <product name>")
        sys.exit(1)

    product_name = " ".join(sys.argv[1:])
    # Append "Apple" to bias the search toward the product page.
    title = search_wikipedia(product_name + " Apple")
    if not title:
        print("No product found.")
        return

    infobox = extract_infobox(get_page_html(title))
    if not infobox:
        print("No infobox found for this product.")
        return

    print(f"Key information for {title}:")
    for field in desired_fields:
        if field not in infobox:
            continue
        raw_value = infobox[field]
        if field in model_specific_fields:
            print(f"{field}:\n{format_model_values(raw_value)}")
        else:
            print(f"{field}: {clean_text(raw_value)}")


if __name__ == "__main__":
    main()