I had ChatGPT generate this scraper for the Gambody website because I found a pack on the Internet Archive and wanted to upload it to a couple of select trackers. I am pretty happy with how it turned out.
ChatGPT's description:
This script scrapes a Gambody model page, extracting its title, images, tags, and description. It cleans the title (removing unwanted phrases) and creates a main folder, plus a dot-separated subfolder for the model files (used to generate a torrent with a separate script) and an images subfolder. It downloads images whose URLs contain "980x500" (skipping .webp files), saves the tags to a text file, and converts the description from HTML to BBCode, removing unwanted sections, before saving it with a clickable "Original Source" link.
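For a hypothetical model titled "Hulk Figurine", the script ends up producing a layout like this:

Hulk Figurine/
    Hulk.Figurine/    (empty; holds the model files when generating the torrent)
    images/           (the downloaded 980x500 images)
    tags.txt
    description.txt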
Create an environment file named .env.gambody for the imgbb API key:
IMGBB_API_KEY=your_actual_api_key_here

The script itself:

import os
import sys
import re
import requests
import base64
from bs4 import BeautifulSoup
def load_env(filepath):
    """Load environment variables from a file in KEY=VALUE format."""
    env_vars = {}
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                # Skip blank lines and comments
                if not line or line.startswith('#'):
                    continue
                key, sep, value = line.partition('=')
                if sep:
                    env_vars[key.strip()] = value.strip().strip('"').strip("'")
    except Exception as e:
        print(f"Error loading env file: {e}")
    return env_vars
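# Example: with the .env.gambody file above, load_env(".env.gambody")
# returns {"IMGBB_API_KEY": "your_actual_api_key_here"}.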
# Load the imgbb API key from .env.gambody
env_vars = load_env(".env.gambody")
IMGBB_API_KEY = env_vars.get("IMGBB_API_KEY")
if not IMGBB_API_KEY:
    print("Error: IMGBB_API_KEY not found in .env.gambody")
    sys.exit(1)
def html_to_bbcode(html):
    """
    Converts a block of HTML to BBCode.
    This simple converter replaces common tags. You can extend it as needed.
    """
    bb = html
    # Replace <br> with newlines
    bb = re.sub(r'<br\s*/?>', '\n', bb, flags=re.IGNORECASE)
    # Replace paragraph tags with line breaks
    bb = re.sub(r'<p[^>]*>', '', bb, flags=re.IGNORECASE)
    bb = re.sub(r'</p>', '\n\n', bb, flags=re.IGNORECASE)
    # Replace bold tags (<strong> and <b>)
    bb = re.sub(r'<strong[^>]*>', '[b]', bb, flags=re.IGNORECASE)
    bb = re.sub(r'</strong>', '[/b]', bb, flags=re.IGNORECASE)
    bb = re.sub(r'<b[^>]*>', '[b]', bb, flags=re.IGNORECASE)
    bb = re.sub(r'</b>', '[/b]', bb, flags=re.IGNORECASE)
    # Replace italic tags (<em> and <i>)
    bb = re.sub(r'<em[^>]*>', '[i]', bb, flags=re.IGNORECASE)
    bb = re.sub(r'</em>', '[/i]', bb, flags=re.IGNORECASE)
    bb = re.sub(r'<i[^>]*>', '[i]', bb, flags=re.IGNORECASE)
    bb = re.sub(r'</i>', '[/i]', bb, flags=re.IGNORECASE)
    # Replace underline tags
    bb = re.sub(r'<u[^>]*>', '[u]', bb, flags=re.IGNORECASE)
    bb = re.sub(r'</u>', '[/u]', bb, flags=re.IGNORECASE)
    # Convert anchor tags to BBCode: <a href="...">text</a> -> [url=...]text[/url]
    bb = re.sub(r'<a\s+href="([^"]+)"[^>]*>(.*?)</a>', r'[url=\1]\2[/url]', bb, flags=re.IGNORECASE | re.DOTALL)
    # Replace unordered list tags with [list] and list items with [*]
    bb = re.sub(r'<ul[^>]*>', '[list]\n', bb, flags=re.IGNORECASE)
    bb = re.sub(r'</ul>', '\n[/list]', bb, flags=re.IGNORECASE)
    bb = re.sub(r'<ol[^>]*>', '[list=1]\n', bb, flags=re.IGNORECASE)
    bb = re.sub(r'</ol>', '\n[/list]', bb, flags=re.IGNORECASE)
    bb = re.sub(r'<li[^>]*>', '[*]', bb, flags=re.IGNORECASE)
    bb = re.sub(r'</li>', '\n', bb, flags=re.IGNORECASE)
    # Remove any other remaining HTML tags
    bb = re.sub(r'<[^>]+>', '', bb)
    # Normalize newlines and return
    bb = re.sub(r'\n\s*\n', '\n\n', bb)
    return bb.strip()
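# Illustrative example: html_to_bbcode('<p>Scale: <b>1:12</b></p>')
# returns 'Scale: [b]1:12[/b]'.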
def remove_phrases(text, phrases):
    """
    Remove all occurrences of any phrase in the list from the given text.
    Matching is case-insensitive.
    """
    for phrase in phrases:
        text = re.sub(re.escape(phrase), '', text, flags=re.IGNORECASE)
    return text.strip()
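# Illustrative example:
# remove_phrases("Hulk 3D Printing Model", ["3D Printing Model"]) returns "Hulk".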
def upload_image_to_imgbb(image_path, api_key):
    """
    Uploads an image at image_path to imgbb using the given API key.
    Returns the URL of the uploaded image if successful, or None otherwise.
    """
    upload_url = "https://api.imgbb.com/1/upload"
    try:
        with open(image_path, "rb") as f:
            image_data = f.read()
        encoded_data = base64.b64encode(image_data).decode("ascii")
        payload = {
            "key": api_key,
            "image": encoded_data
        }
        response = requests.post(upload_url, data=payload, timeout=60)
        if response.status_code == 200:
            json_resp = response.json()
            if json_resp.get("success"):
                return json_resp["data"]["url"]
            else:
                print(f"imgbb upload failed: {json_resp.get('error', {}).get('message')}")
        else:
            print(f"imgbb upload HTTP error: {response.status_code}")
    except Exception as e:
        print(f"Exception during imgbb upload: {e}")
    return None
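# On success this returns the display URL from the JSON response ("data" ->
# "url"); in practice that is an i.ibb.co link, though the exact shape is up
# to the API.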
def main():
    # Ask the user for a Gambody URL
    url = input("Enter Gambody URL: ").strip()
    if not url:
        print("No URL provided.")
        return

    # Fetch the page
    try:
        response = requests.get(url, timeout=30)
    except Exception as e:
        print(f"Error fetching URL: {e}")
        return
    if response.status_code != 200:
        print(f"Error: received status code {response.status_code}")
        return
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the title from the <h1> tag
    h1_tag = soup.find('h1')
    h1_text = h1_tag.get_text(strip=True) if h1_tag else "untitled"
    folder_name = h1_text.split('|')[0].strip()

    # Define phrases to remove
    phrases_to_remove = [
        "3D Printing Figurine in Diorama",
        "3D Printer Files",
        "3D Printing Figurine",
        "3D Printing Model",
        "for 3D Printing"
    ]
    folder_name = remove_phrases(folder_name, phrases_to_remove)
    if not folder_name:
        folder_name = "untitled"

    # Create the main folder if it doesn't exist
    os.makedirs(folder_name, exist_ok=True)

    # Create an images subfolder within the main folder
    images_folder = os.path.join(folder_name, "images")
    os.makedirs(images_folder, exist_ok=True)

    # Create an additional folder with a dot-separated title inside the main folder
    dot_folder_name = re.sub(r'\s+', '.', folder_name)
    dot_folder_path = os.path.join(folder_name, dot_folder_name)
    os.makedirs(dot_folder_path, exist_ok=True)
    print(f"Created additional folder: {dot_folder_path}")
    print(f"Saving files to folder: {folder_name}")
    print(f"Images will be saved to: {images_folder}")
    # Download images that include "980x500" in their filename (skipping .webp) into the images folder
    downloaded_images = []
    images = soup.find_all('img')
    for i, img in enumerate(images):
        src = img.get('src')
        if not src:
            continue
        img_url = requests.compat.urljoin(url, src)
        if img_url.lower().endswith('.webp'):
            print(f"Skipping .webp image: {img_url}")
            continue
        if "980x500" not in img_url:
            print(f"Skipping image (does not contain '980x500'): {img_url}")
            continue
        try:
            img_data = requests.get(img_url, timeout=30).content
            filename = os.path.basename(img_url.split('?')[0])
            if not filename:
                filename = f"image_{i}.jpg"
            if not os.path.splitext(filename)[1]:
                filename += ".jpg"
            filepath = os.path.join(images_folder, filename)
            with open(filepath, 'wb') as f:
                f.write(img_data)
            downloaded_images.append(filepath)
            print(f"Downloaded image: {filename}")
        except Exception as e:
            print(f"Error downloading image {img_url}: {e}")
    # Upload downloaded images to imgbb and collect their URLs
    imgbb_urls = []
    for image_path in downloaded_images:
        print(f"Uploading {image_path} to imgbb...")
        uploaded_url = upload_image_to_imgbb(image_path, IMGBB_API_KEY)
        if uploaded_url:
            imgbb_urls.append(uploaded_url)
            print(f"Uploaded to imgbb: {uploaded_url}")
        else:
            print(f"Failed to upload {image_path}")

    # Create tags.txt from the <p class="cloudTags"> element
    tag_paragraph = soup.find('p', class_='cloudTags')
    tags = []
    if tag_paragraph:
        tag_links = tag_paragraph.find_all('a')
        if tag_links:
            tags = [a.get_text(strip=True) for a in tag_links]
        else:
            tags = tag_paragraph.get_text(strip=True).split()
    tags_file = os.path.join(folder_name, "tags.txt")
    with open(tags_file, 'w', encoding='utf-8') as f:
        f.write(", ".join(tags))
    print(f"Saved tags to {tags_file}")
    # Process the description
    description_div = soup.find('div', class_='tab-content', id='description-block')
    if description_div:
        # Drop the promotional paragraph pointing back to the shop
        for p in description_div.find_all('p'):
            if "you can get the figurine" in p.get_text().lower():
                p.decompose()
        description_html = str(description_div)
        # Cut everything from the "FAQ:" section onward
        faq_index = description_html.find("FAQ:")
        if faq_index != -1:
            description_html = description_html[:faq_index]
    else:
        description_html = "No description available."
    for phrase in phrases_to_remove:
        description_html = re.sub(re.escape(phrase), '', description_html, flags=re.IGNORECASE)
    bbcode_description = html_to_bbcode(description_html)
    # Strip horizontal-rule-style runs of underscores
    bbcode_description = re.sub(r'\n?\s*_{3,}\s*\n?', '\n', bbcode_description)

    # Prepare the header with the title in size 32, centered
    header = f"[center][size=32][b]{folder_name}[/b][/size][/center]\nOriginal Source: [url={url}]{url}[/url]\n\n"
    # Build the images header (centered) outside the spoiler
    images_header = "[center]---------------------- [size=22]Images[/size] ----------------------[/center]\n"
    # Build image BBCode tags for each imgbb URL
    img_bbcode = "\n".join(f"[img]{img_url}[/img]" for img_url in imgbb_urls)
    # Create the spoiler block for the images
    spoiler_block = f"[spoiler]\n{img_bbcode}\n[/spoiler]\n"
    # Append the images header (outside the spoiler) and the spoiler block to the description
    description_content = header + bbcode_description + "\n" + images_header + spoiler_block
    description_file = os.path.join(folder_name, "description.txt")
    with open(description_file, 'w', encoding='utf-8') as f:
        f.write(description_content)
    print(f"Saved description to {description_file}")
if __name__ == "__main__":
    main()
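To use it, put the script next to .env.gambody and run it; it asks for a model URL. A session against a hypothetical page would look roughly like this (script name, URL, and filenames are illustrative):

python gambody_scraper.py
Enter Gambody URL: https://www.gambody.com/3d-models/hulk-figurine
Created additional folder: Hulk Figurine/Hulk.Figurine
Saving files to folder: Hulk Figurine
Images will be saved to: Hulk Figurine/images
Downloaded image: hulk-980x500.jpg
Uploading Hulk Figurine/images/hulk-980x500.jpg to imgbb...
Uploaded to imgbb: https://i.ibb.co/...
Saved tags to Hulk Figurine/tags.txt
Saved description to Hulk Figurine/description.txt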