I had ChatGPT generate this scraper for the Gambody website because I found a pack on the Internet Archive and wanted to upload it to a couple of select trackers. I am pretty happy with how it turned out.
ChatGPT's description:
This script scrapes a Gambody model page, extracting its title, images, tags, and description. It cleans the title (removing unwanted phrases) and creates a main folder, plus a dot-separated subfolder for the model files (used to generate a torrent with a separate script) and an images subfolder. It downloads images whose URLs contain "980x500" (skipping .webp files), saves the tags to a text file, and converts the description from HTML to BBCode, removing unwanted sections, before saving it with a clickable "Original Source" link.
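For a hypothetical model titled "Hulk Figurine", the script ends up producing a layout like this:

Hulk Figurine/
    Hulk.Figurine/    (empty; holds the model files when generating the torrent)
    images/           (the downloaded 980x500 images)
    tags.txt
    description.txt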
Create an environment file named .env.gambody for the imgbb API key:
IMGBB_API_KEY=your_actual_api_key_here

The script itself:

import os
import sys
import re
import requests
import base64
from bs4 import BeautifulSoup
def load_env(filepath):
    """Load environment variables from a file in KEY=VALUE format."""
    env_vars = {}
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                # Skip blank lines and comments
                if not line or line.startswith('#'):
                    continue
                key, sep, value = line.partition('=')
                if sep:
                    env_vars[key.strip()] = value.strip().strip('"').strip("'")
    except Exception as e:
        print(f"Error loading env file: {e}")
    return env_vars
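# Example: with the .env.gambody file above, load_env(".env.gambody")
# returns {"IMGBB_API_KEY": "your_actual_api_key_here"}.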
# Load the imgbb API key from .env.gambody
env_vars = load_env(".env.gambody")
IMGBB_API_KEY = env_vars.get("IMGBB_API_KEY")
if not IMGBB_API_KEY:
    print("Error: IMGBB_API_KEY not found in .env.gambody")
    sys.exit(1)
def html_to_bbcode(html):
    """
    Converts a block of HTML to BBCode.
    This simple converter replaces common tags. You can extend it as needed.
    """
    bb = html
    # Replace <br> with newlines
    bb = re.sub(r'<br\s*/?>', '\n', bb, flags=re.IGNORECASE)
    # Replace paragraph tags with line breaks
    bb = re.sub(r'<p[^>]*>', '', bb, flags=re.IGNORECASE)
    bb = re.sub(r'</p>', '\n\n', bb, flags=re.IGNORECASE)
    # Replace bold tags (<strong> and <b>)
    bb = re.sub(r'<strong[^>]*>', '[b]', bb, flags=re.IGNORECASE)
    bb = re.sub(r'</strong>', '[/b]', bb, flags=re.IGNORECASE)
    bb = re.sub(r'<b[^>]*>', '[b]', bb, flags=re.IGNORECASE)
    bb = re.sub(r'</b>', '[/b]', bb, flags=re.IGNORECASE)
    # Replace italic tags (<em> and <i>)
    bb = re.sub(r'<em[^>]*>', '[i]', bb, flags=re.IGNORECASE)
    bb = re.sub(r'</em>', '[/i]', bb, flags=re.IGNORECASE)
    bb = re.sub(r'<i[^>]*>', '[i]', bb, flags=re.IGNORECASE)
    bb = re.sub(r'</i>', '[/i]', bb, flags=re.IGNORECASE)
    # Replace underline tags
    bb = re.sub(r'<u[^>]*>', '[u]', bb, flags=re.IGNORECASE)
    bb = re.sub(r'</u>', '[/u]', bb, flags=re.IGNORECASE)
    # Convert anchor tags to BBCode: <a href="...">text</a> -> [url=...]text[/url]
    bb = re.sub(r'<a\s+href="([^"]+)"[^>]*>(.*?)</a>', r'[url=\1]\2[/url]', bb, flags=re.IGNORECASE | re.DOTALL)
    # Replace unordered list tags with [list] and list items with [*]
    bb = re.sub(r'<ul[^>]*>', '[list]\n', bb, flags=re.IGNORECASE)
    bb = re.sub(r'</ul>', '\n[/list]', bb, flags=re.IGNORECASE)
    bb = re.sub(r'<ol[^>]*>', '[list=1]\n', bb, flags=re.IGNORECASE)
    bb = re.sub(r'</ol>', '\n[/list]', bb, flags=re.IGNORECASE)
    bb = re.sub(r'<li[^>]*>', '[*]', bb, flags=re.IGNORECASE)
    bb = re.sub(r'</li>', '\n', bb, flags=re.IGNORECASE)
    # Remove any other remaining HTML tags
    bb = re.sub(r'<[^>]+>', '', bb)
    # Normalize newlines and return
    bb = re.sub(r'\n\s*\n', '\n\n', bb)
    return bb.strip()
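# Illustrative example: html_to_bbcode('<p>Scale: <b>1:12</b></p>')
# returns 'Scale: [b]1:12[/b]'.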
def remove_phrases(text, phrases):
    """
    Remove all occurrences of any phrase in the list from the given text.
    Matching is case-insensitive.
    """
    for phrase in phrases:
        text = re.sub(re.escape(phrase), '', text, flags=re.IGNORECASE)
    return text.strip()
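# Illustrative example:
# remove_phrases("Hulk 3D Printing Model", ["3D Printing Model"]) returns "Hulk".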
def upload_image_to_imgbb(image_path, api_key):
    """
    Uploads an image at image_path to imgbb using the given API key.
    Returns the URL of the uploaded image if successful, or None otherwise.
    """
    upload_url = "https://api.imgbb.com/1/upload"
    try:
        with open(image_path, "rb") as f:
            image_data = f.read()
        encoded_data = base64.b64encode(image_data).decode("ascii")
        payload = {
            "key": api_key,
            "image": encoded_data
        }
        response = requests.post(upload_url, data=payload, timeout=60)
        if response.status_code == 200:
            json_resp = response.json()
            if json_resp.get("success"):
                return json_resp["data"]["url"]
            else:
                print(f"imgbb upload failed: {json_resp.get('error', {}).get('message')}")
        else:
            print(f"imgbb upload HTTP error: {response.status_code}")
    except Exception as e:
        print(f"Exception during imgbb upload: {e}")
    return None
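# On success this returns the display URL from the JSON response ("data" ->
# "url"); in practice that is an i.ibb.co link, though the exact shape is up
# to the API.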
def main():
    # Ask the user for a Gambody URL
    url = input("Enter Gambody URL: ").strip()
    if not url:
        print("No URL provided.")
        return

    # Fetch the page
    try:
        response = requests.get(url, timeout=30)
    except Exception as e:
        print(f"Error fetching URL: {e}")
        return
    if response.status_code != 200:
        print(f"Error: received status code {response.status_code}")
        return
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the title from the <h1> tag
    h1_tag = soup.find('h1')
    h1_text = h1_tag.get_text(strip=True) if h1_tag else "untitled"
    folder_name = h1_text.split('|')[0].strip()

    # Define phrases to remove
    phrases_to_remove = [
        "3D Printing Figurine in Diorama",
        "3D Printer Files",
        "3D Printing Figurine",
        "3D Printing Model",
        "for 3D Printing"
    ]
    folder_name = remove_phrases(folder_name, phrases_to_remove)
    if not folder_name:
        folder_name = "untitled"

    # Create the main folder if it doesn't exist
    os.makedirs(folder_name, exist_ok=True)

    # Create an images subfolder within the main folder
    images_folder = os.path.join(folder_name, "images")
    os.makedirs(images_folder, exist_ok=True)

    # Create an additional folder with a dot-separated title inside the main folder
    dot_folder_name = re.sub(r'\s+', '.', folder_name)
    dot_folder_path = os.path.join(folder_name, dot_folder_name)
    os.makedirs(dot_folder_path, exist_ok=True)
    print(f"Created additional folder: {dot_folder_path}")
    print(f"Saving files to folder: {folder_name}")
    print(f"Images will be saved to: {images_folder}")
    # Download images that include "980x500" in their filename (skipping .webp) into the images folder
    downloaded_images = []
    images = soup.find_all('img')
    for i, img in enumerate(images):
        src = img.get('src')
        if not src:
            continue
        img_url = requests.compat.urljoin(url, src)
        if img_url.lower().endswith('.webp'):
            print(f"Skipping .webp image: {img_url}")
            continue
        if "980x500" not in img_url:
            print(f"Skipping image (does not contain '980x500'): {img_url}")
            continue
        try:
            img_data = requests.get(img_url, timeout=30).content
            filename = os.path.basename(img_url.split('?')[0])
            if not filename:
                filename = f"image_{i}.jpg"
            if not os.path.splitext(filename)[1]:
                filename += ".jpg"
            filepath = os.path.join(images_folder, filename)
            with open(filepath, 'wb') as f:
                f.write(img_data)
            downloaded_images.append(filepath)
            print(f"Downloaded image: {filename}")
        except Exception as e:
            print(f"Error downloading image {img_url}: {e}")
    # Upload downloaded images to imgbb and collect their URLs
    imgbb_urls = []
    for image_path in downloaded_images:
        print(f"Uploading {image_path} to imgbb...")
        uploaded_url = upload_image_to_imgbb(image_path, IMGBB_API_KEY)
        if uploaded_url:
            imgbb_urls.append(uploaded_url)
            print(f"Uploaded to imgbb: {uploaded_url}")
        else:
            print(f"Failed to upload {image_path}")

    # Create tags.txt from the <p class="cloudTags"> element
    tag_paragraph = soup.find('p', class_='cloudTags')
    tags = []
    if tag_paragraph:
        tag_links = tag_paragraph.find_all('a')
        if tag_links:
            tags = [a.get_text(strip=True) for a in tag_links]
        else:
            tags = tag_paragraph.get_text(strip=True).split()
    tags_file = os.path.join(folder_name, "tags.txt")
    with open(tags_file, 'w', encoding='utf-8') as f:
        f.write(", ".join(tags))
    print(f"Saved tags to {tags_file}")
    # Process the description
    description_div = soup.find('div', class_='tab-content', id='description-block')
    if description_div:
        # Drop the promotional paragraph pointing back to the shop
        for p in description_div.find_all('p'):
            if "you can get the figurine" in p.get_text().lower():
                p.decompose()
        description_html = str(description_div)
        # Cut everything from the "FAQ:" section onward
        faq_index = description_html.find("FAQ:")
        if faq_index != -1:
            description_html = description_html[:faq_index]
    else:
        description_html = "No description available."
    for phrase in phrases_to_remove:
        description_html = re.sub(re.escape(phrase), '', description_html, flags=re.IGNORECASE)
    bbcode_description = html_to_bbcode(description_html)
    # Strip horizontal-rule-style runs of underscores
    bbcode_description = re.sub(r'\n?\s*_{3,}\s*\n?', '\n', bbcode_description)

    # Prepare the header with the title in size 32, centered
    header = f"[center][size=32][b]{folder_name}[/b][/size][/center]\nOriginal Source: [url={url}]{url}[/url]\n\n"
    # Build the images header (centered) outside the spoiler
    images_header = "[center]---------------------- [size=22]Images[/size] ----------------------[/center]\n"
    # Build image BBCode tags for each imgbb URL
    img_bbcode = "\n".join(f"[img]{img_url}[/img]" for img_url in imgbb_urls)
    # Create the spoiler block for the images
    spoiler_block = f"[spoiler]\n{img_bbcode}\n[/spoiler]\n"
    # Append the images header (outside the spoiler) and the spoiler block to the description
    description_content = header + bbcode_description + "\n" + images_header + spoiler_block
    description_file = os.path.join(folder_name, "description.txt")
    with open(description_file, 'w', encoding='utf-8') as f:
        f.write(description_content)
    print(f"Saved description to {description_file}")
if __name__ == "__main__":
    main()
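To use it, put the script next to .env.gambody and run it; it asks for a model URL. A session against a hypothetical page would look roughly like this (script name, URL, and filenames are illustrative):

python gambody_scraper.py
Enter Gambody URL: https://www.gambody.com/3d-models/hulk-figurine
Created additional folder: Hulk Figurine/Hulk.Figurine
Saving files to folder: Hulk Figurine
Images will be saved to: Hulk Figurine/images
Downloaded image: hulk-980x500.jpg
Uploading Hulk Figurine/images/hulk-980x500.jpg to imgbb...
Uploaded to imgbb: https://i.ibb.co/...
Saved tags to Hulk Figurine/tags.txt
Saved description to Hulk Figurine/description.txt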