Notebook 06 — MediaWiki Image Upload

Project: Linked Open Exhibition — NFDI4Culture / Hochschule Hannover (BIM-126-02)
AI attribution: GitHub Copilot (Claude Sonnet 4.6)
Depends on: - 03_dnb_cover_images.ipynb → catalogues/images/*.jpg - 05_wikibase_upload.ipynb → items exist in Wikibase - .env file with MW_URL, MW_USER, MW_PASSWORD

Purpose: Upload exhibition cover images to the MediaWiki instance and link each uploaded file back to its Wikibase item via the image property.


Background: mwclient

mwclient is a Python library for interacting with the MediaWiki API. It handles authentication, file uploads, and page edits.

File naming convention: Sprengel_Exhibition_{IDN}.jpg
Each uploaded file gets a description page with source, date fetched, and rights notice.

Cover image rights reminder

Cover images are publisher-supplied and are not CC0. They are uploaded here solely for use within the project Wikibase/MediaWiki instance for educational purposes. Do not redistribute.

import os, json, time
import pandas as pd
import mwclient
from pathlib import Path
from dotenv import load_dotenv
from datetime import date

# Read the MediaWiki connection settings from the .env file two directories up.
load_dotenv(Path("../../.env"))

MW_URL = os.getenv("MW_URL", "https://wikibase.wbworkshop.tibwiki.io")
MW_USER = os.getenv("MW_USER")
MW_PASS = os.getenv("MW_PASSWORD")

# The credentials have no fallback: stop when either one is missing or empty.
if not (MW_USER and MW_PASS):
    raise EnvironmentError("Set MW_USER and MW_PASSWORD in your .env file.")

# mwclient takes the scheme and the bare host name as separate arguments.
scheme = "http" if not MW_URL.startswith("https") else "https"
host = MW_URL.replace("https://", "").replace("http://", "").rstrip("/")

# Open the API connection (script path "/w/") and authenticate.
site = mwclient.Site(host, path="/w/", scheme=scheme)
site.login(MW_USER, MW_PASS)
print(f"Logged in to MediaWiki: {MW_URL}")

Step 1 — Load data

# Input locations, relative to the notebook's working directory.
CSV_PATH  = Path("../sprengel_exhibitions.csv")
IMAGE_DIR = Path("../images")

# Read the IDN column as text. Left to type inference, a purely numeric column
# becomes int64 (dropping any leading zeros), and a single blank cell turns it
# into float64, so str(idn) would read "123456789.0" and never match an image
# file stem.
df = pd.read_csv(CSV_PATH, dtype={"idn": str})
print(f"Records: {len(df)}")

# Map each local image's file stem to its path; the upload step looks images
# up by IDN in this mapping.
image_files = {p.stem: p for p in IMAGE_DIR.glob("*.jpg")}
print(f"Local images available: {len(image_files)}")

Step 2 — Upload images to MediaWiki

TODAY = date.today().isoformat()
uploaded = 0
skipped = 0
failed = 0


def _cell_text(value):
    """Return a CSV cell as stripped text.

    Missing values (NaN/None/NA) become an empty string instead of the
    literal "nan", and integral floats such as 1999.0 (produced when a numeric
    column contains blanks) are rendered without the trailing ".0".
    """
    if pd.isna(value):
        return ""
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    return str(value).strip()


for _, row in df.iterrows():
    idn = _cell_text(row.get("idn", ""))

    # Rows without a matching local image (including rows with no IDN at all)
    # are counted and skipped.
    if idn not in image_files:
        skipped += 1
        continue

    img_path = image_files[idn]
    file_name = f"Sprengel_Exhibition_{idn}.jpg"
    title_text = _cell_text(row.get("title", ""))[:200] or "(untitled)"
    year_text = _cell_text(row.get("year", ""))

    # Only add the "(year)" suffix when a year is actually known.
    summary = f"Exhibition catalogue cover: {title_text}"
    if year_text:
        summary += f" ({year_text})"

    description = (
        f"== Summary ==\n"
        f"{summary}\n\n"
        f"* '''Source''': DNB catalogue enrichment API (https://portal.dnb.de)\n"
        f"* '''DNB IDN''': {idn}\n"
        f"* '''Date fetched''': {TODAY}\n"
        f"* '''Rights''': Publisher-supplied cover image. Not CC0. "
        f"Uploaded for educational/non-commercial use within this project only. "
        f"Do not redistribute.\n"
    )

    # Best-effort batch: one failed upload must not abort the remaining rows,
    # so any error is reported and counted rather than re-raised.
    try:
        with open(img_path, "rb") as f:
            site.upload(
                file=f,
                filename=file_name,
                description=description,
                ignore=True,  # upload despite API warnings (e.g. file already exists)
                comment=f"Bot: upload cover image for DNB IDN {idn}",
            )
    except Exception as e:
        print(f"  ERROR for {idn}: {e}")
        failed += 1
    else:
        print(f"  Uploaded: {file_name}")
        uploaded += 1

    # Pause between upload attempts to go easy on the wiki.
    time.sleep(0.5)

print(f"\nUploaded: {uploaded} | Skipped (no image): {skipped} | Failed: {failed}")