Immich × Databricks
Ingest your Immich photo library via the REST API into a Bronze / Silver / Gold medallion lakehouse — with incremental updates, SCD deletion tracking, scheduled jobs, dashboards, and an interactive map.
Values like this ✏ need to match your setup. Hover for guidance.
00 — Bootstrap secrets (run once, then delete)
Creates the Databricks secret scope and stores your Immich credentials. Uses the Databricks REST API directly — no CLI needed. Fill in the four highlighted values, run once, then delete the cell immediately.
import requests # ── Fill these in, run once, then DELETE this cell ──────────────────── DATABRICKS_PAT = "dapi..." WORKSPACE_URL = "https://adb-xxxxxxxx.azuredatabricks.net" IMMICH_BASE_URL = "https://your-immich-url.com" IMMICH_API_KEY = "your_immich_api_key_here" # ───────────────────────────────────────────────────────────────────── headers = {"Authorization": f"Bearer {DATABRICKS_PAT}", "Content-Type": "application/json"} base = f"{WORKSPACE_URL}/api/2.0/secrets" def create_scope(name): r = requests.post(f"{base}/scopes/create", headers=headers, json={"scope": name, "initial_manage_principal": "users"}) if r.status_code == 200: print(f"Scope '{name}' created.") elif "RESOURCE_ALREADY_EXISTS" in r.text: print(f"Scope '{name}' already exists.") else: raise Exception(r.text) def put_secret(scope, key, value): r = requests.post(f"{base}/put", headers=headers, json={"scope": scope, "key": key, "string_value": value}) if r.status_code == 200: print(f" secret '{key}' stored.") else: raise Exception(r.text) create_scope("immich") put_secret("immich", "api_key", IMMICH_API_KEY) put_secret("immich", "base_url", IMMICH_BASE_URL) print("\nDone. Now DELETE this cell.")
01 — Workspace setup & secrets
Run at the start of every session. Sets CATALOG, creates schemas, imports all required libraries, and verifies Immich connectivity.
import requests, json
from datetime import datetime, timezone, timedelta
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
col, from_json, to_timestamp, year, month,
dayofmonth, lower, current_timestamp, lit, when, count
)
from pyspark.sql.types import (
StructType, StructField, StringType, LongType,
BooleanType, DoubleType, IntegerType, TimestampType
)
spark = SparkSession.builder.getOrCreate()
CATALOG = spark.sql("SELECT current_catalog()").collect()[0][0]
print(f"Catalog : {CATALOG}")
for schema in ["immich_bronze", "immich_silver", "immich_gold"]:
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{schema}")
print(f" schema ready: {CATALOG}.{schema}")
API_KEY = dbutils.secrets.get(scope="immich", key="api_key")
BASE_URL = dbutils.secrets.get(scope="immich", key="base_url").rstrip("/")
HEADERS = {"x-api-key": API_KEY, "Accept": "application/json"}
resp = requests.get(f"{BASE_URL}/api/server/statistics", headers=HEADERS)
resp.raise_for_status()
stats = resp.json()
print(f"\nImmich connection OK")
print(f" Photos : {stats.get('photos','?')}")
print(f" Videos : {stats.get('videos','?')}")
02 — Reset Drop and recreate all layers
Only run when you want a completely clean slate. Drops every table including deletion history. Skip on normal runs.
for schema in ["immich_bronze", "immich_silver", "immich_gold"]:
full = f"{CATALOG}.{schema}"
tables = spark.sql(f"SHOW TABLES IN {full}").collect()
if tables:
for row in tables:
spark.sql(f"DROP TABLE IF EXISTS {full}.{row['tableName']}")
print(f" dropped: {full}.{row['tableName']}")
else:
print(f" {full}: already empty")
print("\nAll layers cleared.")
03 — Explore the raw API response
Fetch one asset to inspect the exact structure your Immich version returns. withExif: True is required — without it the search endpoint omits exifInfo.
resp = requests.post(f"{BASE_URL}/api/search/metadata", headers=HEADERS,
json={"page": 1, "size": 1, "withExif": True})
resp.raise_for_status()
data = resp.json()
assets_block = data.get("assets", {})
envelope = {k: v for k, v in assets_block.items() if k != "items"}
print("=== Pagination envelope ===")
print(json.dumps(envelope, indent=2))
print("\n=== Full structure of one asset ===")
print(json.dumps(assets_block["items"][0], indent=2))
04 — Analyse field coverage
Sample 100 assets to see which fields are populated in your library. Useful before building gold queries that depend on sparse fields like city or rating.
resp = requests.post(f"{BASE_URL}/api/search/metadata", headers=HEADERS,
json={"page": 1, "size": 100, "withExif": True})
sample = resp.json().get("assets", {}).get("items", [])
n = len(sample)
top_counts, exif_counts = {}, {}
for asset in sample:
for k, v in asset.items():
if v is not None and v != "" and v != [] and v != "0:00:00.00000":
top_counts[k] = top_counts.get(k, 0) + 1
for k, v in (asset.get("exifInfo") or {}).items():
if v is not None and v != "":
exif_counts[k] = exif_counts.get(k, 0) + 1
print(f"Sample: {n} assets\n")
print("=== Top-level fields ===")
for k, v in sorted(top_counts.items(), key=lambda x: -x[1]):
bar = "█" * int(v / n * 30)
print(f" {k:<28} {bar:<30} {v}/{n}")
print("\n=== exifInfo fields ===")
for k, v in sorted(exif_counts.items(), key=lambda x: -x[1]):
bar = "█" * int(v / n * 30)
print(f" {k:<28} {bar:<30} {v}/{n}")
05 — Bronze Full refresh ingestion
Fetches every asset from Immich and merges into bronze using Delta's MERGE. New assets are inserted, changed assets updated in place (Type 1), and assets no longer in Immich are flagged as deleted with a timestamp (Type 2 SCD). Run this weekly.
What bronze stores
- Raw JSON per asset
- Ingestion timestamp
- is_deleted flag
- deleted_at timestamp
SCD approach
- Deletions: Type 2 (flag)
- Metadata changes: Type 1
- Re-imports: new UUID = new row
Tables created
- immich_bronze.assets_raw
- immich_bronze.albums_raw
def fetch_all_assets(base_url, headers, page_size=500):
"""
Fetch all assets with EXIF inline.
withExif=True is required — without it exifInfo is omitted.
Stops when a page returns fewer items than page_size.
"""
all_assets, page = [], 1
while True:
resp = requests.post(f"{base_url}/api/search/metadata", headers=headers,
json={"page": page, "size": page_size, "withExif": True})
resp.raise_for_status()
items = resp.json().get("assets", {}).get("items", [])
all_assets.extend(items)
print(f" Page {page}: +{len(items)} (total: {len(all_assets)})")
if len(items) < page_size:
break
page += 1
print(f"\nFetch complete. Total: {len(all_assets)}")
return all_assets
print("Fetching all assets from Immich...")
raw_assets = fetch_all_assets(BASE_URL, HEADERS)
raw_schema = StructType([StructField("raw_json", StringType(), True)])
id_schema = StructType([StructField("id", StringType())])
BRONZE_ASSETS = f"{CATALOG}.immich_bronze.assets_raw"
incoming_ids = {a["id"] for a in raw_assets}
df_incoming = (
spark.createDataFrame([(json.dumps(a),) for a in raw_assets], raw_schema)
.withColumn("_p", from_json(col("raw_json"), id_schema))
.withColumn("asset_id", col("_p.id"))
.withColumn("ingested_at", current_timestamp())
.withColumn("source", lit("immich_api"))
.withColumn("is_deleted", lit(False))
.withColumn("deleted_at", lit(None).cast(TimestampType()))
.drop("_p")
)
table_exists = spark.catalog.tableExists(BRONZE_ASSETS)
if not table_exists:
print("First run — creating bronze table...")
(df_incoming.write.format("delta").mode("overwrite").saveAsTable(BRONZE_ASSETS))
print(f" Wrote {df_incoming.count()} rows")
else:
df_incoming.createOrReplaceTempView("incoming_assets")
# Upsert: insert new, update changed
spark.sql(f"""
MERGE INTO {BRONZE_ASSETS} AS target
USING incoming_assets AS source
ON target.asset_id = source.asset_id
WHEN MATCHED THEN UPDATE SET
target.raw_json = source.raw_json,
target.ingested_at = source.ingested_at,
target.is_deleted = false,
target.deleted_at = NULL
WHEN NOT MATCHED THEN INSERT *
""")
# Flag assets no longer in Immich as deleted
ids_list = ", ".join([f"'{i}'" for i in incoming_ids])
spark.sql(f"""
UPDATE {BRONZE_ASSETS}
SET is_deleted = true, deleted_at = current_timestamp()
WHERE is_deleted = false
AND asset_id NOT IN ({ids_list})
""")
df_b = spark.table(BRONZE_ASSETS)
total = df_b.count()
active = df_b.where(col("is_deleted") == False).count()
deleted = total - active
print(f" Total rows : {total}")
print(f" Active : {active}")
print(f" Deleted : {deleted}")
BRONZE_ALBUMS = f"{CATALOG}.immich_bronze.albums_raw"
print("Fetching albums...")
resp = requests.get(f"{BASE_URL}/api/albums", headers=HEADERS)
resp.raise_for_status()
raw_albums = resp.json()
print(f" Found {len(raw_albums)} albums")
df_albums = (
spark.createDataFrame([(json.dumps(a),) for a in raw_albums], raw_schema)
.withColumn("ingested_at", current_timestamp())
.withColumn("source", lit("immich_api"))
)
(df_albums.write.format("delta").mode("overwrite").saveAsTable(BRONZE_ALBUMS))
print(f" Wrote {df_albums.count()} rows to bronze.albums_raw")
05b — Bronze Incremental refresh
Only fetches assets modified since the last ingestion. Much faster than a full refresh — use this daily or on-demand after adding new photos. Does not catch deletions; run the full refresh weekly to handle those.
BRONZE_ASSETS = f"{CATALOG}.immich_bronze.assets_raw"
last_run = spark.sql(f"""
SELECT MAX(ingested_at) AS ts FROM {BRONZE_ASSETS} WHERE is_deleted = false
""").collect()[0]["ts"]
if last_run is None:
print("No bronze data found — run the full refresh (cell 5) first.")
else:
since = (last_run - timedelta(hours=1)).strftime("%Y-%m-%dT%H:%M:%S.000Z")
print(f"Fetching assets updated since: {since}")
updated, page, page_size = [], 1, 500
while True:
resp = requests.post(f"{BASE_URL}/api/search/metadata", headers=HEADERS,
json={"page": page, "size": page_size,
"withExif": True, "updatedAfter": since})
resp.raise_for_status()
items = resp.json().get("assets", {}).get("items", [])
updated.extend(items)
print(f" Page {page}: +{len(items)} (total: {len(updated)})")
if len(items) < page_size:
break
page += 1
print(f"\n{len(updated)} new/updated assets found")
if updated:
df_new = (
spark.createDataFrame([(json.dumps(a),) for a in updated], raw_schema)
.withColumn("_p", from_json(col("raw_json"), StructType([StructField("id", StringType())])))
.withColumn("asset_id", col("_p.id"))
.withColumn("ingested_at", current_timestamp())
.withColumn("source", lit("immich_api"))
.withColumn("is_deleted", lit(False))
.withColumn("deleted_at", lit(None).cast(TimestampType()))
.drop("_p")
)
df_new.createOrReplaceTempView("incoming_assets")
spark.sql(f"""
MERGE INTO {BRONZE_ASSETS} AS target
USING incoming_assets AS source
ON target.asset_id = source.asset_id
WHEN MATCHED THEN UPDATE SET
target.raw_json = source.raw_json,
target.ingested_at = source.ingested_at,
target.is_deleted = false,
target.deleted_at = NULL
WHEN NOT MATCHED THEN INSERT *
""")
print(" Merged into bronze.")
else:
print(" Nothing to update.")
06 — Inspect the bronze layer
Verify what is in bronze including SCD columns. Deleted assets remain in the table with is_deleted = true — this is intentional.
df = spark.table(f"{CATALOG}.immich_bronze.assets_raw")
print("=== bronze.assets_raw schema ===")
df.printSchema()
total = df.count()
active = df.where(col("is_deleted") == False).count()
deleted = df.where(col("is_deleted") == True).count()
print(f"\n Total rows : {total}")
print(f" Active : {active}")
print(f" Deleted : {deleted}")
if deleted > 0:
print("\n=== Recently deleted assets ===")
(df.where(col("is_deleted") == True)
.select("asset_id", "deleted_at", "ingested_at")
.orderBy(col("deleted_at").desc())
.limit(5).show(truncate=False))
sample = df.where(col("is_deleted") == False).limit(1).collect()[0]["raw_json"]
print("\n=== Sample raw_json (first 400 chars) ===")
print(sample[:400] + "...")
07 — Silver Parse & clean
Reads from bronze (active rows only), parses raw JSON into typed columns, flattens exifInfo, and adds derived date fields. Run after every bronze refresh.
exif_schema = StructType([
StructField("make", StringType()),
StructField("model", StringType()),
StructField("lensModel", StringType()),
StructField("fNumber", DoubleType()),
StructField("focalLength", DoubleType()),
StructField("iso", IntegerType()),
StructField("exposureTime", StringType()),
StructField("latitude", DoubleType()),
StructField("longitude", DoubleType()),
StructField("city", StringType()),
StructField("state", StringType()),
StructField("country", StringType()),
StructField("description", StringType()),
StructField("timeZone", StringType()),
StructField("dateTimeOriginal", StringType()),
StructField("rating", IntegerType()),
StructField("exifImageWidth", IntegerType()),
StructField("exifImageHeight", IntegerType()),
StructField("fileSizeInByte", LongType()),
])
asset_schema = StructType([
StructField("id", StringType()),
StructField("originalFileName", StringType()),
StructField("type", StringType()),
StructField("fileCreatedAt", StringType()),
StructField("localDateTime", StringType()),
StructField("isFavorite", BooleanType()),
StructField("isArchived", BooleanType()),
StructField("isTrashed", BooleanType()),
StructField("visibility", StringType()),
StructField("duration", StringType()),
StructField("exifInfo", exif_schema),
])
print("Parsing bronze assets to silver...")
df_silver = (
spark.table(f"{CATALOG}.immich_bronze.assets_raw")
.where(col("is_deleted") == False) # exclude SCD-deleted assets
.withColumn("data", from_json(col("raw_json"), asset_schema))
.select(
col("data.id").alias("asset_id"),
col("data.originalFileName").alias("file_name"),
lower(col("data.type")).alias("media_type"),
to_timestamp(col("data.fileCreatedAt")).alias("created_at"),
to_timestamp(col("data.localDateTime")).alias("local_datetime"),
year(to_timestamp(col("data.localDateTime"))).alias("year"),
month(to_timestamp(col("data.localDateTime"))).alias("month"),
dayofmonth(to_timestamp(col("data.localDateTime"))).alias("day"),
col("data.isFavorite").alias("is_favourite"),
col("data.isArchived").alias("is_archived"),
col("data.isTrashed").alias("is_trashed"),
col("data.visibility").alias("visibility"),
col("data.duration").alias("video_duration"),
col("data.exifInfo.make").alias("camera_make"),
col("data.exifInfo.model").alias("camera_model"),
col("data.exifInfo.lensModel").alias("lens_model"),
col("data.exifInfo.fNumber").alias("f_number"),
col("data.exifInfo.iso").alias("iso"),
col("data.exifInfo.latitude").alias("latitude"),
col("data.exifInfo.longitude").alias("longitude"),
col("data.exifInfo.city").alias("city"),
col("data.exifInfo.country").alias("country"),
col("data.exifInfo.fileSizeInByte").alias("file_size_bytes"),
col("data.exifInfo.exifImageWidth").alias("width_px"),
col("data.exifInfo.exifImageHeight").alias("height_px"),
col("data.exifInfo.timeZone").alias("timezone"),
col("data.exifInfo.rating").alias("rating"),
col("ingested_at")
)
.dropDuplicates(["asset_id"])
.where(col("asset_id").isNotNull())
)
(df_silver.write.format("delta").mode("overwrite").saveAsTable(f"{CATALOG}.immich_silver.assets"))
total = df_silver.count()
images = df_silver.where(col("media_type") == "image").count()
videos = df_silver.where(col("media_type") == "video").count()
with_gps = df_silver.where(col("latitude").isNotNull()).count()
rated = df_silver.where(col("rating").isNotNull()).count()
print(f" Total : {total}")
print(f" Images : {images}")
print(f" Videos : {videos}")
print(f" GPS : {with_gps} ({round(with_gps/total*100)}%)")
print(f" Rated : {rated} ({round(rated/total*100)}%)")
print(f"\nWritten to: {CATALOG}.immich_silver.assets")
album_schema = StructType([
StructField("id", StringType()),
StructField("albumName", StringType()),
StructField("description", StringType()),
StructField("assetCount", IntegerType()),
StructField("startDate", StringType()),
StructField("endDate", StringType()),
StructField("shared", BooleanType()),
StructField("hasSharedLink", BooleanType()),
])
df_albums = (
spark.table(f"{CATALOG}.immich_bronze.albums_raw")
.withColumn("data", from_json(col("raw_json"), album_schema))
.select(
col("data.id").alias("album_id"),
col("data.albumName").alias("album_name"),
col("data.description").alias("description"),
col("data.assetCount").alias("asset_count"),
to_timestamp(col("data.startDate")).alias("start_date"),
to_timestamp(col("data.endDate")).alias("end_date"),
col("data.shared").alias("is_shared"),
col("data.hasSharedLink").alias("has_shared_link"),
col("ingested_at")
)
.dropDuplicates(["album_id"])
)
(df_albums.write.format("delta").mode("overwrite").saveAsTable(f"{CATALOG}.immich_silver.albums"))
print(f" Albums : {df_albums.count()}")
print(f" Shared : {df_albums.where(col('is_shared') == True).count()}")
08 — Inspect the silver layer
df = spark.table(f"{CATALOG}.immich_silver.assets")
print("=== silver.assets schema ===")
df.printSchema()
print("=== 5 sample rows ===")
(df.select("asset_id", "file_name", "media_type", "year", "month",
"camera_make", "camera_model", "city", "country",
"file_size_bytes", "rating")
.limit(5).show(truncate=35))
print("=== Null counts in key columns ===")
cols_to_check = ["camera_make","camera_model","city","country",
"latitude","longitude","year","file_size_bytes","rating","timezone"]
null_counts = df.select([
count(when(col(c).isNull(), c)).alias(c) for c in cols_to_check
]).collect()[0].asDict()
total = df.count()
print(f" (out of {total} rows)")
for c, n in null_counts.items():
pct = round(n / total * 100)
print(f" {c:<20} {n:>6} nulls ({pct}%)")
09 — Gold Analytics tables
Purpose-built aggregations ready for dashboards. Note: lower rating = better (1 = first place, 5 = last) — queries sort accordingly.
${CATALOG} automatically. For Python cells replace with {CATALOG} in an f-string.CREATE OR REPLACE TABLE ${CATALOG}.immich_gold.photos_by_month AS SELECT year, month, media_type, COUNT(*) AS asset_count, COUNT(*) FILTER (WHERE is_favourite = true) AS favourite_count, ROUND(SUM(file_size_bytes) / 1e9, 2) AS total_size_gb, COUNT(DISTINCT camera_make) AS distinct_cameras FROM ${CATALOG}.immich_silver.assets WHERE is_trashed = false GROUP BY year, month, media_type ORDER BY year, month;
CREATE OR REPLACE TABLE ${CATALOG}.immich_gold.geography AS SELECT COALESCE(country, 'No GPS data') AS country, city, COUNT(*) AS photo_count, COUNT(DISTINCT camera_model) AS cameras_used, MIN(local_datetime) AS earliest_photo, MAX(local_datetime) AS latest_photo, AVG(latitude) AS avg_lat, AVG(longitude) AS avg_lon FROM ${CATALOG}.immich_silver.assets WHERE is_trashed = false GROUP BY country, city ORDER BY photo_count DESC;
CREATE OR REPLACE TABLE ${CATALOG}.immich_gold.camera_summary AS SELECT COALESCE(camera_make, 'Unknown') AS make, COALESCE(camera_model, 'Unknown') AS model, COUNT(*) AS photo_count, ROUND(AVG(f_number), 1) AS avg_f_number, ROUND(AVG(iso), 0) AS avg_iso, MIN(year) AS first_used_year, MAX(year) AS last_used_year FROM ${CATALOG}.immich_silver.assets WHERE media_type = 'image' AND is_trashed = false GROUP BY camera_make, camera_model ORDER BY photo_count DESC;
CREATE OR REPLACE TABLE ${CATALOG}.immich_gold.ratings_summary AS SELECT rating, COUNT(*) AS photo_count, COUNT(DISTINCT country) AS countries, COUNT(DISTINCT camera_model) AS cameras, MIN(year) AS earliest_year, MAX(year) AS latest_year FROM ${CATALOG}.immich_silver.assets WHERE is_trashed = false AND rating IS NOT NULL GROUP BY rating ORDER BY rating ASC; -- 1 = best / first place
CREATE OR REPLACE TABLE ${CATALOG}.immich_gold.album_stats AS SELECT album_name, asset_count, is_shared, has_shared_link, start_date, end_date, DATEDIFF(end_date, start_date) AS date_span_days FROM ${CATALOG}.immich_silver.albums ORDER BY asset_count DESC;
10 — Explore your data
SELECT year, SUM(asset_count) AS total FROM ${CATALOG}.immich_gold.photos_by_month WHERE media_type = 'image' GROUP BY year ORDER BY year;
SELECT country, SUM(photo_count) AS photos FROM ${CATALOG}.immich_gold.geography WHERE country != 'No GPS data' GROUP BY country ORDER BY photos DESC LIMIT 20;
SELECT city, country, SUM(photo_count) AS photos FROM ${CATALOG}.immich_gold.geography WHERE city IS NOT NULL AND country != 'No GPS data' GROUP BY city, country ORDER BY photos DESC LIMIT 30;
SELECT country, city, COUNT(*) FILTER (WHERE rating = 1) AS first_place, COUNT(*) FILTER (WHERE rating = 2) AS second_place, COUNT(*) FILTER (WHERE rating = 3) AS third_place, COUNT(*) FILTER (WHERE rating IS NOT NULL) AS total_rated FROM ${CATALOG}.immich_silver.assets WHERE is_trashed = false AND country IS NOT NULL GROUP BY country, city ORDER BY first_place DESC LIMIT 20;
import matplotlib.pyplot as plt
df = spark.sql(f"""
SELECT year, SUM(asset_count) AS photos
FROM {CATALOG}.immich_gold.photos_by_month
WHERE media_type = 'image' AND year IS NOT NULL
GROUP BY year ORDER BY year
""").toPandas()
fig, ax = plt.subplots(figsize=(12, 4))
ax.bar(df["year"], df["photos"], color="#1D9E75", width=0.7)
ax.set_xlabel("Year"); ax.set_ylabel("Photos"); ax.set_title("Photos per year")
for _, row in df.iterrows():
ax.text(row["year"], row["photos"] + 20, str(row["photos"]), ha="center", fontsize=9)
plt.tight_layout(); display(fig)
df = spark.sql(f"""
SELECT country, SUM(photo_count) AS photos
FROM {CATALOG}.immich_gold.geography
WHERE country != 'No GPS data'
GROUP BY country ORDER BY photos DESC LIMIT 15
""").toPandas()
fig, ax = plt.subplots(figsize=(10, 5))
ax.barh(df["country"][::-1], df["photos"][::-1], color="#378ADD")
ax.set_xlabel("Photos"); ax.set_title("Top 15 countries")
plt.tight_layout(); display(fig)
11 — Scheduled Automated jobs
Databricks Jobs run notebooks on a schedule with no manual intervention. Two jobs are recommended: a daily incremental and a weekly full refresh.
Daily — incremental
- Cells 1, 5d, 7a, 7b, 9a–9e
- Only new/changed assets
- Fast — typically under 2 min
- Does not catch deletions
Weekly — full refresh
- Cells 1, 5a, 5b, 5c, 7a, 7b, 9a–9e
- Full sync, catches deletions
- 10–20 min for large libraries
- Suggested: Sunday 3am
Ad-hoc
- Run cell 5d then 7a–9e
- After a big import session
- Trigger from the Jobs page
- Or just run manually
Creating a job
In Databricks, click Workflows in the left sidebar → Create job.
Give it a name, e.g. immich_daily_incremental or immich_weekly_full.
Under Task, set type to Notebook and select your notebook.
Under Cluster, select Serverless (only option in Free Edition).
Click Add trigger → Scheduled. Paste the cron expression for the desired schedule.
Click Create. You can also trigger manually from the job page at any time.
immich_incremental.py starting with cell 1 then cell 5d → 7a → 7b → 9a–9e, and immich_full_refresh.py with cell 1 then 5a → 5b → 5c → 7a → 7b → 9a–9e.0 3 * * * # every day at 3am 0 3 * * 0 # every sunday at 3am ← recommended for full refresh 0 */6 * * * # every 6 hours 0 3 1 * * # first of every month at 3am 0 3 * * 1-5 # weekdays only at 3am
Cron format: minute hour day-of-month month day-of-week. Adjust the hour (3) to your local timezone offset from UTC.
12 — AI/BI Databricks dashboards
Databricks AI/BI Dashboards (Lakeview) connect directly to your gold tables. They're the only dashboard product available in Free Edition and are genuinely good for this use case.
Creating a new dashboard
In the left sidebar click Dashboards → Create dashboard.
Give it a name, e.g. Immich Photo Library.
Click Add a visualization to add your first panel.
Each panel has a SQL editor at the bottom. Paste a query, click Run, then pick a chart type from the right panel.
Drag panels to arrange them. Click Publish when done to get a shareable read-only URL.
workspace with your actual catalog name. You cannot use ${CATALOG} widget syntax in dashboard panels — hardcode the catalog name directly.Panel 1 — photos per year
SELECT year, SUM(asset_count) AS photos FROM workspace.immich_gold.photos_by_month WHERE media_type = 'image' AND year IS NOT NULL GROUP BY year ORDER BY year;
Chart: Bar · X: year · Y: photos
Panel 2 — photos per month heatmap
SELECT year, month, SUM(asset_count) AS photos FROM workspace.immich_gold.photos_by_month WHERE media_type = 'image' AND year IS NOT NULL GROUP BY year, month ORDER BY year, month;
Chart: Heatmap · X: month · Y: year · Color: photos
Panel 3 — top 20 countries
SELECT country, SUM(photo_count) AS photos FROM workspace.immich_gold.geography WHERE country != 'No GPS data' GROUP BY country ORDER BY photos DESC LIMIT 20;
Chart: Bar (horizontal) · X: photos · Y: country
Panel 4 — top 30 cities
SELECT city || ', ' || country AS location, SUM(photo_count) AS photos FROM workspace.immich_gold.geography WHERE city IS NOT NULL AND country != 'No GPS data' GROUP BY city, country ORDER BY photos DESC LIMIT 30;
Chart: Bar (horizontal) · X: photos · Y: location
Panel 5 — camera gear
SELECT make || ' ' || model AS camera, photo_count, first_used_year, last_used_year FROM workspace.immich_gold.camera_summary WHERE make != 'Unknown' ORDER BY photo_count DESC LIMIT 15;
Chart: Bar (horizontal) · X: photo_count · Y: camera
Panel 6 — rating distribution (1=best)
SELECT CASE rating WHEN 1 THEN '1 — First place' WHEN 2 THEN '2 — Second place' WHEN 3 THEN '3 — Third place' WHEN 4 THEN '4 — Fourth place' WHEN 5 THEN '5 — Fifth place' END AS rating_label, photo_count FROM workspace.immich_gold.ratings_summary ORDER BY rating ASC;
Chart: Bar · X: rating_label · Y: photo_count
Counter widgets — add one panel per query
SELECT SUM(asset_count) AS total_photos FROM workspace.immich_gold.photos_by_month WHERE media_type = 'image';
SELECT COUNT(DISTINCT country) AS countries_visited FROM workspace.immich_gold.geography WHERE country != 'No GPS data';
SELECT ROUND(SUM(total_size_gb), 1) AS total_gb FROM workspace.immich_gold.photos_by_month;
SELECT COUNT(*) AS first_place_photos FROM workspace.immich_silver.assets WHERE rating = 1 AND is_trashed = false;
13 — Bonus Interactive Leaflet map
Standalone bonus section. Exports your geography gold table to a self-contained HTML map with Leaflet — dark-themed, circle markers sized by photo count, clickable popups. Serve it from your VPS alongside this tutorial.
immich_gold.geography.import json
geo_df = spark.sql(f"""
SELECT country, city,
ROUND(avg_lat, 4) AS lat,
ROUND(avg_lon, 4) AS lon,
photo_count, earliest_photo, latest_photo
FROM {CATALOG}.immich_gold.geography
WHERE avg_lat IS NOT NULL AND avg_lon IS NOT NULL
AND country != 'No GPS data'
ORDER BY photo_count DESC
""").toPandas()
locations = json.dumps(geo_df.to_dict(orient="records"))
total_photos = int(geo_df["photo_count"].sum())
total_locations = len(geo_df)
print(f"Locations with GPS : {total_locations}")
print(f"Photos mapped : {total_photos}")
print(geo_df[["city","country","photo_count"]].head())
html = f"""Photo Map """ dbutils.fs.put("/FileStore/photo_map.html", html, overwrite=True) print("Saved to /FileStore/photo_map.html") print("Download: Catalog → Files → FileStore in the Databricks UI")Photo Map
{total_photos:,} photos · {total_locations} locations
photo_map.html from Databricks (Catalog → Files → FileStore), then copy to your VPS: scp ~/Downloads/photo_map.html root@tadhock.ovh:/mnt/blockstorage/tutorial/html/map.html — it'll be live at https://tutorial.tadhock.ovh/map.html immediately.