// This file was started by AI and maintained by hand since.
import "@paperclover/console/inject";
import { Progress } from "@paperclover/console/Progress";
import { Spinner } from "@paperclover/console/Spinner";
import assert from "node:assert";
import { execFile } from "node:child_process";
import { existsSync, Stats } from "node:fs";
import * as fsp from "node:fs/promises";
import * as path from "node:path";
import { promisify } from "node:util";
import { BlobAsset, cache, FilePermissions, MediaFile } from "../db.ts";
import { formatDate, formatSize } from "./share.ts";
import { highlightCode, type Language } from "./highlight.ts";

const execFileAsync = promisify(execFile);

// Configuration
const FILE_ROOT = process.env.SCAN_FILE_ROOT;
if (!FILE_ROOT) {
  throw new Error(
    "SCAN_FILE_ROOT environment variable not set (e.g. '/path/to/files')",
  );
}
const LOCAL_DIR = path.resolve(FILE_ROOT);
const DRY_RUN = process.argv.includes("--dry-run");
const SHOULD_COMPRESS = true;
const VERBOSE = process.argv.includes("--verbose");
const SHOULD_SCRUB = true;
const COMPRESS_STORE = process.env.COMPRESS_STORE ||
  path.join(process.cwd(), ".clover/compressed");

// Helper function for logging that respects the verbose flag
function log(message: string, always = false): void {
  if (always || VERBOSE) {
    console.log(message);
  }
}

// File extensions that need duration metadata
const MEDIA_EXTENSIONS = new Set([
  ".mp4",
  ".mkv",
  ".webm",
  ".avi",
  ".mov",
  ".mp3",
  ".flac",
  ".wav",
  ".ogg",
  ".m4a",
]);

// File extensions that need dimension metadata
const IMAGE_EXTENSIONS = new Set([
  ".jpg",
  ".jpeg",
  ".png",
  ".gif",
  ".webp",
  ".avif",
  ".heic",
  ".svg",
]);

const VIDEO_EXTENSIONS = new Set([".mp4", ".mkv", ".webm", ".avi", ".mov"]);

// File extensions that need metadata scrubbing
const SCRUB_EXTENSIONS = new Set([
  ".jpg",
  ".jpeg",
  ".png",
  ".mov",
  ".mp4",
  ".m4a",
]);

const CODE_EXTENSIONS: Record<string, Language> = {
  ".json": "json",
  ".toml": "toml",
  ".ts": "ts",
  ".js": "ts",
  ".tsx": "tsx",
  ".jsx": "tsx",
  ".css": "css",
  ".py": "python",
  ".lua": "lua",
  ".sh": "shell",
  ".bat": "dosbatch",
  ".ps1": "powershell",
  ".cmd": "dosbatch",
  ".yaml": "yaml",
  ".yml": "yaml",
  ".zig": "zig",
  ".astro": "astro",
  ".mdx": "mdx",
  ".xml": "xml",
  ".jsonc": "json",
  ".php": "php",
  ".patch": "diff",
  ".diff": "diff",
};

const READ_CONTENTS_EXTENSIONS = new Set([".txt", ".chat"]);

// For file types whose indexing logic has changed, update the date here;
// rescanning will reconstruct the entire file object. This way you can
// incrementally update new file types without having to reindex everything.
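// (Note: the date strings below have no timezone, so `new Date()` parses them
// in the server's local time. When an extension's indexing logic changes,
// bump its entry to the current time and the next scan rebuilds those rows.)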
const lastUpdateTypes: Record<string, Date> = {};
lastUpdateTypes[".lnk"] = new Date("2025-05-13 13:58:00");
for (const ext in CODE_EXTENSIONS) {
  lastUpdateTypes[ext] = new Date("2025-05-13 13:58:00");
}
for (const ext of READ_CONTENTS_EXTENSIONS) {
  lastUpdateTypes[ext] = new Date("2025-05-13 13:58:00");
}
lastUpdateTypes[".diff"] = new Date("2025-05-18 13:58:00");
lastUpdateTypes[".patch"] = new Date("2025-05-18 13:58:00");

// Helper functions for metadata extraction
async function calculateHash(filePath: string): Promise<string> {
  try {
    const hash = await execFileAsync("sha1sum", [filePath]);
    return hash.stdout.split(" ")[0];
  } catch (error) {
    console.error(`Error calculating hash for ${filePath}:`, error);
    throw error;
  }
}

async function calculateDuration(filePath: string): Promise<number> {
  try {
    const ext = path.extname(filePath).toLowerCase();
    if (!MEDIA_EXTENSIONS.has(ext)) return 0;
    const { stdout } = await execFileAsync("ffprobe", [
      "-v",
      "error",
      "-show_entries",
      "format=duration",
      "-of",
      "default=noprint_wrappers=1:nokey=1",
      filePath,
    ]);
    return Math.ceil(parseFloat(stdout.trim()));
  } catch (error) {
    console.error(`Error calculating duration for ${filePath}:`, error);
    return 0; // Return 0 for duration on error
  }
}

async function calculateDimensions(filePath: string): Promise<string> {
  const ext = path.extname(filePath).toLowerCase();
  if (!IMAGE_EXTENSIONS.has(ext) && !VIDEO_EXTENSIONS.has(ext)) return "";
  try {
    if (ext === ".svg") {
      // For SVG files, parse the file and extract width/height
      const content = await fsp.readFile(filePath, "utf8");
      const widthMatch = content.match(/width="(\d+)"/);
      const heightMatch = content.match(/height="(\d+)"/);
      if (widthMatch && heightMatch) {
        return `${widthMatch[1]}x${heightMatch[1]}`;
      }
    } else if (IMAGE_EXTENSIONS.has(ext) || VIDEO_EXTENSIONS.has(ext)) {
      // Use ffprobe for images and videos
      const { stdout } = await execFileAsync("ffprobe", [
        "-v",
        "error",
        "-select_streams",
        "v:0",
        "-show_entries",
        "stream=width,height",
        "-of",
        "csv=s=x:p=0",
        filePath,
      ]);
      return stdout.trim();
    }
  } catch (error) {
    console.error(`Error calculating dimensions for ${filePath}:`, error);
  }
  return "";
}

// Helper function to check and remove location metadata
async function scrubLocationMetadata(
  filePath: string,
  stats: Stats,
): Promise<boolean> {
  try {
    const ext = path.extname(filePath).toLowerCase();
    if (!SCRUB_EXTENSIONS.has(ext)) return false;

    let hasLocation = false;
    let args: string[] = [];

    // Check for location metadata based on file type
    const tempOutput = path.join(
      path.dirname(filePath),
      `.tmp.${path.basename(filePath)}`,
    );
    switch (ext) {
      case ".jpg":
      case ".jpeg":
      case ".png":
        // Check for GPS tags in EXIF
        const { stdout: gpsCheck } = await execFileAsync("exiftool", [
          "-gps:all",
          filePath,
        ]);
        hasLocation = gpsCheck.trim().length > 0;
        args = ["-gps:all=", filePath, "-o", tempOutput];
        break;
      case ".mov":
      case ".mp4":
        // Check for GPS metadata in video files
        const { stdout: videoCheck } = await execFileAsync("exiftool", [
          "-ee",
          "-G3",
          "-s",
          filePath,
        ]);
        hasLocation = videoCheck.includes("GPS") ||
          videoCheck.includes("Location");
        args = ["-gps:all=", "-xmp:all=", filePath, "-o", tempOutput];
        break;
      case ".m4a":
        // Check for location and other metadata in m4a files
        const { stdout: m4aCheck } = await execFileAsync("exiftool", [
          "-ee",
          "-G3",
          "-s",
          filePath,
        ]);
        hasLocation = m4aCheck.includes("GPS") ||
          m4aCheck.includes("Location") ||
          m4aCheck.includes("Filename") ||
          m4aCheck.includes("Title");
        if (hasLocation) {
          args = [
            "-gps:all=",
            "-location:all=",
            "-filename:all=",
            "-title=",
            "-m4a:all=",
            filePath,
            "-o",
            tempOutput,
          ];
        }
        break;
    }

    const accessTime = stats.atime;
    const modTime = stats.mtime;
    let backup: string | null = null;
    try {
      if (hasLocation) {
        if (DRY_RUN) return true;
        // Prepare a backup
        const tmp = path.join(
          path.dirname(filePath),
          `.tmp.backup.${path.basename(filePath)}`,
        );
        await fsp.copyFile(filePath, tmp);
        await fsp.utimes(tmp, accessTime, modTime);
        backup = tmp;
        // Remove metadata
        await execFileAsync("exiftool", args);
        if (!existsSync(tempOutput)) {
          throw new Error(`Failed to create output file: ${tempOutput}`);
        }
        // Restore original timestamps
        await fsp.rename(tempOutput, filePath);
        await fsp.utimes(filePath, accessTime, modTime);
        // Backup is no longer needed
        await fsp.unlink(backup);
        log(
          `Scrubbed location metadata in ${path.relative(LOCAL_DIR, filePath)}`,
          true,
        );
        return true;
      }
    } catch (error) {
      if (backup) {
        await fsp.rename(backup, filePath);
      }
      if (existsSync(tempOutput)) {
        await fsp.unlink(tempOutput);
      }
      throw error;
    }
  } catch (error) {
    console.error(`Error scrubbing metadata for ${filePath}:`, error);
  }
  return false;
}

// Queue implementation for parallel processing
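// add() and addBatch() only kick off worker chains and may resolve while
// items are still queued once `maxConcurrent` is saturated, so callers always
// finish with waitForCompletion(), which polls until the queue drains.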
type AsyncQueueProcessor<T> = (s: Spinner, item: T) => Promise<void>;

class AsyncQueue<T> {
  private queue: T[] = [];
  private running = 0;
  private maxConcurrent: number;
  private processed = 0;
  private progress?: Progress<{ active: Spinner[] }>;
  private name: string;
  private estimate?: number;

  constructor(name: string, maxConcurrent: number) {
    this.maxConcurrent = maxConcurrent;
    this.name = name;
  }

  setEstimate(estimate: number) {
    this.estimate = estimate;
    if (this.progress) {
      this.progress.total = Math.max(
        this.processed + this.queue.length,
        estimate,
      );
    }
  }

  getProgress() {
    if (!this.progress) {
      this.progress = new Progress({
        spinner: null,
        text: ({ active }) => {
          const now = performance.now();
          let text = `[${this.processed}/${
            this.processed + this.queue.length
          }] ${this.name}`;
          let n = 0;
          for (const item of active) {
            let itemText = "- " + item.format(now);
            text += `\n` +
              itemText.slice(0, Math.max(0, process.stdout.columns - 1));
            if (n > 10) {
              text += `\n ... + ${active.length - n} more`;
              break;
            }
            n++;
          }
          return text;
        },
        props: {
          active: [] as Spinner[],
        },
      });
      this.progress.total = this.estimate ?? 0;
      this.progress.value = 0;
      this.progress.fps = 30;
    }
    return this.progress;
  }

  async add(item: T, processor: AsyncQueueProcessor<T>): Promise<void> {
    this.queue.push(item);
    this.getProgress().total = Math.max(
      this.processed + this.queue.length,
      this.estimate ?? 0,
    );
    return this.processNext(processor);
  }

  async addBatch(
    items: T[],
    processor: AsyncQueueProcessor<T>,
  ): Promise<void> {
    this.queue.push(...items);
    this.getProgress().total = Math.max(
      this.processed + this.queue.length,
      this.estimate ?? 0,
    );
    return this.processNext(processor);
  }

  // processNext() doubles as the worker loop: each call claims at most one
  // item, and the `finally` block chains into the next one, so each available
  // slot keeps a chain of work running until the queue is empty.
  private async processNext(processor: AsyncQueueProcessor<T>): Promise<void> {
    if (this.running >= this.maxConcurrent || this.queue.length === 0) {
      return;
    }
    const item = this.queue.shift();
    if (!item) return;
    this.running++;
    try {
      const progress = this.getProgress();
      let itemText = "";
      if (typeof item === "string") {
        itemText = item;
      } else if (typeof item === "object" && item !== null && "path" in item) {
        itemText = "" + item.path;
      } else {
        itemText = JSON.stringify(item);
      }
      if (itemText.startsWith(LOCAL_DIR)) {
        itemText = path.relative(LOCAL_DIR, itemText);
      }
      const spinner = new Spinner(itemText);
      spinner.stop();
      progress.props.active.unshift(spinner);
      await processor(spinner, item);
      progress.props = {
        active: progress.props.active.filter((s) => s !== spinner),
      };
      this.processed++;
      progress.value = this.processed;
    } catch (error) {
      console.error(`Error processing ${this.name} queue item:`, error);
      this.processed++;
      this.getProgress().value = this.processed;
    } finally {
      this.running--;
      await this.processNext(processor);
    }
  }

  async waitForCompletion(): Promise<void> {
    if (this.queue.length === 0 && this.running === 0) {
      if (this.processed > 0) {
        this.#success();
      }
      return;
    }
    return new Promise((resolve) => {
      const checkInterval = setInterval(() => {
        if (this.queue.length === 0 && this.running === 0) {
          clearInterval(checkInterval);
          this.#success();
          resolve();
        }
      }, 100);
    });
  }

  #success() {
    this.getProgress().success(`${this.processed} ${this.name}`);
  }
}
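
// Entries matching skipBasename() are dropped during the directory walk and
// re-checked in scanFile(); .dirsort and .friends are still read by
// processDirectoryMetadata(), they are just never indexed as files.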
function skipBasename(basename: string): boolean {
  // dot files must be incrementally tracked
  if (basename === ".dirsort") return true;
  if (basename === ".friends") return true;
  return (
    basename.startsWith(".") ||
    basename.startsWith("._") ||
    basename.startsWith(".tmp") ||
    basename === ".DS_Store" ||
    basename.toLowerCase() === "thumbs.db" ||
    basename.toLowerCase() === "desktop.ini"
  );
}

// File system scanner
class FileSystemScanner {
  private visitedPaths = new Set<string>();
  private previousPaths = new Set<string>();
  private dirQueue = new AsyncQueue<string>("Scan Directories", 10);
  private fileQueue = new AsyncQueue<{ path: string; stat: any }>(
    "File metadata",
    20,
  );
  private compressQueue: AsyncQueue<{ file: MediaFile; path: string }> | null =
    SHOULD_COMPRESS
      ? new AsyncQueue<{ file: MediaFile; path: string }>("Compress Assets", 10)
      : null;
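
  // Database paths are absolute POSIX-style paths ("/dir/file.txt") rooted at
  // LOCAL_DIR, so the index does not depend on where the volume is mounted.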
  private getDbPath(localPath: string): string {
    // Convert local file system path to database path
    const relativePath = path.relative(LOCAL_DIR, localPath);
    return "/" + relativePath.split(path.sep).join(path.posix.sep);
  }

  private getLocalPath(dbPath: string): string {
    // Convert database path to local file system path
    return path.join(LOCAL_DIR, dbPath.slice(1));
  }

  async scanFile(s: Spinner, filePath: string, stat: any): Promise<void> {
    const dbPath = this.getDbPath(filePath);

    // Skip hidden files
    const basename = path.basename(filePath);
    if (skipBasename(basename)) {
      return;
    }
    this.visitedPaths.add(dbPath);

    // Get existing file info from db
    const existingFile = MediaFile.getByPath(dbPath);

    // Determine which date to use (for date protection)
    let dateToUse = stat.mtime;
    const year2025Start = new Date("2025-01-01T00:00:00Z");
    if (
      existingFile &&
      existingFile.date < year2025Start &&
      stat.mtime >= year2025Start
    ) {
      console.error(
        `Error: ${dbPath} is ${formatDate(existingFile.date)}, got modified to ${
          formatDate(stat.mtime)
        }`,
      );
      dateToUse = existingFile.date;
    }

    // Check if we need to reprocess the file
    if (existingFile && existingFile.size === stat.size && existingFile.hash) {
      maybe_skip: {
        const lastUpdateDate = lastUpdateTypes[path.extname(filePath)];
        if (lastUpdateDate && existingFile.lastUpdateDate < lastUpdateDate) {
          console.log(
            `Reprocessing ${dbPath} because indexing logic changed after ${
              formatDate(lastUpdateDate)
            }`,
          );
          break maybe_skip;
        }
        if (SHOULD_COMPRESS && existingFile.processed !== 2) {
          this.compressQueue!.add(
            { file: existingFile, path: dbPath },
            this.compressFile.bind(this),
          );
        }
        // File hasn't changed, no need to reprocess
        MediaFile.createFile({
          path: dbPath,
          date: dateToUse,
          hash: existingFile.hash,
          size: stat.size,
          duration: existingFile.duration,
          dimensions: existingFile.dimensions,
          content: existingFile.contents,
        });
        return;
      }
    }

    // Process the file
    log(`Processing file: ${dbPath}`);

    // Scrub location metadata if needed
    if (SHOULD_SCRUB) {
      if (await scrubLocationMetadata(filePath, stat)) {
        // Re-stat the file in case it was modified
        const newStat = await fsp.stat(filePath);
        stat.size = newStat.size;
      }
    }

    // Extract content
    const hash = await calculateHash(filePath);
    let content = "";
    if (filePath.endsWith(".lnk")) {
      content = (await fsp.readFile(filePath, "utf8")).trim();
    }
    const language = CODE_EXTENSIONS[path.extname(filePath)];
    if (language) {
      read_code: {
        // An issue is that .ts is an overloaded extension, shared between
        // 'transport stream' and 'typescript'.
        //
        // Filter used here is:
        // - more than 1mb
        // - invalid UTF-8
        if (stat.size > 1_000_000) break read_code;
        let code;
        const buf = await fsp.readFile(filePath);
        try {
          code = new TextDecoder("utf-8", { fatal: true }).decode(buf);
        } catch (error) {
          break read_code;
        }
        content = await highlightCode(code, language);
      }
    }
    if (!content && READ_CONTENTS_EXTENSIONS.has(path.extname(filePath))) {
      content = await fsp.readFile(filePath, "utf8");
    }
    // End extract content

    if (hash === existingFile?.hash) {
      MediaFile.createFile({
        path: dbPath,
        date: dateToUse,
        hash,
        size: stat.size,
        duration: existingFile.duration,
        dimensions: existingFile.dimensions,
        content,
      });
      return;
    } else if (existingFile) {
      if (existingFile.processed === 2) {
        if (BlobAsset.decrementOrDelete(existingFile.hash)) {
          log(
            `Deleted compressed asset ${existingFile.hash}.{gzip, zstd}`,
            true,
          );
          await fsp.unlink(
            path.join(
              COMPRESS_STORE,
              existingFile.hash.substring(0, 2),
              existingFile.hash + ".gz",
            ),
          );
          await fsp.unlink(
            path.join(
              COMPRESS_STORE,
              existingFile.hash.substring(0, 2),
              existingFile.hash + ".zstd",
            ),
          );
        }
      }
    }

    const [duration, dimensions] = await Promise.all([
      calculateDuration(filePath),
      calculateDimensions(filePath),
    ]);

    // Update database with all metadata
    MediaFile.createFile({
      path: dbPath,
      date: dateToUse,
      hash,
      size: stat.size,
      duration,
      dimensions,
      content,
    });

    if (SHOULD_COMPRESS) {
      this.compressQueue!.add(
        {
          file: MediaFile.getByPath(dbPath)!,
          path: dbPath,
        },
        this.compressFile.bind(this),
      );
    }
  }
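
  // Compressed blobs are content-addressed at
  // COMPRESS_STORE/<first two hex chars>/<hash>.{gz,zstd} and refcounted via
  // BlobAsset, so files with identical contents share one compressed copy.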
  async compressFile(
    s: Spinner,
    { file }: { file: MediaFile },
  ): Promise<void> {
    log(`Compressing file: ${file.path}`);
    if (DRY_RUN) return;
    const filePath = path.join(FILE_ROOT!, file.path);
    const hash = file.hash;
    const firstTwoChars = hash.substring(0, 2);
    const compressDir = `${COMPRESS_STORE}/${firstTwoChars}`;
    const compressPath = `${compressDir}/${hash}`;

    // Create directory structure if it doesn't exist
    await fsp.mkdir(compressDir, { recursive: true });

    // Take a reference on the blob; only the first reference does the work
    const blob = BlobAsset.putOrIncrement(hash);
    if (blob.refs > 1) {
      log(
        `Skipping compression of ${filePath} because it already exists in ${compressPath}`,
      );
      return;
    }
    // Check if the compressed artifact already exists
    if (existsSync(compressPath + ".gz")) {
      file.setCompressed(true);
      return;
    }
    try {
      // Compress with gzip and zstd in parallel
      const gzipProcess = Bun.spawn(["gzip", "-c", filePath, "-9"], {
        stdout: Bun.file(compressPath + ".gz"),
      });
      const zstdProcess = Bun.spawn(["zstd", "-c", filePath, "-9"], {
        stdout: Bun.file(compressPath + ".zstd"),
      });
      const [gzipExited, zstdExited] = await Promise.all([
        gzipProcess.exited,
        zstdProcess.exited,
      ]);
      assert(gzipExited === 0);
      assert(zstdExited === 0);
      assert(existsSync(compressPath + ".gz"));
      assert(existsSync(compressPath + ".zstd"));
      file.setCompressed(true);
    } catch (error) {
      console.error(`Error compressing file ${filePath}:`, error);
      BlobAsset.decrementOrDelete(hash);
      file.setCompressed(false);
    }
  }
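
  // The walk recurses through the queues: subdirectories are pushed back onto
  // dirQueue, while files are stat'ed here and handed to fileQueue.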
  async scanDirectory(s: Spinner, dirPath: string): Promise<void> {
    const dbPath = this.getDbPath(dirPath);
    this.visitedPaths.add(dbPath);

    // Create or update directory entry
    log(`Scanning directory: ${dbPath}`);
    if (!DRY_RUN) {
      MediaFile.createOrUpdateDirectory(dbPath);
    }
    try {
      const entries = await fsp.readdir(dirPath, { withFileTypes: true });
      // Process files and subdirectories
      for (const entry of entries) {
        const entryPath = path.join(dirPath, entry.name);
        // Skip hidden files and system files
        if (skipBasename(entry.name)) {
          continue;
        }
        if (entry.isDirectory()) {
          // Queue subdirectory for scanning
          this.dirQueue.add(entryPath, this.scanDirectory.bind(this));
        } else if (entry.isFile()) {
          // Queue file for processing
          const stat = await fsp.stat(entryPath);
          this.fileQueue.add(
            { path: entryPath, stat },
            async (s, item) => await this.scanFile(s, item.path, item.stat),
          );
        }
      }
    } catch (error) {
      console.error(`Error scanning directory ${dirPath}:`, error);
    }
  }

  // Runs after the walk, longest paths first, so each directory aggregates
  // children whose metadata is already final.
  async processDirectoryMetadata(dirPath: string): Promise<void> {
    const dbPath = this.getDbPath(dirPath);
    const dir = MediaFile.getByPath(dbPath);
    if (!dir || dir.kind !== MediaFile.Kind.directory) {
      return;
    }
    if (DRY_RUN) return;

    const children = dir.getChildren();

    // Calculate directory metadata
    let totalSize = 0;
    let newestDate = new Date(0);
    let allHashes = "";

    // Check for readme.txt
    let readmeContent = "";
    try {
      readmeContent = await fsp.readFile(
        path.join(dirPath, "readme.txt"),
        "utf8",
      );
    } catch (error: any) {
      console.info(`no readme ${dirPath}`);
      if (error.code !== "ENOENT") {
        console.error(`Error reading readme.txt in ${dirPath}:`, error);
      }
    }

    let dirsort: string[] | null = null;
    try {
      dirsort = (await fsp.readFile(path.join(dirPath, ".dirsort"), "utf8"))
        .split("\n")
        .map((x) => x.trim())
        .filter(Boolean);
    } catch (error: any) {
      if (error.code !== "ENOENT") {
        console.error(`Error reading .dirsort in ${dirPath}:`, error);
      }
    }

    if (await fsp.exists(path.join(dirPath, ".friends"))) {
      FilePermissions.setPermissions(dbPath, 1);
    } else {
      FilePermissions.setPermissions(dbPath, 0);
    }

    // Process children
    for (const child of children) {
      totalSize += child.size;
      allHashes += child.hash;
      // Update newest date, ignoring readme.txt
      if (!child.path.endsWith("/readme.txt") && child.date > newestDate) {
        newestDate = child.date;
      }
    }

    // Create a hash for the directory
    const dirHash = new Bun.CryptoHasher("sha1")
      .update(dbPath + allHashes)
      .digest("hex");

    // Update directory metadata
    MediaFile.markDirectoryProcessed({
      id: dir.id,
      timestamp: newestDate,
      contents: readmeContent,
      size: totalSize,
      hash: dirHash,
      dirsort,
    });
  }
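
  // Anything recorded in the previous index but not visited during this scan
  // is treated as deleted, and its compressed blob references are released.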
  async findDeletedFiles(): Promise<void> {
    if (DRY_RUN) return;
    // Find all paths that exist in the DB but not in the filesystem
    const deletedPaths = Array.from(this.previousPaths).filter(
      (path) => !this.visitedPaths.has(path),
    );
    for (const dbPath of deletedPaths) {
      const file = MediaFile.getByPath(dbPath);
      if (!file) continue;
      log(`Item Deleted: ${dbPath}`, true);
      if (file.processed === 2) {
        if (BlobAsset.decrementOrDelete(file.hash)) {
          log(`Deleted compressed asset ${file.hash}.{gzip, zstd}`, true);
          await fsp.unlink(
            path.join(
              COMPRESS_STORE,
              file.hash.substring(0, 2),
              file.hash + ".gz",
            ),
          );
          await fsp.unlink(
            path.join(
              COMPRESS_STORE,
              file.hash.substring(0, 2),
              file.hash + ".zstd",
            ),
          );
        }
      }
      MediaFile.deleteByPath(dbPath);
    }
  }

  async loadPreviousPaths(): Promise<void> {
    // Get all files and directories from the database.
    // This uses a custom query to get all paths at once.
    const getAllPathsQuery = cache
      .prepare(`SELECT path, kind FROM media_files`)
      .all() as {
        path: string;
        kind: MediaFile.Kind;
      }[];
    let dirs = 0;
    let files = 0;
    for (const row of getAllPathsQuery) {
      this.previousPaths.add(row.path);
      if (row.kind === MediaFile.Kind.directory) {
        dirs++;
      } else {
        files++;
      }
    }
    this.dirQueue.setEstimate(dirs);
    this.fileQueue.setEstimate(files);
    // log(`Loaded ${this.previousPaths.size} paths from database`, true);
  }

  async scan(): Promise<void> {
    log(`Starting file system scan in ${LOCAL_DIR}`, true);

    // Check if the root directory exists and is accessible
    try {
      const rootStat = await fsp.stat(LOCAL_DIR);
      if (!rootStat.isDirectory()) {
        throw new Error(`${LOCAL_DIR} is not a directory`);
      }
    } catch (error) {
      console.error(`Error: Cannot access root directory ${LOCAL_DIR}`, error);
      console.error(
        `Aborting scan to prevent database corruption. Please check if the volume is mounted.`,
      );
      process.exit(1);
    }

    await this.loadPreviousPaths();
    await this.dirQueue.add(LOCAL_DIR, this.scanDirectory.bind(this));
    await this.dirQueue.waitForCompletion();
    await this.fileQueue.waitForCompletion();
    await this.findDeletedFiles();

    // Post-process directories deepest-first so parents see finished children.
    const allDirs = Array.from(this.visitedPaths)
      .filter((path) => {
        const file = MediaFile.getByPath(path);
        return file && file.kind === MediaFile.Kind.directory;
      })
      .sort((a, b) => b.length - a.length);
    // (dirMetadataQueue is created but not currently used; directory metadata
    // is processed sequentially below.)
    const dirMetadataQueue = new AsyncQueue<string>("Directory Metadata", 10);
    for (const dirPath of allDirs) {
      await this.processDirectoryMetadata(this.getLocalPath(dirPath));
    }
    await dirMetadataQueue.waitForCompletion();

    if (SHOULD_COMPRESS) {
      await this.compressQueue!.waitForCompletion();
    }

    log("Scan completed successfully!", true);
  }
}

// Main execution
function showHelp() {
  console.log(`
MediaFile Scanner - Index filesystem content for paperclover.net

Environment variables:
  SCAN_FILE_ROOT   Required. Path to the directory to scan
  COMPRESS_STORE   Optional. Path to store compressed files
                   (default: .clover/compressed)

Options:
  --help           Show this help message
  --dry-run        Don't make any changes to the database
  --verbose        Show detailed output

Usage: bun ./media/scan.ts [options]
`);
  process.exit(0);
}

{
  // Show help if requested (showHelp exits the process)
  if (process.argv.includes("--help")) {
    showHelp();
  }

  // Check if the root directory exists before starting
  if (!existsSync(LOCAL_DIR)) {
    console.error(
      `Error: Root directory ${LOCAL_DIR} does not exist or is not accessible.`,
    );
    console.error(`Please check if the volume is mounted correctly.`);
    process.exit(1);
  }

  const startTime = Date.now();
  try {
    const scanner = new FileSystemScanner();
    await scanner.scan();
    const endTime = Date.now();
    log(`Scan completed in ${(endTime - startTime) / 1000} seconds`, true);

    const rootDir = MediaFile.getByPath("/")!;
    const totalEntries = cache
      .prepare(`SELECT COUNT(*) as count FROM media_files`)
      .get() as { count: number };
    const totalDuration = cache
      .prepare(`SELECT SUM(duration) as duration FROM media_files`)
      .get() as { duration: number };
    console.log();
    console.log("Global Stats");
    console.log(`  Entry count: ${totalEntries.count}`);
    console.log(`  Uncompressed size: ${formatSize(rootDir.size)}`);
    console.log(
      `  Total audio/video duration: ${
        (totalDuration.duration / 60 / 60).toFixed(1)
      } hours`,
    );
  } catch (error) {
    console.error("Error during scan:", error);
    process.exit(1);
  }
}
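
// Example invocation (paths illustrative):
//   SCAN_FILE_ROOT=/srv/files bun ./media/scan.ts --dry-run --verbose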