Files
Toju/server/src/routes/link-metadata.ts
2026-05-17 15:14:52 +02:00

292 lines
7.7 KiB
TypeScript

import { Router } from 'express';
import { getLinkPreviewConfig } from '../config/variables';
import { resolveAndValidateHost, safeFetch } from './ssrf-guard';
const router = Router();
const REQUEST_TIMEOUT_MS = 8000;
const MAX_HTML_BYTES = 512 * 1024;
const BYTES_PER_MB = 1024 * 1024;
const MAX_FIELD_LENGTH = 512;
interface CachedMetadata {
title?: string;
description?: string;
imageUrl?: string;
siteName?: string;
failed?: boolean;
cachedAt: number;
}
const metadataCache = new Map<string, CachedMetadata>();
let cacheByteEstimate = 0;
function estimateEntryBytes(key: string, entry: CachedMetadata): number {
let bytes = key.length * 2;
if (entry.title)
bytes += entry.title.length * 2;
if (entry.description)
bytes += entry.description.length * 2;
if (entry.imageUrl)
bytes += entry.imageUrl.length * 2;
if (entry.siteName)
bytes += entry.siteName.length * 2;
return bytes + 64;
}
function cacheSet(key: string, entry: CachedMetadata): void {
const config = getLinkPreviewConfig();
const maxBytes = config.maxCacheSizeMb * BYTES_PER_MB;
if (metadataCache.has(key)) {
const existing = metadataCache.get(key) as CachedMetadata;
cacheByteEstimate -= estimateEntryBytes(key, existing);
}
const entryBytes = estimateEntryBytes(key, entry);
while (cacheByteEstimate + entryBytes > maxBytes && metadataCache.size > 0) {
const oldest = metadataCache.keys().next().value as string;
const oldestEntry = metadataCache.get(oldest) as CachedMetadata;
cacheByteEstimate -= estimateEntryBytes(oldest, oldestEntry);
metadataCache.delete(oldest);
}
metadataCache.set(key, entry);
cacheByteEstimate += entryBytes;
}
function truncateField(value: string | undefined): string | undefined {
if (!value)
return value;
if (value.length <= MAX_FIELD_LENGTH)
return value;
return value.slice(0, MAX_FIELD_LENGTH);
}
function sanitizeImageUrl(rawUrl: string | undefined, baseUrl: string): string | undefined {
if (!rawUrl)
return undefined;
try {
const resolved = new URL(rawUrl, baseUrl);
if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:')
return undefined;
return resolved.href;
} catch {
return undefined;
}
}
function getMetaContent(html: string, patterns: RegExp[]): string | undefined {
for (const pattern of patterns) {
const match = pattern.exec(html);
if (match?.[1])
return decodeHtmlEntities(match[1].trim());
}
return undefined;
}
function decodeHtmlEntities(text: string): string {
return text
.replace(/&amp;/g, '&')
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'")
.replace(/&#x27;/g, "'")
.replace(/&#x2F;/g, '/');
}
function parseMetadata(html: string, url: string): CachedMetadata {
const title = getMetaContent(html, [
/<meta[^>]+property=["']og:title["'][^>]+content=["']([^"']+)["']/i,
/<meta[^>]+content=["']([^"']+)["'][^>]+property=["']og:title["']/i,
/<meta[^>]+name=["']twitter:title["'][^>]+content=["']([^"']+)["']/i,
/<meta[^>]+content=["']([^"']+)["'][^>]+name=["']twitter:title["']/i,
/<title[^>]*>([^<]+)<\/title>/i
]);
const description = getMetaContent(html, [
/<meta[^>]+property=["']og:description["'][^>]+content=["']([^"']+)["']/i,
/<meta[^>]+content=["']([^"']+)["'][^>]+property=["']og:description["']/i,
/<meta[^>]+name=["']twitter:description["'][^>]+content=["']([^"']+)["']/i,
/<meta[^>]+content=["']([^"']+)["'][^>]+name=["']twitter:description["']/i,
/<meta[^>]+name=["']description["'][^>]+content=["']([^"']+)["']/i,
/<meta[^>]+content=["']([^"']+)["'][^>]+name=["']description["']/i
]);
const rawImageUrl = getMetaContent(html, [
/<meta[^>]+property=["']og:image["'][^>]+content=["']([^"']+)["']/i,
/<meta[^>]+content=["']([^"']+)["'][^>]+property=["']og:image["']/i,
/<meta[^>]+name=["']twitter:image["'][^>]+content=["']([^"']+)["']/i,
/<meta[^>]+content=["']([^"']+)["'][^>]+name=["']twitter:image["']/i
]);
const siteNamePatterns = [
// eslint-disable-next-line @stylistic/js/array-element-newline
/<meta[^>]+property=["']og:site_name["'][^>]+content=["']([^"']+)["']/i,
/<meta[^>]+content=["']([^"']+)["'][^>]+property=["']og:site_name["']/i
];
const siteName = getMetaContent(html, siteNamePatterns);
const imageUrl = sanitizeImageUrl(rawImageUrl, url);
return {
title: truncateField(title),
description: truncateField(description),
imageUrl,
siteName: truncateField(siteName),
cachedAt: Date.now()
};
}
function evictExpired(): void {
const config = getLinkPreviewConfig();
if (config.cacheTtlMinutes === 0) {
cacheByteEstimate = 0;
metadataCache.clear();
return;
}
const ttlMs = config.cacheTtlMinutes * 60 * 1000;
const now = Date.now();
for (const [key, entry] of metadataCache) {
if (now - entry.cachedAt > ttlMs) {
cacheByteEstimate -= estimateEntryBytes(key, entry);
metadataCache.delete(key);
}
}
}
router.get('/link-metadata', async (req, res) => {
try {
const config = getLinkPreviewConfig();
if (!config.enabled) {
return res.status(403).json({ error: 'Link previews are disabled' });
}
const url = String(req.query.url || '');
if (!/^https?:\/\//i.test(url)) {
return res.status(400).json({ error: 'Invalid URL' });
}
const hostAllowed = await resolveAndValidateHost(url);
if (!hostAllowed) {
return res.status(400).json({ error: 'URL resolves to a blocked address' });
}
evictExpired();
const cached = metadataCache.get(url);
if (cached) {
const { cachedAt: _cachedAt, ...metadata } = cached;
return res.json(metadata);
}
console.log(`[Link Metadata] Cache miss for ${url}. Fetching...`);
const controller = new AbortController();
const timeout = setTimeout(() => controller.abort(), REQUEST_TIMEOUT_MS);
const response = await safeFetch(url, {
signal: controller.signal,
headers: {
'Accept': 'text/html',
'User-Agent': 'MetoYou-LinkPreview/1.0'
}
});
clearTimeout(timeout);
if (!response || !response.ok) {
const failed: CachedMetadata = { failed: true, cachedAt: Date.now() };
cacheSet(url, failed);
return res.json({ failed: true });
}
const contentType = response.headers.get('content-type') || '';
if (!contentType.includes('text/html')) {
const failed: CachedMetadata = { failed: true, cachedAt: Date.now() };
cacheSet(url, failed);
return res.json({ failed: true });
}
const reader = response.body?.getReader();
if (!reader) {
const failed: CachedMetadata = { failed: true, cachedAt: Date.now() };
cacheSet(url, failed);
return res.json({ failed: true });
}
const chunks: Uint8Array[] = [];
let totalBytes = 0;
let done = false;
while (!done) {
const result = await reader.read();
done = result.done;
if (result.value) {
chunks.push(result.value);
totalBytes += result.value.length;
if (totalBytes > MAX_HTML_BYTES) {
reader.cancel();
break;
}
}
}
const html = Buffer.concat(chunks).toString('utf-8');
const metadata = parseMetadata(html, url);
cacheSet(url, metadata);
const { cachedAt, ...result } = metadata;
res.json(result);
} catch (err) {
const url = String(req.query.url || '');
if (url) {
cacheSet(url, { failed: true, cachedAt: Date.now() });
}
if ((err as { name?: string })?.name === 'AbortError') {
return res.json({ failed: true });
}
console.error('Link metadata error:', err);
res.json({ failed: true });
}
});
export default router;