// mirror of https://github.com/jakejarvis/hoot.git (synced 2025-10-18 20:14:25 -04:00)
import { eq } from "drizzle-orm";
import { getDomainTld } from "rdapper";
import { captureServer } from "@/lib/analytics/server";
import { acquireLockOrWaitForResult } from "@/lib/cache";
import { SOCIAL_PREVIEW_TTL_SECONDS, USER_AGENT } from "@/lib/constants";
import { toRegistrableDomain } from "@/lib/domain-server";
import { fetchWithTimeout } from "@/lib/fetch";
import { optimizeImageCover } from "@/lib/image";
import { ns, redis } from "@/lib/redis";
import type {
  GeneralMeta,
  OpenGraphMeta,
  RobotsTxt,
  SeoResponse,
  TwitterMeta,
} from "@/lib/schemas";
import { parseHtmlMeta, parseRobotsTxt, selectPreview } from "@/lib/seo";
import { makeImageFileName, uploadImage } from "@/lib/storage";
import { db } from "@/server/db/client";
import { seo as seoTable } from "@/server/db/schema";
import { ttlForSeo } from "@/server/db/ttl";
import { upsertDomain } from "@/server/repos/domains";
import { upsertSeo } from "@/server/repos/seo";

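// Target dimensions for cached social preview images (the commonly recommended 1200x630 Open Graph size).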
const SOCIAL_WIDTH = 1200;
const SOCIAL_HEIGHT = 630;

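/**
 * Resolve SEO metadata for a domain. A fresh row from Postgres is served
 * directly; otherwise the homepage and robots.txt are fetched, the social
 * preview image is cached, and the result is persisted before returning.
 *
 * Example: `const seo = await getSeo("example.com");`
 */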
export async function getSeo(domain: string): Promise<SeoResponse> {
  console.debug("[seo] start", { domain });
  // Fast path: DB
  const registrable = toRegistrableDomain(domain);
  const d = registrable
    ? await upsertDomain({
        name: registrable,
        tld: getDomainTld(registrable) ?? "",
        unicodeName: domain,
      })
    : null;
  const existing = d
    ? await db
        .select({
          sourceFinalUrl: seoTable.sourceFinalUrl,
          sourceStatus: seoTable.sourceStatus,
          metaOpenGraph: seoTable.metaOpenGraph,
          metaTwitter: seoTable.metaTwitter,
          metaGeneral: seoTable.metaGeneral,
          previewTitle: seoTable.previewTitle,
          previewDescription: seoTable.previewDescription,
          previewImageUrl: seoTable.previewImageUrl,
          previewImageUploadedUrl: seoTable.previewImageUploadedUrl,
          canonicalUrl: seoTable.canonicalUrl,
          robots: seoTable.robots,
          errors: seoTable.errors,
          expiresAt: seoTable.expiresAt,
        })
        .from(seoTable)
        .where(eq(seoTable.domainId, d.id))
    : ([] as Array<{
        sourceFinalUrl: string | null;
        sourceStatus: number | null;
        metaOpenGraph: OpenGraphMeta;
        metaTwitter: TwitterMeta;
        metaGeneral: GeneralMeta;
        previewTitle: string | null;
        previewDescription: string | null;
        previewImageUrl: string | null;
        previewImageUploadedUrl: string | null;
        canonicalUrl: string | null;
        robots: RobotsTxt;
        errors: Record<string, unknown>;
        expiresAt: Date | null;
      }>);
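  // Serve the stored row directly while it is still within its TTL.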
  if (existing[0] && (existing[0].expiresAt?.getTime?.() ?? 0) > Date.now()) {
    const preview = existing[0].canonicalUrl
      ? {
          title: existing[0].previewTitle ?? null,
          description: existing[0].previewDescription ?? null,
          image: existing[0].previewImageUrl ?? null,
          imageUploaded: existing[0].previewImageUploadedUrl ?? null,
          canonicalUrl: existing[0].canonicalUrl,
        }
      : null;
    // Ensure uploaded image URL is still valid; refresh via Redis-backed cache
    if (preview?.image) {
      try {
        const refreshed = await getOrCreateSocialPreviewImageUrl(
          registrable ?? domain,
          preview.image,
        );
        preview.imageUploaded = refreshed?.url ?? preview.imageUploaded ?? null;
      } catch {
        // keep as-is on transient errors
      }
    }
    const response: SeoResponse = {
      meta: {
        openGraph: existing[0].metaOpenGraph as OpenGraphMeta,
        twitter: existing[0].metaTwitter as TwitterMeta,
        general: existing[0].metaGeneral as GeneralMeta,
      },
      robots: existing[0].robots as RobotsTxt,
      preview,
      source: {
        finalUrl: existing[0].sourceFinalUrl ?? null,
        status: existing[0].sourceStatus ?? null,
      },
      errors: existing[0].errors as Record<string, unknown> as {
        html?: string;
        robots?: string;
      },
    };
    return response;
  }

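  // Cache miss or expired row: fetch the live site and rebuild the record.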
  let finalUrl: string = `https://${registrable ?? domain}/`;
  let status: number | null = null;
  let htmlError: string | undefined;
  let robotsError: string | undefined;

  let meta: ReturnType<typeof parseHtmlMeta> | null = null;
  let robots: ReturnType<typeof parseRobotsTxt> | null = null;

  // HTML fetch
  try {
    const res = await fetchWithTimeout(
      finalUrl,
      {
        method: "GET",
        redirect: "follow",
        headers: {
          Accept:
            "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
          "Accept-Language": "en",
          "User-Agent": USER_AGENT,
        },
      },
      { timeoutMs: 10000, retries: 1, backoffMs: 200 },
    );
    status = res.status;
    finalUrl = res.url;
    const contentType = res.headers.get("content-type") ?? "";
    if (!/^(text\/html|application\/xhtml\+xml)\b/i.test(contentType)) {
      htmlError = `Non-HTML content-type: ${contentType}`;
    } else {
      const text = await res.text();
      const max = 512 * 1024;
      const html = text.length > max ? text.slice(0, max) : text;
      meta = parseHtmlMeta(html, finalUrl);
    }
  } catch (err) {
    htmlError = String(err);
  }

  // robots.txt fetch (no Redis cache; stored in Postgres with row TTL)
  try {
    const robotsUrl = `https://${registrable ?? domain}/robots.txt`;
    const res = await fetchWithTimeout(
      robotsUrl,
      {
        method: "GET",
        headers: { Accept: "text/plain", "User-Agent": USER_AGENT },
      },
      { timeoutMs: 8000 },
    );
    if (res.ok) {
      const ct = res.headers.get("content-type") ?? "";
      if (/^text\/(plain|html|xml)?($|;|,)/i.test(ct)) {
        const txt = await res.text();
        robots = parseRobotsTxt(txt, { baseUrl: robotsUrl });
      } else {
        robotsError = `Unexpected robots content-type: ${ct}`;
      }
    } else {
      robotsError = `HTTP ${res.status}`;
    }
  } catch (err) {
    robotsError = String(err);
  }

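  // Derive the preview card (title, description, image, canonical URL) from the parsed meta tags.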
  const preview = meta ? selectPreview(meta, finalUrl) : null;

  // If a social image is present, store a cached copy via UploadThing for privacy
  if (preview?.image) {
    try {
      const stored = await getOrCreateSocialPreviewImageUrl(
        registrable ?? domain,
        preview.image,
      );
      // Preserve original image URL for meta display; attach uploaded URL for rendering
      preview.imageUploaded = stored?.url ?? null;
    } catch {
      // On failure, avoid rendering external image URL
      preview.imageUploaded = null;
    }
  }

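  // Assemble the response; the errors key is only attached when the HTML or robots.txt fetch failed.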
  const response: SeoResponse = {
    meta,
    robots,
    preview,
    source: { finalUrl, status },
    ...(htmlError || robotsError
      ? {
          errors: {
            ...(htmlError ? { html: htmlError } : {}),
            ...(robotsError ? { robots: robotsError } : {}),
          },
        }
      : {}),
  };

  // Persist to Postgres only when we have a domainId
  const now = new Date();
  if (d) {
    await upsertSeo({
      domainId: d.id,
      sourceFinalUrl: response.source.finalUrl ?? null,
      sourceStatus: response.source.status ?? null,
      metaOpenGraph: response.meta?.openGraph ?? ({} as OpenGraphMeta),
      metaTwitter: response.meta?.twitter ?? ({} as TwitterMeta),
      metaGeneral: response.meta?.general ?? ({} as GeneralMeta),
      previewTitle: response.preview?.title ?? null,
      previewDescription: response.preview?.description ?? null,
      previewImageUrl: response.preview?.image ?? null,
      previewImageUploadedUrl: response.preview?.imageUploaded ?? null,
      canonicalUrl: response.preview?.canonicalUrl ?? null,
      robots: robots ?? ({} as RobotsTxt),
      robotsSitemaps: response.robots?.sitemaps ?? [],
      errors: response.errors ?? {},
      fetchedAt: now,
      expiresAt: ttlForSeo(now),
    });
  }

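  // Report the fetch outcome to server-side analytics and logs.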
  await captureServer("seo_fetch", {
    domain: registrable ?? domain,
    status: status ?? -1,
    has_meta: !!meta,
    has_robots: !!robots,
    has_errors: Boolean(htmlError || robotsError),
  });

  console.info("[seo] ok", {
    domain: registrable ?? domain,
    status: status ?? -1,
    has_meta: !!meta,
    has_robots: !!robots,
    has_errors: Boolean(htmlError || robotsError),
  });

  return response;
}

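/**
 * Fetch, resize, and re-host a page's social preview image, returning the
 * hosted URL. Results are indexed in Redis so repeat requests reuse the same
 * upload, and a lock ensures only one worker processes a given image at a time.
 */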
async function getOrCreateSocialPreviewImageUrl(
  domain: string,
  imageUrl: string,
): Promise<{ url: string | null }> {
  const startedAt = Date.now();
  const lower = domain.toLowerCase();
  const fileId = makeImageFileName(
    "social",
    lower,
    SOCIAL_WIDTH,
    SOCIAL_HEIGHT,
    imageUrl,
  );

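  // indexKey remembers the uploaded URL for reuse; lockKey serializes concurrent uploads of the same image.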
  const indexKey = ns(
    "seo",
    "image-url",
    lower,
    fileId,
    `${SOCIAL_WIDTH}x${SOCIAL_HEIGHT}`,
  );
  const lockKey = ns(
    "lock",
    "seo-image",
    lower,
    fileId,
    `${SOCIAL_WIDTH}x${SOCIAL_HEIGHT}`,
  );

  // 1) Check Redis index first
  try {
    const raw = (await redis.get(indexKey)) as { url?: unknown } | null;
    if (
      raw &&
      typeof raw === "object" &&
      typeof (raw as { url?: unknown }).url === "string"
    ) {
      await captureServer("seo_image", {
        domain: lower,
        width: SOCIAL_WIDTH,
        height: SOCIAL_HEIGHT,
        source: "redis",
        duration_ms: Date.now() - startedAt,
        outcome: "ok",
        cache: "hit",
      });
      return { url: (raw as { url: string }).url };
    }
  } catch {
    // ignore and continue
  }

  // 2) Acquire lock or wait for another process to complete
  const lockResult = await acquireLockOrWaitForResult<{ url: string }>({
    lockKey,
    resultKey: indexKey,
    lockTtl: 30,
  });

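  // Another worker holds the lock: reuse its result if it finished, otherwise give up without an image.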
  if (!lockResult.acquired) {
    if (lockResult.cachedResult?.url) {
      await captureServer("seo_image", {
        domain: lower,
        width: SOCIAL_WIDTH,
        height: SOCIAL_HEIGHT,
        source: "redis_wait",
        duration_ms: Date.now() - startedAt,
        outcome: "ok",
        cache: "wait",
      });
      return { url: lockResult.cachedResult.url };
    }
    return { url: null };
  }

  // 3) We acquired the lock - fetch, process, upload
  try {
    const res = await fetchWithTimeout(
      imageUrl,
      {
        method: "GET",
        headers: {
          Accept:
            "image/avif,image/webp,image/png,image/jpeg,image/*;q=0.9,*/*;q=0.8",
          "User-Agent": USER_AGENT,
        },
      },
      { timeoutMs: 8000 },
    );

    if (!res.ok) return { url: null };
    const contentType = res.headers.get("content-type") ?? "";
    if (!/image\//.test(contentType)) return { url: null };

    const ab = await res.arrayBuffer();
    const raw = Buffer.from(ab);

    const image = await optimizeImageCover(raw, SOCIAL_WIDTH, SOCIAL_HEIGHT);
    if (!image || image.length === 0) return { url: null };

    const { url, key } = await uploadImage({
      kind: "social",
      domain: lower,
      width: SOCIAL_WIDTH,
      height: SOCIAL_HEIGHT,
      buffer: image,
    });

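    // Best-effort: remember the uploaded URL for future requests and add the object key to the "purge" set, scored by its expiry time.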
    try {
      const ttl = SOCIAL_PREVIEW_TTL_SECONDS;
      const expiresAtMs = Date.now() + ttl * 1000;
      await redis.set(indexKey, { url, key, expiresAtMs }, { ex: ttl });
      await redis.zadd(ns("purge", "social"), {
        score: expiresAtMs,
        member: key,
      });
    } catch {}

    await captureServer("seo_image", {
      domain: lower,
      width: SOCIAL_WIDTH,
      height: SOCIAL_HEIGHT,
      source: "upload",
      duration_ms: Date.now() - startedAt,
      outcome: "ok",
      cache: "store",
    });

    return { url };
  } catch {
    return { url: null };
  } finally {
    try {
      await redis.del(lockKey);
    } catch {}
  }
}