diff --git a/.gitignore b/.gitignore index fe0d794..41e89a8 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,9 @@ dist/ *.tsbuildinfo out/ +# Scraper intermediate data +apps/scraper/data/ + # MinIO data (dev) minio-data/ diff --git a/apps/scraper/.env.example b/apps/scraper/.env.example new file mode 100644 index 0000000..c025cd1 --- /dev/null +++ b/apps/scraper/.env.example @@ -0,0 +1,8 @@ +# Payload CMS (for direct import mode) +PAYLOAD_API_URL=http://localhost:3000/api +PAYLOAD_EMAIL=admin@advdoors.ru +PAYLOAD_PASSWORD= + +# OpenRouter (for LLM processing) +OPENROUTER_API_KEY= +OPENROUTER_MODEL=google/gemini-3.1-pro-preview diff --git a/apps/scraper/package.json b/apps/scraper/package.json index 2d8e5f9..fee98ca 100644 --- a/apps/scraper/package.json +++ b/apps/scraper/package.json @@ -4,13 +4,17 @@ "private": true, "type": "module", "scripts": { - "dev": "tsx src/index.ts", + "dev": "tsx --env-file=.env src/index.ts", "build": "tsc", - "scrape": "tsx src/index.ts" + "scrape": "tsx --env-file=.env src/index.ts", + "scrape:raw": "tsx --env-file=.env src/index.ts raw", + "llm:process": "tsx --env-file=.env src/process.ts", + "import:processed": "tsx --env-file=.env src/import-processed.ts" }, "dependencies": { "@advdoors/shared": "workspace:*", "cheerio": "^1", + "openai": "^4.104.0", "undici": "^7" }, "devDependencies": { diff --git a/apps/scraper/src/config.ts b/apps/scraper/src/config.ts index e56daec..db919ae 100644 --- a/apps/scraper/src/config.ts +++ b/apps/scraper/src/config.ts @@ -21,6 +21,10 @@ export const PAYLOAD_API_URL = process.env.PAYLOAD_API_URL || "http://localhost:3001/api"; export const PAYLOAD_EMAIL = process.env.PAYLOAD_EMAIL || "admin@advdoors.ru"; -export const PAYLOAD_PASSWORD = process.env.PAYLOAD_PASSWORD || ""; +export const PAYLOAD_PASSWORD = process.env.PAYLOAD_PASSWORD || "admin"; export const REQUEST_DELAY_MS = 500; + +export const OPENROUTER_API_KEY = process.env.OPENROUTER_API_KEY || ""; +export const OPENROUTER_MODEL = + process.env.OPENROUTER_MODEL || "google/gemini-2.0-flash-001"; diff --git a/apps/scraper/src/dump.ts b/apps/scraper/src/dump.ts new file mode 100644 index 0000000..c73ef92 --- /dev/null +++ b/apps/scraper/src/dump.ts @@ -0,0 +1,29 @@ +import { mkdir, writeFile } from "node:fs/promises"; +import path from "node:path"; +import { fileURLToPath } from "node:url"; +import type { RawProduct } from "./llm/types.js"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const DATA_DIR = path.resolve(__dirname, "../data/raw"); + +function slugifyPath(text: string): string { + return text + .toLowerCase() + .replace(/[^a-zа-яё0-9]+/gi, "-") + .replace(/^-+|-+$/g, "") + .replace(/-+/g, "-"); +} + +export async function dumpCategory( + categoryName: string, + products: RawProduct[], +): Promise { + await mkdir(DATA_DIR, { recursive: true }); + + const filename = `${slugifyPath(categoryName)}.json`; + const filepath = path.join(DATA_DIR, filename); + + await writeFile(filepath, JSON.stringify(products, null, 2), "utf-8"); + console.log(` Wrote ${products.length} products → ${filepath}`); + return filepath; +} diff --git a/apps/scraper/src/extract.ts b/apps/scraper/src/extract.ts index e020521..553f906 100644 --- a/apps/scraper/src/extract.ts +++ b/apps/scraper/src/extract.ts @@ -6,11 +6,22 @@ export interface ProductDetail { articleNumber: string; price: number; discountPrice: number | null; + discountPercent: number | null; availability: "in-stock" | "made-to-order" | "coming-soon"; + frostResistance: number; shortDescription: string; technicalSpecs: string; + bodyText: string; imageUrls: string[]; options: Array<{ name: string; priceModifier: number; description: string }>; + sizeOptions: string[]; + directionOptions: string[]; + producer: string | null; +} + +function parsePrice(text: string): number { + const digits = text.replace(/[^\d]/g, ""); + return digits ? parseInt(digits, 10) : 0; } export async function extractProduct(url: string): Promise { @@ -21,40 +32,69 @@ export async function extractProduct(url: string): Promise { const name = $("h1").first().text().trim(); - const bodyText = $("body").text(); - const allPrices = [...bodyText.matchAll(/(\d[\d\s]*)\s*(?:руб|₽|РУБ)/gi)].map( - (m) => parseInt(m[1].replace(/\s/g, ""), 10), + // --- Prices from .item_price DOM --- + // 71070 руб. = current/discounted price + // 99000 руб. = original price + // = cart price fallback + const strongPrice = parsePrice( + $(".item_price .amount strong, .item_price strong").first().text(), + ); + const smallPrice = parsePrice($(".item_price small").first().text()); + const cartPrice = parsePrice( + String($("input[name=cpr]").first().val() || ""), ); - const validPrices = allPrices.filter((p) => p > 1000); - let price = validPrices[0] || 0; + let price = 0; let discountPrice: number | null = null; - if (validPrices.length >= 2 && validPrices[1] < validPrices[0]) { - price = validPrices[0]; - discountPrice = validPrices[1]; + if (strongPrice && smallPrice) { + price = smallPrice; + discountPrice = strongPrice; + } else if (strongPrice) { + price = strongPrice; + } else { + price = cartPrice; } - const availText = bodyText.toLowerCase(); - const availability: ProductDetail["availability"] = availText.includes( - "в наличии", - ) - ? "in-stock" - : availText.includes("на заказ") - ? "made-to-order" - : "in-stock"; + // --- Badges from #prodimg overlay divs --- + const badgeDivs = $("#prodimg div"); + let availability: ProductDetail["availability"] = "in-stock"; + let discountPercent: number | null = null; - let technicalSpecs = ""; - const specHeaders = $("h3, h4, strong, b").filter( - (_i, el) => - $(el).text().toLowerCase().includes("техническое") || - $(el).text().toLowerCase().includes("описание"), - ); - if (specHeaders.length > 0) { - const specParent = specHeaders.first().parent(); - technicalSpecs = specParent.text().trim().slice(0, 5000); + badgeDivs.each((_i, el) => { + const text = $(el).text().trim(); + const lower = text.toLowerCase(); + if (lower === "на заказ") availability = "made-to-order"; + else if (lower === "скоро") availability = "coming-soon"; + else if (lower === "в наличии") availability = "in-stock"; + + const discMatch = text.match(/^-(\d+)%$/); + if (discMatch) discountPercent = parseInt(discMatch[1], 10); + }); + + // --- Frost resistance from snowflake icon --- + let frostResistance = 0; + const flakeImg = $("#prodimg img[src*='flake']").first(); + if (flakeImg.length) { + const m = flakeImg.attr("src")?.match(/flake(\d)/); + if (m) frostResistance = parseInt(m[1], 10); } + // --- Size options from radio buttons --- + const sizeOptions: string[] = []; + $("input[name=size]").each((_i, el) => { + const val = $(el).attr("value"); + if (val && !sizeOptions.includes(val)) sizeOptions.push(val); + }); + + // --- Direction (orientation) options from radio buttons --- + const directionOptions: string[] = []; + $("input[name=direction]").each((_i, el) => { + const val = $(el).attr("value"); + if (val && !directionOptions.includes(val)) directionOptions.push(val); + }); + + // --- Images (keep existing logic) --- const imageUrls: string[] = []; const seenPaths = new Set(); const resizePrefixRe = /^\/[fi]w?\d+(?:h\d+)?\//; @@ -65,23 +105,27 @@ export async function extractProduct(url: string): Promise { function addImage(raw: string): void { if (!raw) return; - if (raw.includes("logo") || raw.includes("icon") || raw.includes("banner") || raw.includes("fav")) return; - if (!raw.includes("/pages/photos/") && !raw.includes("/pages/catalog/")) return; + if ( + raw.includes("logo") || + raw.includes("icon") || + raw.includes("banner") || + raw.includes("fav") + ) + return; + if (!raw.includes("/pages/photos/") && !raw.includes("/pages/catalog/")) + return; const canonical = normalizeImagePath(raw); if (seenPaths.has(canonical)) return; seenPaths.add(canonical); const highRes = `/iw800${canonical}`; - const fullUrl = `${BASE_URL}${highRes}`; - imageUrls.push(fullUrl); + imageUrls.push(`${BASE_URL}${highRes}`); } $("a[href]").each((_i, el) => { const href = $(el).attr("href"); - if (href && /\.(jpe?g|png|webp)$/i.test(href)) { - addImage(href); - } + if (href && /\.(jpe?g|png|webp)$/i.test(href)) addImage(href); }); $("img").each((_i, el) => { @@ -89,33 +133,118 @@ export async function extractProduct(url: string): Promise { if (src) addImage(src); }); + // --- Paid options from
    after "Платные опции:" heading --- const options: ProductDetail["options"] = []; - const optionMatches = [ - ...bodyText.matchAll( - /([^:•\n]+?):\s*\+?\s*(\d[\d\s.]*)\s*(?:рублей|руб)/gi, - ), - ]; - for (const match of optionMatches) { - const optName = match[1].trim(); - const optPrice = parseInt(match[2].replace(/[\s.]/g, ""), 10); - if (optName.length > 3 && optName.length < 100 && optPrice > 0) { - options.push({ - name: optName, - priceModifier: optPrice, - description: "", - }); + $(".product_inf1 strong, .product_inf1 b, .product_inf1 p").each((_i, el) => { + if (options.length > 0) return false; + if (!$(el).text().includes("Платные опции")) return; + + let block = $(el); + while (block.length && block.is("strong, b, em, span, a, i")) { + block = block.parent(); } + + const ul = block.nextAll("ul").first(); + if (!ul.length) return; + + ul.find("li").each((_j, li) => { + const text = $(li).text().trim(); + if (!text || text.length < 3) return; + + const priceMatch = text.match(/\+\s*([\d\s.,]+)\s*рубл/i); + const percentMatch = text.match(/\+\s*(\d+)\s*%/); + + let priceModifier = 0; + let description = ""; + + if (priceMatch) { + priceModifier = parseFloat( + priceMatch[1].replace(/\s/g, "").replace(",", "."), + ); + } else if (percentMatch) { + description = `+${percentMatch[1]}%`; + } + + const optName = text + .replace(/\+\s*[\d\s.,]+\s*(?:рублей|рубля|руб\.?)/gi, "") + .replace(/\+\s*\d+\s*%/g, "") + .replace(/[+:]+\s*$/, "") + .trim(); + + if (optName.length > 2) { + options.push({ name: optName, priceModifier, description }); + } + }); + }); + + // --- Remove script/style/noscript before text-based extractions --- + $("script, style, noscript").remove(); + + // --- Technical specs from #zz2 --- + let technicalSpecs = ""; + const zz2 = $("#zz2"); + if (zz2.length) { + technicalSpecs = zz2.text().trim().slice(0, 5000); + } else { + $("h3, h4").each((_i, el) => { + if (technicalSpecs) return false; + const t = $(el).text().toLowerCase(); + if (t.includes("техническ") || t.includes("характеристик")) { + technicalSpecs = $(el).parent().text().trim().slice(0, 5000); + } + }); } + // --- Short description from "Входит в комплект" section --- + let shortDescription = ""; + $("h3").each((_i, el) => { + if (shortDescription) return false; + if (!$(el).text().includes("Входит в комплект")) return; + + const parts: string[] = []; + let sibling = $(el).next(); + while (sibling.length && !sibling.is("h3")) { + const t = sibling.text().trim(); + if (t) parts.push(t); + sibling = sibling.next(); + } + shortDescription = parts.join("\n").slice(0, 3000); + }); + + // --- Producer from or

    containing "Производитель" --- + let producer: string | null = null; + $(".product_inf1 strong, .product_inf1 p").each((_i, el) => { + if (producer) return false; + const text = $(el).text().trim(); + if (!text.includes("Производитель")) return; + const m = text.match(/Производитель\s+(.+)/i); + if (m) { + producer = m[1].trim().replace(/\s+/g, " ").slice(0, 100) || null; + } + }); + + // --- Body text (cleaned, for LLM context) --- + const bodyText = $("body") + .text() + .replace(/\s+/g, " ") + .trim() + .slice(0, 15_000); + return { name, articleNumber, price, discountPrice, + discountPercent, availability, - shortDescription: "", + frostResistance, + shortDescription, technicalSpecs, + bodyText, imageUrls, options, + sizeOptions, + directionOptions, + producer, }; } diff --git a/apps/scraper/src/import-processed.ts b/apps/scraper/src/import-processed.ts new file mode 100644 index 0000000..d020574 --- /dev/null +++ b/apps/scraper/src/import-processed.ts @@ -0,0 +1,249 @@ +import { readdir, readFile } from "node:fs/promises"; +import path from "node:path"; +import { fileURLToPath } from "node:url"; +import { REQUEST_DELAY_MS } from "./config.js"; +import { downloadImage } from "./download-media.js"; +import { + login, + findOrCreateCategory, + createProduct, + uploadMedia, +} from "./import.js"; +import type { + RawProduct, + ProcessedProductFamily, + ProcessedVariant, +} from "./llm/types.js"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const RAW_DIR = path.resolve(__dirname, "../data/raw"); +const PROCESSED_DIR = path.resolve(__dirname, "../data/processed"); + +const MAX_IMAGES_PER_PRODUCT = 5; + +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +function slugify(text: string): string { + return text + .toLowerCase() + .replace(/[^a-zа-яё0-9]+/gi, "-") + .replace(/^-+|-+$/g, "") + .replace(/-+/g, "-"); +} + +function mergeFamily( + family: ProcessedProductFamily, + rawMap: Map, + categoryId: string, + imageIds: string[], +) { + const { variants } = family; + const firstRaw = rawMap.get(variants[0].articleNumber); + + const prices = variants.map((v) => v.price); + const minIdx = prices.indexOf(Math.min(...prices)); + const cheapest = variants[minIdx]; + + const orientations = new Set( + variants.map((v) => v.attributes.orientation).filter(Boolean), + ); + const mergedOrientation = + orientations.has("left") && orientations.has("right") + ? "universal" + : (orientations.values().next().value ?? null); + + const sizes = [ + ...new Set( + variants + .filter((v) => v.attributes.width && v.attributes.height) + .map((v) => `${v.attributes.width}x${v.attributes.height}`), + ), + ]; + + return { + name: family.baseModelName, + slug: slugify(`${family.baseModelName}-${variants[0].articleNumber}`), + articleNumber: variants[0].articleNumber, + brand: family.brand, + category: categoryId, + price: cheapest.price, + discountPrice: cheapest.discountPrice, + discountPercent: cheapest.discountPercent, + availability: variants.some((v) => v.availability === "in-stock") + ? "in-stock" + : variants[0].availability, + orientation: mergedOrientation, + sizeOptions: sizes.length > 0 ? sizes : undefined, + frostResistance: Math.max(...variants.map((v) => v.frostResistance)), + color: + variants.find((v) => v.attributes.color)?.attributes.color ?? null, + material: + variants.find((v) => v.attributes.material)?.attributes.material ?? null, + glassType: + variants.find((v) => v.attributes.glassType)?.attributes.glassType ?? + null, + producer: + variants.find((v) => v.attributes.producer)?.attributes.producer ?? null, + shortDescription: firstRaw?.shortDescription, + technicalSpecs: firstRaw?.technicalSpecs, + options: firstRaw?.rawOptions, + images: imageIds.length > 0 ? imageIds : undefined, + variants: variants.map((v) => ({ + articleNumber: v.articleNumber, + name: v.originalName, + width: v.attributes.width, + height: v.attributes.height, + orientation: v.attributes.orientation, + color: v.attributes.color, + price: v.price, + discountPrice: v.discountPrice, + availability: v.availability, + sourceUrl: v.sourceUrl, + })), + }; +} + +function collectUniqueImageUrls(variants: ProcessedVariant[]): string[] { + const seen = new Set(); + const urls: string[] = []; + for (const v of variants) { + for (const url of v.imageUrls) { + if (!seen.has(url)) { + seen.add(url); + urls.push(url); + } + } + } + return urls; +} + +async function downloadAndUploadImages( + imageUrls: string[], + familyName: string, + canonicalArticle: string, +): Promise { + const ids: string[] = []; + const limited = imageUrls.slice(0, MAX_IMAGES_PER_PRODUCT); + + for (let i = 0; i < limited.length; i++) { + const img = await downloadImage(limited[i], canonicalArticle, i); + if (img) { + const alt = `${familyName} — фото ${i + 1}`; + const mediaId = await uploadMedia(img.buffer, img.filename, img.contentType, alt); + if (mediaId) ids.push(mediaId); + } + if (i < limited.length - 1) await sleep(REQUEST_DELAY_MS); + } + return ids; +} + +async function importProcessed(): Promise { + console.log("=== ADVdoors — Import Processed Data ===\n"); + + let processedFiles: string[]; + try { + processedFiles = (await readdir(PROCESSED_DIR)).filter((f) => + f.endsWith(".json"), + ); + } catch { + console.error(`No processed data at ${PROCESSED_DIR}. Run 'llm-process' first.`); + process.exit(1); + } + + if (processedFiles.length === 0) { + console.error(`No JSON files in ${PROCESSED_DIR}. Run 'llm-process' first.`); + process.exit(1); + } + + console.log(`Found ${processedFiles.length} processed category files`); + await login(); + + const stats = { categories: 0, products: 0, images: 0, skipped: 0, errors: 0 }; + + for (const file of processedFiles) { + const procPath = path.join(PROCESSED_DIR, file); + const rawPath = path.join(RAW_DIR, file); + + console.log(`\n--- Importing: ${file} ---`); + + let families: ProcessedProductFamily[]; + try { + families = JSON.parse(await readFile(procPath, "utf-8")); + } catch (error) { + console.error(` Failed to read ${procPath}:`, error); + stats.errors++; + continue; + } + + const rawMap = new Map(); + try { + const rawProducts: RawProduct[] = JSON.parse( + await readFile(rawPath, "utf-8"), + ); + for (const rp of rawProducts) { + rawMap.set(rp.articleNumber, rp); + } + console.log(` Loaded ${rawMap.size} raw products for join`); + } catch { + console.warn(` No matching raw file at ${rawPath}, text fields will be empty`); + } + + for (const family of families) { + if (family.variants.length === 0) { + console.warn(` Skipping empty family: ${family.baseModelName}`); + stats.skipped++; + continue; + } + + try { + const categorySlug = slugify(family.categoryName); + const categoryId = await findOrCreateCategory( + family.categoryName, + categorySlug, + ); + stats.categories++; + + console.log( + `\n Family: ${family.baseModelName} (${family.variants.length} variants)`, + ); + + const uniqueUrls = collectUniqueImageUrls(family.variants); + const imageIds = await downloadAndUploadImages( + uniqueUrls, + family.baseModelName, + family.variants[0].articleNumber, + ); + stats.images += imageIds.length; + + const productData = mergeFamily(family, rawMap, categoryId, imageIds); + const result = await createProduct(productData); + + if (result) { + stats.products++; + } else { + stats.errors++; + } + } catch (error) { + console.error( + ` Error importing family ${family.baseModelName}:`, + error, + ); + stats.errors++; + } + } + } + + console.log("\n=== Import Complete ==="); + console.log(`Category lookups: ${stats.categories}`); + console.log(`Products created: ${stats.products}`); + console.log(`Images uploaded: ${stats.images}`); + console.log(`Families skipped: ${stats.skipped}`); + console.log(`Errors: ${stats.errors}`); +} + +importProcessed().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); diff --git a/apps/scraper/src/import.ts b/apps/scraper/src/import.ts index 2b782cd..639c3ce 100644 --- a/apps/scraper/src/import.ts +++ b/apps/scraper/src/import.ts @@ -156,11 +156,22 @@ export async function createProduct(data: { category?: string; price: number; discountPrice?: number | null; + discountPercent?: number | null; availability: string; + frostResistance?: number; + producer?: string | null; + width?: number | null; + height?: number | null; + material?: string | null; + color?: string | null; + glassType?: string | null; + orientation?: string | null; + sizeOptions?: string[]; shortDescription?: string; technicalSpecs?: string; options?: Array<{ name: string; priceModifier: number; description?: string }>; images?: string[]; + variants?: unknown; }): Promise { try { const existing = await payloadRequest( @@ -173,7 +184,7 @@ export async function createProduct(data: { return existing.docs[0].id; } - const payload: Record = { + const body: Record = { name: data.name, slug: data.slug, articleNumber: data.articleNumber, @@ -182,13 +193,25 @@ export async function createProduct(data: { availability: data.availability, }; - if (data.category) payload.category = data.category; - if (data.discountPrice) payload.discountPrice = data.discountPrice; - if (data.shortDescription) payload.shortDescription = data.shortDescription; - if (data.options?.length) payload.options = data.options; - if (data.images?.length) payload.images = data.images; + if (data.category) body.category = data.category; + if (data.discountPrice) body.discountPrice = data.discountPrice; + if (data.discountPercent) body.discountPercent = data.discountPercent; + if (data.frostResistance) body.frostResistance = data.frostResistance; + if (data.producer) body.producer = data.producer; + if (data.width) body.width = data.width; + if (data.height) body.height = data.height; + if (data.material) body.material = data.material; + if (data.color) body.color = data.color; + if (data.glassType) body.glassType = data.glassType; + if (data.orientation) body.orientation = data.orientation; + if (data.sizeOptions?.length) body.sizeOptions = data.sizeOptions; + if (data.shortDescription) body.shortDescription = data.shortDescription; + if (data.technicalSpecs) body.technicalSpecs = data.technicalSpecs; + if (data.options?.length) body.options = data.options; + if (data.images?.length) body.images = data.images; + if (data.variants) body.variants = data.variants; - const created = await payloadRequest("POST", "/products", payload); + const created = await payloadRequest("POST", "/products", body); console.log(` Created product: ${data.name}`); return created.doc?.id || null; } catch (error) { diff --git a/apps/scraper/src/index.ts b/apps/scraper/src/index.ts index 105b706..f3f9c0e 100644 --- a/apps/scraper/src/index.ts +++ b/apps/scraper/src/index.ts @@ -3,6 +3,8 @@ import { crawlAllPages, type CatalogListItem } from "./crawl.js"; import { extractProduct } from "./extract.js"; import { downloadImage } from "./download-media.js"; import { login, findOrCreateCategory, createProduct, uploadMedia } from "./import.js"; +import { dumpCategory } from "./dump.js"; +import type { RawProduct } from "./llm/types.js"; function slugify(text: string): string { return text @@ -23,8 +25,80 @@ function detectBrand(name: string, fallback: string | null): string { return fallback || "ALAVUS"; } -async function main() { - console.log("=== ADVdoors Scraper ===\n"); +async function scrapeRaw() { + console.log("=== ADVdoors Scraper — Raw Dump Mode ===\n"); + + const stats = { categories: 0, products: 0, errors: 0 }; + + for (const catalogPage of CATALOG_PAGES) { + console.log(`\n--- Crawling: ${catalogPage.category} (${catalogPage.url}) ---`); + + let items: CatalogListItem[]; + try { + items = await crawlAllPages(catalogPage.url); + } catch (error) { + console.error(` Failed to crawl ${catalogPage.url}:`, error); + stats.errors++; + continue; + } + + console.log(` Found ${items.length} products on listing pages`); + + const rawProducts: RawProduct[] = []; + + for (const item of items) { + try { + console.log(` Extracting: ${item.name}`); + const detail = await extractProduct(item.productUrl); + + rawProducts.push({ + sourceUrl: item.productUrl, + categoryUrl: catalogPage.url, + categoryName: catalogPage.category, + categoryBrand: catalogPage.brand, + name: detail.name || item.name, + articleNumber: detail.articleNumber, + price: detail.price || item.price, + discountPrice: detail.discountPrice ?? item.discountPrice, + discountPercent: detail.discountPercent, + availability: detail.availability || item.availability, + frostResistance: detail.frostResistance, + shortDescription: detail.shortDescription, + technicalSpecs: detail.technicalSpecs, + bodyText: detail.bodyText, + imageUrls: detail.imageUrls, + rawOptions: detail.options.map((o) => ({ + name: o.name, + priceModifier: o.priceModifier, + description: o.description, + })), + sizeOptions: detail.sizeOptions, + directionOptions: detail.directionOptions, + producer: detail.producer, + scrapedAt: new Date().toISOString(), + }); + + stats.products++; + } catch (error) { + console.error(` Error extracting ${item.name}:`, error); + stats.errors++; + } + } + + if (rawProducts.length > 0) { + await dumpCategory(catalogPage.category, rawProducts); + stats.categories++; + } + } + + console.log("\n=== Raw Scraping Complete ==="); + console.log(`Categories dumped: ${stats.categories}`); + console.log(`Products scraped: ${stats.products}`); + console.log(`Errors: ${stats.errors}`); +} + +async function scrapeAndImport() { + console.log("=== ADVdoors Scraper — Import Mode ===\n"); await login(); @@ -91,8 +165,13 @@ async function main() { category: categoryId, price: detail.price || item.price, discountPrice: detail.discountPrice || item.discountPrice, + discountPercent: detail.discountPercent, availability: detail.availability || item.availability, + frostResistance: detail.frostResistance, + producer: detail.producer, + sizeOptions: detail.sizeOptions.length > 0 ? detail.sizeOptions : undefined, shortDescription: detail.shortDescription, + technicalSpecs: detail.technicalSpecs, options: detail.options, images: imageIds.length > 0 ? imageIds : undefined, }); @@ -112,7 +191,16 @@ async function main() { console.log(`Errors: ${stats.errors}`); } -main().catch((error) => { - console.error("Fatal error:", error); - process.exit(1); -}); +const command = process.argv[2]; + +if (command === "raw") { + scrapeRaw().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); + }); +} else { + scrapeAndImport().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); + }); +} diff --git a/apps/scraper/src/llm/openrouter.ts b/apps/scraper/src/llm/openrouter.ts new file mode 100644 index 0000000..0102d90 --- /dev/null +++ b/apps/scraper/src/llm/openrouter.ts @@ -0,0 +1,45 @@ +import OpenAI from "openai"; +import { OPENROUTER_API_KEY, OPENROUTER_MODEL } from "../config.js"; + +let client: OpenAI | null = null; + +function getClient(): OpenAI { + if (!client) { + if (!OPENROUTER_API_KEY) { + throw new Error("OPENROUTER_API_KEY is not set"); + } + client = new OpenAI({ + baseURL: "https://openrouter.ai/api/v1", + apiKey: OPENROUTER_API_KEY, + defaultHeaders: { + "HTTP-Referer": "https://advdoors.ru", + "X-Title": "ADVdoors Scraper", + }, + }); + } + return client; +} + +export async function chatJSON( + systemPrompt: string, + userMessage: string, +): Promise { + const ai = getClient(); + + const response = await ai.chat.completions.create({ + model: OPENROUTER_MODEL, + response_format: { type: "json_object" }, + messages: [ + { role: "system", content: systemPrompt }, + { role: "user", content: userMessage }, + ], + temperature: 0.1, + }); + + const text = response.choices[0]?.message?.content; + if (!text) { + throw new Error("Empty response from OpenRouter"); + } + + return JSON.parse(text) as T; +} diff --git a/apps/scraper/src/llm/processor.ts b/apps/scraper/src/llm/processor.ts new file mode 100644 index 0000000..524cec1 --- /dev/null +++ b/apps/scraper/src/llm/processor.ts @@ -0,0 +1,205 @@ +import { readdir, readFile, mkdir, writeFile } from "node:fs/promises"; +import path from "node:path"; +import { fileURLToPath } from "node:url"; +import { chatJSON } from "./openrouter.js"; +import { + GROUPING_SYSTEM_PROMPT, + buildGroupingUserMessage, + EXTRACTION_SYSTEM_PROMPT, + buildExtractionUserMessage, +} from "./prompts.js"; +import type { + RawProduct, + GroupingResult, + AttributeResult, + ProcessedProductFamily, + ProcessedVariant, +} from "./types.js"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const RAW_DIR = path.resolve(__dirname, "../../data/raw"); +const PROCESSED_DIR = path.resolve(__dirname, "../../data/processed"); + +async function loadRawCategory(filepath: string): Promise { + const text = await readFile(filepath, "utf-8"); + return JSON.parse(text) as RawProduct[]; +} + +async function groupProducts( + products: RawProduct[], +): Promise { + const summaries = products.map((p) => ({ + articleNumber: p.articleNumber, + name: p.name, + })); + + console.log(` Grouping ${summaries.length} products...`); + return chatJSON( + GROUPING_SYSTEM_PROMPT, + buildGroupingUserMessage(summaries), + ); +} + +async function extractAttributes( + product: RawProduct, +): Promise { + return chatJSON( + EXTRACTION_SYSTEM_PROMPT, + buildExtractionUserMessage({ + articleNumber: product.articleNumber, + name: product.name, + technicalSpecs: product.technicalSpecs, + bodyText: product.bodyText, + frostResistance: product.frostResistance, + sizeOptions: product.sizeOptions ?? [], + directionOptions: product.directionOptions ?? [], + producer: product.producer ?? null, + }), + ); +} + +function detectBrand(products: RawProduct[]): string { + for (const p of products) { + if (p.categoryBrand) return p.categoryBrand; + } + const first = products[0]; + if (!first) return "ALAVUS"; + const upper = first.name.toUpperCase(); + if (upper.includes("KASKI")) return "KASKI"; + if (upper.includes("ALAVUS")) return "ALAVUS"; + if (upper.includes("SWEDOOR")) return "SWEDOOR"; + if (upper.includes("JELD-WEN")) return "JELD-WEN"; + if (upper.includes("MATTIOVI")) return "MATTIOVI"; + if (upper.includes("ABLOY")) return "ABLOY"; + return "ALAVUS"; +} + +async function processCategory( + filepath: string, +): Promise { + const products = await loadRawCategory(filepath); + const filename = path.basename(filepath, ".json"); + console.log(`\n--- Processing: ${filename} (${products.length} products) ---`); + + if (products.length === 0) return []; + + const grouping = await groupProducts(products); + const productMap = new Map(products.map((p) => [p.articleNumber, p])); + const families: ProcessedProductFamily[] = []; + + for (const group of grouping.groups) { + console.log(` Group: ${group.baseModelName} (${group.articles.length} variants)`); + + const variants: ProcessedVariant[] = []; + + for (const article of group.articles) { + const raw = productMap.get(article); + if (!raw) { + console.warn(` Article ${article} not found in raw data, skipping`); + continue; + } + + try { + console.log(` Extracting attributes for ${article}...`); + const result = await extractAttributes(raw); + + variants.push({ + articleNumber: raw.articleNumber, + originalName: raw.name, + sourceUrl: raw.sourceUrl, + price: raw.price, + discountPrice: raw.discountPrice, + discountPercent: raw.discountPercent ?? null, + availability: raw.availability, + frostResistance: raw.frostResistance ?? 0, + imageUrls: raw.imageUrls, + attributes: result.extractedAttributes, + }); + } catch (error) { + console.error(` Failed to extract attributes for ${article}:`, error); + variants.push({ + articleNumber: raw.articleNumber, + originalName: raw.name, + sourceUrl: raw.sourceUrl, + price: raw.price, + discountPrice: raw.discountPrice, + discountPercent: raw.discountPercent ?? null, + availability: raw.availability, + frostResistance: raw.frostResistance ?? 0, + imageUrls: raw.imageUrls, + attributes: { + width: null, + height: null, + color: null, + colors: { ral: [], rr: [], ncs: [], ttm: [], other: [] }, + orientation: null, + glassType: null, + material: null, + frostResistance: raw.frostResistance ?? 0, + sizeOptions: raw.sizeOptions ?? [], + producer: raw.producer ?? null, + }, + }); + } + } + + const groupProducts = group.articles + .map((a) => productMap.get(a)) + .filter(Boolean) as RawProduct[]; + + families.push({ + baseModelName: group.baseModelName, + brand: detectBrand(groupProducts), + categoryName: products[0]?.categoryName ?? filename, + description: "", + variants, + }); + } + + return families; +} + +export async function processAllCategories(): Promise { + console.log("=== ADVdoors LLM Processor ===\n"); + + let files: string[]; + try { + files = (await readdir(RAW_DIR)).filter((f) => f.endsWith(".json")); + } catch { + console.error(`No raw data found at ${RAW_DIR}. Run 'scrape-raw' first.`); + process.exit(1); + } + + if (files.length === 0) { + console.error(`No JSON files in ${RAW_DIR}. Run 'scrape-raw' first.`); + process.exit(1); + } + + await mkdir(PROCESSED_DIR, { recursive: true }); + + const stats = { files: 0, families: 0, variants: 0, errors: 0 }; + + for (const file of files) { + try { + const filepath = path.join(RAW_DIR, file); + const families = await processCategory(filepath); + + const outPath = path.join(PROCESSED_DIR, file); + await writeFile(outPath, JSON.stringify(families, null, 2), "utf-8"); + console.log(` Wrote ${families.length} families → ${outPath}`); + + stats.files++; + stats.families += families.length; + stats.variants += families.reduce((s, f) => s + f.variants.length, 0); + } catch (error) { + console.error(` Error processing ${file}:`, error); + stats.errors++; + } + } + + console.log("\n=== Processing Complete ==="); + console.log(`Files processed: ${stats.files}`); + console.log(`Product families: ${stats.families}`); + console.log(`Total variants: ${stats.variants}`); + console.log(`Errors: ${stats.errors}`); +} diff --git a/apps/scraper/src/llm/prompts.ts b/apps/scraper/src/llm/prompts.ts new file mode 100644 index 0000000..377794a --- /dev/null +++ b/apps/scraper/src/llm/prompts.ts @@ -0,0 +1,109 @@ +export const GROUPING_SYSTEM_PROMPT = `Ты — эксперт по каталогу финских дверей (KASKI, SWEDOOR/JELD-WEN, ALAVUS, MATTIOVI, ABLOY). + +Тебе будет дан список товаров из одной категории каталога. Многие из них — это варианты одной и той же модели двери, отличающиеся размером, цветом, ориентацией (левая/правая) или другими параметрами. + +Твоя задача — сгруппировать артикулы, которые относятся к одной и той же базовой модели двери. + +Правила: +- Если название отличается только размером (например, 900x2100 vs 1000x2100), цветом (белый/серый/коричневый), ориентацией (лев./прав.) — это одна модель. +- Если двери принципиально разные (разная конструкция, разный тип) — это разные модели. +- Если не уверен — лучше не объединять. +- baseModelName должно быть чистое, человекочитаемое название без размеров/цветов/ориентации. + +Ответь строго в формате JSON: +{ + "groups": [ + { + "baseModelName": "Чистое название модели", + "articles": ["арт1", "арт2", ...] + } + ] +}`; + +export function buildGroupingUserMessage( + products: Array<{ articleNumber: string; name: string }>, +): string { + const lines = products.map( + (p) => `- Артикул: ${p.articleNumber} | Название: ${p.name}`, + ); + return `Категория содержит ${products.length} товаров:\n\n${lines.join("\n")}`; +} + +export const EXTRACTION_SYSTEM_PROMPT = `Ты — эксперт по каталогу финских дверей. + +Тебе будет дана информация о товаре (дверь) со старого сайта. Извлеки структурированные атрибуты. + +Извлекай: +- width: ширина в мм (число или null) +- height: высота в мм (число или null) +- color: основной цвет/покрытие (строка или null), например "белый", "RR32 тёмно-коричневый", "серый RAL 7040" +- colors: объект со списками цветовых кодов, найденных в тексте: + - ral: массив RAL-кодов (например ["RAL 7040", "RAL 7024"]) + - rr: массив кодов Ruukki RR (например ["RR23", "RR32"]) + - ncs: массив кодов NCS (например ["NCS S 0502-Y"]) + - ttm: массив кодов TTM (например ["TTM 0965"]) + - other: массив других цветовых обозначений +- orientation: "left", "right" или "universal" (или null если неизвестно) +- glassType: тип остекления (строка или null), например "стеклопакет", "без стекла", "триплекс" +- material: материал (строка или null), например "сталь/дерево", "массив сосны", "МДФ" +- frostResistance: уровень морозостойкости (число 0-3, берётся из входных данных) +- sizeOptions: массив доступных размеров (берётся из входных данных) +- producer: производитель (строка или null, берётся из входных данных если есть) + +Правила: +- Размеры часто указаны в формате "900x2100" или "9x21" (в этом случае умножь на 100). +- Ориентация может быть указана как "ЛО" (левое открывание) = left, "ПО" (правое) = right. Если доступны оба направления — "universal". +- Если атрибут не удаётся определить — ставь null. +- НЕ выдумывай данные, только то что есть в тексте. +- Для colors ищи ВСЕ упоминания цветовых кодов (RAL, RR, NCS, TTM) по всему тексту, включая описание и опции. +- frostResistance, sizeOptions и producer бери из предоставленных данных, не меняй. + +Ответь строго в формате JSON: +{ + "articleNumber": "...", + "extractedAttributes": { + "width": ..., + "height": ..., + "color": ..., + "colors": { "ral": [...], "rr": [...], "ncs": [...], "ttm": [...], "other": [...] }, + "orientation": ..., + "glassType": ..., + "material": ..., + "frostResistance": ..., + "sizeOptions": [...], + "producer": ... + } +}`; + +export function buildExtractionUserMessage(product: { + articleNumber: string; + name: string; + technicalSpecs: string; + bodyText: string; + frostResistance: number; + sizeOptions: string[]; + directionOptions: string[]; + producer: string | null; +}): string { + const specs = product.technicalSpecs + ? `\nТехнические характеристики:\n${product.technicalSpecs.slice(0, 3000)}` + : ""; + const body = product.bodyText + ? `\nТекст страницы:\n${product.bodyText.slice(0, 5000)}` + : ""; + + const meta = [ + `Морозостойкость (снежинки): ${product.frostResistance}`, + product.sizeOptions.length > 0 + ? `Доступные размеры: ${product.sizeOptions.join(", ")}` + : null, + product.directionOptions.length > 0 + ? `Доступные направления: ${product.directionOptions.join(", ")}` + : null, + product.producer ? `Производитель: ${product.producer}` : null, + ] + .filter(Boolean) + .join("\n"); + + return `Артикул: ${product.articleNumber}\nНазвание: ${product.name}\n${meta}${specs}${body}`; +} diff --git a/apps/scraper/src/llm/types.ts b/apps/scraper/src/llm/types.ts new file mode 100644 index 0000000..fac97f4 --- /dev/null +++ b/apps/scraper/src/llm/types.ts @@ -0,0 +1,77 @@ +export interface RawProduct { + sourceUrl: string; + categoryUrl: string; + categoryName: string; + categoryBrand: string | null; + name: string; + articleNumber: string; + price: number; + discountPrice: number | null; + discountPercent: number | null; + availability: string; + frostResistance: number; + shortDescription: string; + technicalSpecs: string; + bodyText: string; + imageUrls: string[]; + rawOptions: Array<{ name: string; priceModifier: number; description: string }>; + sizeOptions: string[]; + directionOptions: string[]; + producer: string | null; + scrapedAt: string; +} + +export interface GroupingResult { + groups: Array<{ + baseModelName: string; + articles: string[]; + }>; +} + +export interface ColorCodes { + ral: string[]; + rr: string[]; + ncs: string[]; + ttm: string[]; + other: string[]; +} + +export interface ExtractedAttributes { + width: number | null; + height: number | null; + color: string | null; + colors: ColorCodes; + orientation: "left" | "right" | "universal" | null; + glassType: string | null; + material: string | null; + frostResistance: number; + sizeOptions: string[]; + producer: string | null; + [key: string]: unknown; +} + +export interface AttributeResult { + articleNumber: string; + extractedAttributes: ExtractedAttributes; +} + +export interface ProcessedVariant { + articleNumber: string; + originalName: string; + sourceUrl: string; + price: number; + discountPrice: number | null; + discountPercent: number | null; + availability: string; + frostResistance: number; + imageUrls: string[]; + attributes: ExtractedAttributes; +} + +export interface ProcessedProductFamily { + baseModelName: string; + brand: string; + categoryName: string; + description: string; + variants: ProcessedVariant[]; +} diff --git a/apps/scraper/src/process.ts b/apps/scraper/src/process.ts new file mode 100644 index 0000000..4b95b1a --- /dev/null +++ b/apps/scraper/src/process.ts @@ -0,0 +1,6 @@ +import { processAllCategories } from "./llm/processor.js"; + +processAllCategories().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); diff --git a/apps/web/src/app/(frontend)/catalog/page.tsx b/apps/web/src/app/(frontend)/catalog/page.tsx index f81f3b7..8ff918d 100644 --- a/apps/web/src/app/(frontend)/catalog/page.tsx +++ b/apps/web/src/app/(frontend)/catalog/page.tsx @@ -4,7 +4,7 @@ import type { Where } from "payload"; import config from "@payload-config"; import type { Metadata } from "next"; import { ProductCard } from "@/components/ProductCard"; -import { BRANDS } from "@advdoors/shared"; +import { BRANDS, AVAILABILITY_OPTIONS } from "@advdoors/shared"; export const dynamic = "force-dynamic"; @@ -18,6 +18,8 @@ interface CatalogPageProps { page?: string; brand?: string; category?: string; + availability?: string; + frost?: string; q?: string; }>; } @@ -34,6 +36,15 @@ export default async function CatalogPage({ searchParams }: CatalogPageProps) { if (params.category) { conditions.push({ "category.slug": { equals: params.category } }); } + if (params.availability) { + conditions.push({ availability: { equals: params.availability } }); + } + if (params.frost) { + const frostLevel = parseInt(params.frost, 10); + if (frostLevel > 0) { + conditions.push({ frostResistance: { greater_than_equal: frostLevel } }); + } + } if (params.q) { conditions.push({ or: [ @@ -65,6 +76,8 @@ export default async function CatalogPage({ searchParams }: CatalogPageProps) { const next: Record = {}; if (params.brand) next.brand = params.brand; if (params.category) next.category = params.category; + if (params.availability) next.availability = params.availability; + if (params.frost) next.frost = params.frost; if (params.q) next.q = params.q; for (const [k, v] of Object.entries(overrides)) { if (v) next[k] = v; @@ -74,6 +87,46 @@ export default async function CatalogPage({ searchParams }: CatalogPageProps) { return `/catalog${qs ? `?${qs}` : ""}`; } + const activeFilters: { key: string; label: string; clearKey: string }[] = []; + if (params.brand) + activeFilters.push({ + key: "brand", + label: params.brand, + clearKey: "brand", + }); + if (params.category) { + const catName = + categories.find((c) => c.slug === params.category)?.name || + params.category; + activeFilters.push({ + key: "category", + label: catName, + clearKey: "category", + }); + } + if (params.availability) { + const avLabel = + AVAILABILITY_OPTIONS.find((a) => a.value === params.availability) + ?.label || params.availability; + activeFilters.push({ + key: "availability", + label: avLabel, + clearKey: "availability", + }); + } + if (params.frost) + activeFilters.push({ + key: "frost", + label: `Морозостойкость ≥ ${params.frost}`, + clearKey: "frost", + }); + if (params.q) + activeFilters.push({ + key: "q", + label: `«${params.q}»`, + clearKey: "q", + }); + return (

    {/* Breadcrumb */} @@ -97,6 +150,22 @@ export default async function CatalogPage({ searchParams }: CatalogPageProps) {
+ + {/* Availability */} +
+

Наличие

+
    +
  • + + Любое + +
  • + {AVAILABILITY_OPTIONS.map((opt) => ( +
  • + + {opt.label} + +
  • + ))} +
+
+ + {/* Frost resistance */} +
+

+ Морозостойкость +

+
    +
  • + + Любая + +
  • + {[1, 2, 3].map((level) => ( +
  • + + {"❆".repeat(level)} от {level} + +
  • + ))} +
+
{/* Product grid */}
{/* Active filters */} - {(params.brand || params.category || params.q) && ( + {activeFilters.length > 0 && (
- {params.brand && ( + {activeFilters.map((f) => ( - {params.brand} × + {f.label} × - )} - {params.category && ( - - {categories.find((c) => c.slug === params.category)?.name || - params.category}{" "} - × - - )} - {params.q && ( - - “{params.q}” × - - )} + ))} { @@ -73,6 +89,7 @@ export default async function ProductPage({ params }: ProductPageProps) { articleNumber: string; price: number; discountPrice?: number | null; + frostResistance?: number | null; availability: string; images?: unknown[]; }>; @@ -86,8 +103,40 @@ export default async function ProductPage({ params }: ProductPageProps) { availability: product.availability, shortDescription: product.shortDescription, imageUrl: galleryImages[0]?.url, + material: product.material, + color: product.color, + producer: product.producer, }); + const sizeOptions: string[] = Array.isArray(product.sizeOptions) + ? product.sizeOptions + : []; + + const specs: { label: string; value: string }[] = []; + if (product.width && product.height) + specs.push({ label: "Размер", value: `${product.width} × ${product.height} мм` }); + else if (product.width) + specs.push({ label: "Ширина", value: `${product.width} мм` }); + else if (product.height) + specs.push({ label: "Высота", value: `${product.height} мм` }); + if (product.material) + specs.push({ label: "Материал", value: product.material }); + if (product.color) specs.push({ label: "Цвет", value: product.color }); + if (product.glassType) + specs.push({ label: "Остекление", value: product.glassType }); + if (product.orientation) + specs.push({ + label: "Открывание", + value: formatOrientation(product.orientation) || product.orientation, + }); + if (product.producer) + specs.push({ label: "Производитель", value: product.producer }); + if (product.frostResistance && product.frostResistance > 0) + specs.push({ + label: "Морозостойкость", + value: "❆".repeat(product.frostResistance) + ` (${product.frostResistance}/3)`, + }); + return (