sporedimk/price-compare-api/dist/scraper/scraper.service.js
2025-06-24 21:48:03 +02:00

440 lines
20 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"use strict";
// Decorator-application helper (this has the shape of the helper the TypeScript compiler emits).
// Reuses `this.__decorate` when `this` already carries one; otherwise defines it here.
//   decorators: array of decorator functions (falsy entries are skipped)
//   target:     the class, or the object that owns the decorated member
//   key, desc:  member name and property descriptor; only meaningful when 3+ arguments are passed
// Returns the (possibly replaced) class or descriptor.
var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) {
// c = number of arguments actually passed. r starts as the target itself (fewer than 3 args),
// or as the descriptor, which is looked up on the target when `desc` is exactly null.
var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d;
// Delegate to Reflect.decorate when the runtime provides it.
if (typeof Reflect === "object" && typeof Reflect.decorate === "function") r = Reflect.decorate(decorators, target, key, desc);
// Otherwise apply the decorators from last to first; a decorator that returns a falsy value keeps the previous r.
else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r;
// With more than 3 arguments and a truthy result, write the descriptor back onto the target; r is returned in every case.
return c > 3 && r && Object.defineProperty(target, key, r), r;
};
/**
 * Metadata helper: forwards to `Reflect.metadata(key, value)` when the runtime
 * exposes that function, and yields `undefined` otherwise. An existing
 * `this.__metadata` takes precedence over this definition.
 */
var __metadata = (this && this.__metadata) || function (metadataKey, metadataValue) {
    const reflectSupportsMetadata = typeof Reflect === "object" && typeof Reflect.metadata === "function";
    return reflectSupportsMetadata ? Reflect.metadata(metadataKey, metadataValue) : undefined;
};
// Alias assigned together with the class definition below; the class body uses it to refer to itself (for the logger name).
var ScraperService_1;
// Flag the exports object as coming from an ES module.
Object.defineProperty(exports, "__esModule", { value: true });
// Placeholder value; the real export is assigned after the class definition.
exports.ScraperService = void 0;
const common_1 = require("@nestjs/common");
const prisma_service_1 = require("../prisma/prisma.service");
const schedule_1 = require("@nestjs/schedule");
const cheerio = require("cheerio");
const axios_1 = require("axios");
const config_1 = require("@nestjs/config");
/**
 * Scrapes product/price tables from the configured sources and stores the
 * results through Prisma.
 *
 * Two row layouts are supported, keyed by source ID:
 *   - source 2:  fetched over HTTP, ten-column table, promotion period in column 9;
 *   - source 12: "DIM" layout; currently fed from built-in mock HTML instead of
 *                the source URL (see `scrapeProducts`).
 */
let ScraperService = ScraperService_1 = class ScraperService {
    prisma;
    config;
    constructor(prisma, config) {
        this.prisma = prisma;
        this.config = config;
    }
    logger = new common_1.Logger(ScraperService_1.name);

    /**
     * Converts a price string to a number.
     *
     * A single `.` or `,` is treated as the decimal separator ("99,00" -> 99).
     * Unambiguous thousands grouping is also understood: "1.234,56" and
     * "1,234.56" both yield 1234.56, and "1.234.567" yields 1234567.
     * Surrounding text such as currency labels is ignored.
     *
     * @param {string} price - raw cell text
     * @returns {number} the parsed amount, or 0 when no number can be extracted
     */
    parsePrice(price) {
        const text = price == null ? '' : String(price);
        // Keep only digits and separators, then drop trailing punctuation ("99.00 ден." -> "99.00").
        const compact = text.replace(/[^\d.,]/g, '').replace(/[.,]+$/, '');
        // Group 1: thousands separator; group 2: decimal separator (when present).
        const grouped = /^\d{1,3}([.,])\d{3}(?:\1\d{3})*(?:([.,])\d+)?$/.exec(compact);
        let numeric;
        if (grouped && grouped[2] !== undefined && grouped[2] !== grouped[1]) {
            // Mixed separators: the last one is the decimal point, the others are grouping.
            const decimalAt = compact.lastIndexOf(grouped[2]);
            const wholePart = compact.slice(0, decimalAt).split(grouped[1]).join('');
            numeric = `${wholePart}.${compact.slice(decimalAt + 1)}`;
        }
        else if (grouped && grouped[2] === undefined && compact.split(grouped[1]).length > 2) {
            // The same separator repeated ("1.234.567") can only be grouping.
            numeric = compact.split(grouped[1]).join('');
        }
        else {
            // Ambiguous or plain input: first comma is the decimal point, everything else is stripped.
            numeric = text.replace(',', '.').replace(/[^\d.]/g, '');
        }
        const value = Number.parseFloat(numeric);
        // Never leak NaN (e.g. for "." or "") to callers.
        return Number.isFinite(value) ? value : 0;
    }

    /**
     * Builds a local-time Date and rejects anything that is not a real calendar
     * day (NaN parts, rolled-over values such as 31 February, years below 100).
     *
     * @param {number} day - day of month, 1-based
     * @param {number} month - month, 1-based
     * @param {number} year - full year
     * @returns {Date|null}
     */
    buildValidDate(day, month, year) {
        const candidate = new Date(year, month - 1, day);
        const isExact = candidate.getFullYear() === year &&
            candidate.getMonth() === month - 1 &&
            candidate.getDate() === day;
        return isExact ? candidate : null;
    }

    /**
     * Parses a `dd/mm/yyyy` (or `dd.mm.yyyy`) date string.
     *
     * @param {string} dateStr
     * @returns {Date|null} the date at local midnight, or null when the input is
     *   missing, malformed, or not a real calendar day
     */
    parseDate(dateStr) {
        if (typeof dateStr !== 'string') {
            return null;
        }
        const parts = /^\s*(\d{1,2})\s*[/.]\s*(\d{1,2})\s*[/.]\s*(\d{4})\s*$/.exec(dateStr);
        if (!parts) {
            return null;
        }
        return this.buildValidDate(Number(parts[1]), Number(parts[2]), Number(parts[3]));
    }

    /**
     * Parses a "start - end" date range (hyphen or dash separated).
     *
     * @param {string} dateRange
     * @returns {{start: (Date|null), end: (Date|null)}} each side is null when it
     *   is missing or cannot be parsed
     */
    parseDateRange(dateRange) {
        if (typeof dateRange !== 'string') {
            return { start: null, end: null };
        }
        const [startStr, endStr] = dateRange.split(/[-–—]/).map((part) => part.trim());
        return {
            start: this.parseDate(startStr),
            end: this.parseDate(endStr),
        };
    }

    /**
     * Returns the selector that locates the product table for a source.
     *
     * @param {number} sourceId
     * @returns {string}
     * @throws {Error} when the source ID has no known layout
     */
    getTableSelector(sourceId) {
        switch (sourceId) {
            case 2:
                return 'table:eq(2)';
            case 12:
                return 'table:first-of-type';
            default:
                throw new Error(`Unsupported source ID: ${sourceId}`);
        }
    }

    /**
     * Converts one table row into a product record.
     *
     * @param {Function} $ - cheerio instance for the loaded document
     * @param {*} rowElement - the `<tr>` element to parse
     * @param {number} sourceId - selects the column layout (2 or 12)
     * @returns {object} product fields: name, regularPrice, unitPrice,
     *   availability, description, category, discountedPrice,
     *   discountPercentage, promotionType, promotionStart, promotionEnd
     * @throws {Error} when the source ID has no known layout
     */
    parseProductRow($, rowElement, sourceId) {
        const cells = $(rowElement).find('td');
        const getText = (index) => cells.eq(index).text().trim();
        if (sourceId === 2) {
            return this.parseStandardRow(getText);
        }
        if (sourceId === 12) {
            return this.parseDimRow(cells, getText);
        }
        throw new Error(`Unsupported source ID: ${sourceId}`);
    }

    /**
     * Row layout of source 2: name, price, unit price, availability,
     * description, (unused), discounted price, discount %, promotion type,
     * promotion period.
     *
     * @param {function(number): string} getText - trimmed text of the n-th cell
     * @returns {object} product fields (see `parseProductRow`)
     */
    parseStandardRow(getText) {
        const { start, end } = this.parseDateRange(getText(9));
        const discountedPrice = this.parsePrice(getText(6));
        return {
            name: getText(0),
            regularPrice: this.parsePrice(getText(1)),
            unitPrice: getText(2) || null,
            availability: getText(3).toLowerCase() === 'да',
            description: getText(4) || '',
            category: 'Uncategorized',
            // An empty cell parses to 0; store "no discount" as null, like the DIM layout does.
            discountedPrice: discountedPrice > 0 ? discountedPrice : null,
            // Accept a decimal comma ("9,17%") as well as a decimal point.
            discountPercentage: parseFloat(getText(7).replace(',', '.')) || null,
            promotionType: getText(8) || null,
            promotionStart: start,
            promotionEnd: end,
        };
    }

    /**
     * Row layout of source 12 (DIM): code, name, price, unit price, (unused),
     * availability, original price, discount cell, promotion type, "valid until".
     *
     * @param {*} cells - cheerio selection of the row's `<td>` cells
     * @param {function(number): string} getText - trimmed text of the n-th cell
     * @returns {object} product fields (see `parseProductRow`)
     * @throws rethrows any error raised while parsing, after logging it
     */
    parseDimRow(cells, getText) {
        this.logger.log('Parsing DIM product row');
        try {
            const productCode = getText(0);
            this.logger.log(`Product code: ${productCode}`);
            const name = getText(1);
            this.logger.log(`Product name: ${name}`);
            const regularPrice = this.parsePrice(getText(2));
            this.logger.log(`Regular price: ${regularPrice}`);
            const unitPrice = getText(3) || null;
            this.logger.log(`Unit price: ${unitPrice}`);
            const availability = getText(5).toLowerCase() === 'да';
            this.logger.log(`Availability: ${availability}`);
            // NOTE(review): column 6 is only logged. In the mock data it is higher than
            // column 2, so it may be the pre-discount price that belongs in `regularPrice`
            // - confirm against the real page before changing what is stored.
            const originalPrice = this.parsePrice(getText(6));
            this.logger.log(`Original price: ${originalPrice}`);
            const { discountedPrice, discountPercentage } = this.parseDimDiscountCell(cells.eq(7));
            const promotionType = getText(8) || null;
            this.logger.log(`Promotion type: ${promotionType}`);
            const promotionEnd = this.parseDimPromotionEnd(getText(9));
            this.logger.log('Successfully parsed DIM product row');
            return {
                name,
                regularPrice,
                unitPrice,
                availability,
                // The DIM layout has no description column; the product code is stored instead.
                description: productCode || '',
                category: 'Uncategorized',
                discountedPrice,
                discountPercentage,
                promotionType,
                promotionStart: null,
                promotionEnd,
            };
        }
        catch (error) {
            this.logger.error(`Error parsing DIM product row: ${error instanceof Error ? error.message : 'Unknown error'}`);
            throw error;
        }
    }

    /**
     * Extracts the discounted price (`<strong>`) and the discount percentage
     * (`.discount-tag`) from a DIM discount cell.
     *
     * @param {*} discountCell - cheerio selection for the cell (may be empty)
     * @returns {{discountedPrice: (number|null), discountPercentage: (number|null)}}
     */
    parseDimDiscountCell(discountCell) {
        let discountedPrice = null;
        let discountPercentage = null;
        if (!discountCell.length) {
            this.logger.log('No discount cell found');
            return { discountedPrice, discountPercentage };
        }
        this.logger.log('Found discount cell');
        const priceElement = discountCell.find('strong');
        if (priceElement.length) {
            const parsedPrice = this.parsePrice(priceElement.text());
            discountedPrice = parsedPrice > 0 ? parsedPrice : null;
            this.logger.log(`Discounted price: ${discountedPrice}`);
        }
        else {
            this.logger.log('No discounted price element found');
        }
        const tagElement = discountCell.find('.discount-tag');
        if (tagElement.length) {
            const discountText = tagElement.text();
            this.logger.log(`Discount text: ${discountText}`);
            // First number in the tag; a decimal comma is accepted too.
            const percentageMatch = discountText.match(/\d*[.,]?\d+/);
            if (percentageMatch) {
                const parsedPercentage = Number.parseFloat(percentageMatch[0].replace(',', '.'));
                discountPercentage = Number.isNaN(parsedPercentage) ? null : parsedPercentage;
                this.logger.log(`Discount percentage: ${discountPercentage !== null ? discountPercentage + '%' : 'null'}`);
            }
            else {
                this.logger.log('No discount percentage found in text');
            }
        }
        else {
            this.logger.log('No discount tag element found');
        }
        return { discountedPrice, discountPercentage };
    }

    /**
     * Extracts the promotion end date from DIM "valid until" text, which
     * carries a `dd.mm.yyyy` date after the label.
     *
     * @param {string} promotionDateText - trimmed text of the date cell
     * @returns {Date|null} null when the label or a valid date is missing
     */
    parseDimPromotionEnd(promotionDateText) {
        this.logger.log(`Promotion date text: ${promotionDateText}`);
        if (!promotionDateText || !promotionDateText.includes('Важи до:')) {
            this.logger.log('No promotion end date found');
            return null;
        }
        const dateMatch = promotionDateText.match(/(\d{2})\.(\d{2})\.(\d{4})/);
        if (!dateMatch) {
            this.logger.log('No date match found in promotion date text');
            return null;
        }
        const [, day, month, year] = dateMatch;
        const promotionEnd = this.buildValidDate(Number(day), Number(month), Number(year));
        if (promotionEnd === null) {
            this.logger.log('Invalid date parsed from promotion date text');
            return null;
        }
        this.logger.log(`Promotion end date: ${promotionEnd.toISOString()}`);
        return promotionEnd;
    }

    /**
     * Scrapes every source stored in the database, one after another.
     * A failure in one source is logged and does not stop the others;
     * a failure to list the sources is logged and swallowed.
     *
     * @returns {Promise<void>}
     */
    async scrapeAllSources() {
        try {
            const sources = await this.prisma.source.findMany();
            for (const source of sources) {
                try {
                    await this.scrapeProducts(source.url, source.id);
                    this.logger.log(`Successfully scraped data from source: ${source.name}`);
                }
                catch (error) {
                    this.logger.error(`Failed to scrape source ${source.name}:`, error);
                }
            }
        }
        catch (error) {
            this.logger.error('Failed to fetch sources:', error);
        }
    }

    /**
     * Mock rows used for source 12 in place of a live fetch.
     * The markup contains data rows only - there is no header row.
     *
     * @returns {string} a sequence of `<tr>` elements
     */
    getDimMockHtml() {
        return `
<tr>
<td>011152</td>
<td>ГРАШОК? ПОДРАВКА 800Г ЛИМЕНКА</td>
<td>99.00</td>
<td>13.25 100ГР</td>
<td>/</td>
<td>Да</td>
<td>109.00</td>
<td><div class='discount-cell'>
<strong>99,00</strong>
<div class='discount-tag'>Попуст: 9.17%</div>
</div></td>
<td>Промоција</td>
<td>Важи до:<br>31.05.2025</td>
</tr>
<tr>
<td>011111</td>
<td>ВЕГЕТА 500ГР ПОДРАВКА</td>
<td>132.00</td>
<td>27.00 100ГР</td>
<td>/</td>
<td>Да</td>
<td>147.00</td>
<td><div class='discount-cell'>
<strong>132,00</strong>
<div class='discount-tag'>Попуст: 10.2%</div>
</div></td>
<td>Промоција</td>
<td>Важи до:<br>31.05.2025</td>
</tr>
<tr>
<td>030567</td>
<td>ВИВА ЛАДЕН ЧАЈ ПРАСКА 1.5Л</td>
<td>69.00</td>
<td>4.67 100МЛ</td>
<td>/</td>
<td>Да</td>
<td>75.00</td>
<td><div class='discount-cell'>
<strong>69,00</strong>
<div class='discount-tag'>Попуст: 8%</div>
</div></td>
<td>Промоција</td>
<td>Важи до:<br>31.05.2025</td>
</tr>
<tr>
<td>011098</td>
<td>СУПА АЛПСКА ПОДРАВКА 64Г</td>
<td>45.00</td>
<td>76.56 100ГР</td>
<td>/</td>
<td>Да</td>
<td>52.00</td>
<td><div class='discount-cell'>
<strong>45,00</strong>
<div class='discount-tag'>Попуст: 13.46%</div>
</div></td>
<td>Промоција</td>
<td>Важи до:<br>31.05.2025</td>
</tr>
<tr>
<td>038281</td>
<td>СУПА АЛПСКА КОКОШКИНА СО МЕСО 67Г</td>
<td>38.00</td>
<td>59.70 100ГР</td>
<td>/</td>
<td>Да</td>
<td>44.00</td>
<td><div class='discount-cell'>
<strong>38,00</strong>
<div class='discount-tag'>Попуст: 13.64%</div>
</div></td>
<td>Промоција</td>
<td>Важи до:<br>31.05.2025</td>
</tr>
`;
    }

    /**
     * Loads the DIM mock rows into cheerio, wrapped in a `<table>`.
     *
     * @returns {Function} cheerio instance for the mock document
     * @throws {Error} when the mock markup contains no rows
     */
    loadDimMockDocument() {
        this.logger.log(`Processing DIM source (ID: 12)`);
        this.logger.log('Using mock data for DIM products based on the real data structure');
        const mockHtml = this.getDimMockHtml();
        this.logger.log('Created mock HTML with 5 products based on real data structure');
        const $ = cheerio.load(`<table>${mockHtml}</table>`);
        this.logger.log('Loaded mock HTML into cheerio');
        const rowCount = $('tr').length;
        this.logger.log(`Found ${rowCount} table rows in the mock HTML`);
        if (rowCount === 0) {
            this.logger.error('No table rows found in the mock HTML');
            throw new Error('No table rows found in the mock HTML');
        }
        return $;
    }

    /**
     * Downloads a page with browser-like headers and loads it into cheerio.
     *
     * @param {string} sourceUrl
     * @returns {Promise<Function>} cheerio instance for the fetched document
     * @throws when the request fails or exceeds the timeout
     */
    async fetchDocument(sourceUrl) {
        const requestConfig = {
            headers: {
                Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/117.0',
            },
            // axios waits forever by default; bound the request so one dead host cannot stall a scheduled run.
            timeout: 30000,
        };
        const response = await axios_1.default.get(sourceUrl, requestConfig);
        return cheerio.load(response.data);
    }

    /**
     * Upserts a product (keyed by name + source) and appends a new price record.
     *
     * @param {object} scrapedProduct - result of `parseProductRow`
     * @param {number} sourceId
     * @returns {Promise<object>} the stored product
     * @throws rethrows the database error after logging it
     */
    async upsertProduct(scrapedProduct, sourceId) {
        // Every scrape appends a price row, whether the product is new or already known.
        const priceRecord = {
            regularPrice: scrapedProduct.regularPrice,
            discountedPrice: scrapedProduct.discountedPrice,
            discountPercentage: scrapedProduct.discountPercentage,
            unitPrice: scrapedProduct.unitPrice,
            promotionType: scrapedProduct.promotionType,
            promotionStart: scrapedProduct.promotionStart,
            promotionEnd: scrapedProduct.promotionEnd,
            sourceId,
        };
        try {
            return await this.prisma.product.upsert({
                where: {
                    name_sourceId: {
                        name: scrapedProduct.name,
                        sourceId,
                    },
                },
                create: {
                    name: scrapedProduct.name,
                    description: scrapedProduct.description,
                    category: scrapedProduct.category,
                    availability: scrapedProduct.availability,
                    sourceId,
                    prices: { create: { ...priceRecord } },
                },
                update: {
                    availability: scrapedProduct.availability,
                    description: scrapedProduct.description,
                    category: scrapedProduct.category,
                    prices: { create: { ...priceRecord } },
                },
            });
        }
        catch (dbError) {
            this.logger.error(`Database error while upserting product: ${scrapedProduct.name}`);
            this.logger.error(dbError instanceof Error ? dbError.message : 'Unknown database error');
            throw dbError;
        }
    }

    /**
     * Scrapes one source: loads its product table, parses every row and stores
     * the products. Rows that fail to parse or store are logged and skipped.
     *
     * Source 12 is served from built-in mock HTML; `sourceUrl` is not fetched
     * for it.
     *
     * @param {string} sourceUrl - page to fetch (ignored for source 12)
     * @param {number} sourceId
     * @returns {Promise<void>}
     * @throws when the page cannot be loaded, the source ID is unsupported, or
     *   the product table is missing
     */
    async scrapeProducts(sourceUrl, sourceId) {
        const startedAt = Date.now();
        this.logger.log(`Starting scraping process for source ID: ${sourceId}`);
        try {
            this.logger.log(`Fetching data from URL: ${sourceUrl}`);
            const usesMockData = sourceId === 12;
            const $ = usesMockData
                ? this.loadDimMockDocument()
                : await this.fetchDocument(sourceUrl);
            const tableSelector = this.getTableSelector(sourceId);
            this.logger.log(`Using table selector: ${tableSelector}`);
            const productTable = $(tableSelector);
            if (!productTable.length) {
                this.logger.error(`Product table not found using selector: ${tableSelector}`);
                throw new Error('Product table not found');
            }
            this.logger.log(`Product table found successfully`);
            const allRows = productTable.find('tr');
            // Fetched tables start with a header row. The mock markup has data rows only,
            // so skipping its first row would silently drop a product.
            const rows = usesMockData ? allRows : allRows.slice(1);
            this.logger.log(`Found ${rows.length} product rows`);
            let processedProducts = 0;
            this.logger.log(`Starting to process product rows`);
            for (const row of rows.toArray()) {
                try {
                    this.logger.log(`Parsing product row ${processedProducts + 1}/${rows.length}`);
                    const scrapedProduct = this.parseProductRow($, row, sourceId);
                    if (!scrapedProduct.name) {
                        this.logger.warn(`Skipping product with empty name`);
                        continue;
                    }
                    this.logger.log(`Processing product: ${scrapedProduct.name}`);
                    this.logger.log(`Product details: Regular price: ${scrapedProduct.regularPrice}, Discounted price: ${scrapedProduct.discountedPrice}, Discount percentage: ${scrapedProduct.discountPercentage}%`);
                    this.logger.log(`Upserting product in database: ${scrapedProduct.name}`);
                    const product = await this.upsertProduct(scrapedProduct, sourceId);
                    processedProducts++;
                    this.logger.log(`Successfully processed product: ${product.name}`);
                    this.logger.log(`Product ID: ${product.id}, Source ID: ${product.sourceId}`);
                }
                catch (error) {
                    // One bad row must not abort the whole source.
                    if (error instanceof Error) {
                        this.logger.error(`Failed to process row: ${error.message}`);
                    }
                    else {
                        this.logger.error('Failed to process row: Unknown error');
                    }
                }
            }
            const duration = (Date.now() - startedAt) / 1000;
            this.logger.log(`Scraping summary for source ID ${sourceId}:`);
            this.logger.log(`- Total rows found: ${rows.length}`);
            this.logger.log(`- Successfully processed products: ${processedProducts}`);
            // Counts both rows skipped for an empty name and rows that failed.
            this.logger.log(`- Skipped products: ${rows.length - processedProducts}`);
            this.logger.log(`- Duration: ${duration.toFixed(2)} seconds`);
            this.logger.log(`Scraping completed successfully for source ID ${sourceId}`);
        }
        catch (error) {
            if (error instanceof Error) {
                this.logger.error(`Failed to scrape products from source ${sourceId}: ${error.message}`);
            }
            else {
                this.logger.error(`Failed to scrape products from source ${sourceId}: Unknown error`);
            }
            throw error;
        }
    }

    /**
     * Scrapes a single source on demand.
     *
     * @param {number} sourceId
     * @returns {Promise<void>}
     * @throws {Error} when no source with that ID exists, or when scraping fails
     */
    async manualScrape(sourceId) {
        const source = await this.prisma.source.findUnique({
            where: { id: sourceId },
        });
        if (!source) {
            throw new Error(`Source with ID ${sourceId} not found`);
        }
        return this.scrapeProducts(source.url, source.id);
    }
};
// Publish the class before decorators run.
exports.ScraperService = ScraperService;
// Method decorator: attaches the schedule package's Cron decorator (EVERY_HOUR expression) to scrapeAllSources,
// plus design-time type metadata.
__decorate([
(0, schedule_1.Cron)(schedule_1.CronExpression.EVERY_HOUR),
__metadata("design:type", Function),
__metadata("design:paramtypes", []),
__metadata("design:returntype", Promise)
], ScraperService.prototype, "scrapeAllSources", null);
// Class decorator: applies Injectable() and records the constructor parameter types
// (PrismaService, ConfigService); the export and the self-alias are re-pointed at the decorated result.
exports.ScraperService = ScraperService = ScraperService_1 = __decorate([
(0, common_1.Injectable)(),
__metadata("design:paramtypes", [prisma_service_1.PrismaService,
config_1.ConfigService])
], ScraperService);
//# sourceMappingURL=scraper.service.js.map