// File-viewer header (extraction residue): 440 lines, 20 KiB, JavaScript
"use strict";
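// NOTE(review): the two helpers below look like TypeScript-emitted decorator boilerplate — confirm before hand-editing.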
/**
 * Applies a list of decorators to a class or to a class member.
 *
 * Reuses an already-installed `this.__decorate` when one exists; otherwise the
 * fallback below is used.
 *
 * Call shapes (distinguished by the number of arguments):
 *   - 2 args `(decorators, target)`: class decoration; each decorator receives
 *     the current result and may return a replacement.
 *   - 3 args `(decorators, target, key)`: decorators are called as
 *     `d(target, key)`.
 *   - 4 args `(decorators, target, key, desc)`: decorators are called as
 *     `d(target, key, descriptor)`; the final descriptor is installed on
 *     `target[key]`. Passing `desc === null` means "look up the own property
 *     descriptor of `target[key]`".
 *
 * @param {Function[]} decorators Decorators, applied from last to first.
 * @param {object|Function} target Class constructor or prototype being decorated.
 * @param {string|symbol} [key] Member name (member decoration only).
 * @param {PropertyDescriptor|null} [desc] Member descriptor, or null to look it up.
 * @returns {*} The decorated class, or the resulting descriptor for members.
 */
var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) {
    var c = arguments.length;
    // `r` is the running result: the class itself, or the member's descriptor.
    var r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc;
    var d;
    if (typeof Reflect === "object" && typeof Reflect.decorate === "function") {
        // Delegate when the environment provides Reflect.decorate.
        r = Reflect.decorate(decorators, target, key, desc);
    }
    else {
        // Walk the list backwards so the decorator closest to the declaration runs first.
        for (var i = decorators.length - 1; i >= 0; i--) {
            if (d = decorators[i]) {
                r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r;
            }
        }
    }
    // Only the 4-argument form writes the resulting descriptor back onto the target.
    return c > 3 && r && Object.defineProperty(target, key, r), r;
};
/**
 * Produces a metadata decorator for the given key/value pair.
 *
 * Reuses an already-installed `this.__metadata` when one exists. The fallback
 * forwards to `Reflect.metadata` when the environment defines it and otherwise
 * returns `undefined`, which `__decorate` skips.
 *
 * @param {*} k Metadata key (the file uses "design:type", "design:paramtypes", "design:returntype").
 * @param {*} v Metadata value.
 * @returns {Function|undefined} Whatever `Reflect.metadata(k, v)` returns, or undefined when unavailable.
 */
var __metadata = (this && this.__metadata) || function (k, v) {
    if (typeof Reflect === "object" && typeof Reflect.metadata === "function") {
        return Reflect.metadata(k, v);
    }
};
// Alias for the ScraperService class; it is assigned further down the file.
var ScraperService_1;
// Mark the CommonJS exports object as an ES-module interop target.
Object.defineProperty(exports, "__esModule", { value: true });
// Placeholder export; replaced with the real class once it has been defined.
exports.ScraperService = void 0;
const common_1 = require("@nestjs/common");
const prisma_service_1 = require("../prisma/prisma.service");
const schedule_1 = require("@nestjs/schedule");
const cheerio = require("cheerio");
const axios_1 = require("axios");
const config_1 = require("@nestjs/config");
/**
 * Hard-coded sample rows used instead of a live fetch for source 12 (DIM).
 * The fragment contains product rows only — there is NO header row — and is
 * wrapped in a <table> element before being parsed.
 */
const DIM_MOCK_ROWS_HTML = `
    <tr>
        <td>011152</td>
        <td>ГРАШОК? ПОДРАВКА 800Г ЛИМЕНКА</td>
        <td>99.00</td>
        <td>13.25 100ГР</td>
        <td>/</td>
        <td>Да</td>
        <td>109.00</td>
        <td><div class='discount-cell'>
            <strong>99,00</strong>
            <div class='discount-tag'>Попуст: 9.17%</div>
        </div></td>
        <td>Промоција</td>
        <td>Важи до:<br>31.05.2025</td>
    </tr>
    <tr>
        <td>011111</td>
        <td>ВЕГЕТА 500ГР ПОДРАВКА</td>
        <td>132.00</td>
        <td>27.00 100ГР</td>
        <td>/</td>
        <td>Да</td>
        <td>147.00</td>
        <td><div class='discount-cell'>
            <strong>132,00</strong>
            <div class='discount-tag'>Попуст: 10.2%</div>
        </div></td>
        <td>Промоција</td>
        <td>Важи до:<br>31.05.2025</td>
    </tr>
    <tr>
        <td>030567</td>
        <td>ВИВА ЛАДЕН ЧАЈ ПРАСКА 1.5Л</td>
        <td>69.00</td>
        <td>4.67 100МЛ</td>
        <td>/</td>
        <td>Да</td>
        <td>75.00</td>
        <td><div class='discount-cell'>
            <strong>69,00</strong>
            <div class='discount-tag'>Попуст: 8%</div>
        </div></td>
        <td>Промоција</td>
        <td>Важи до:<br>31.05.2025</td>
    </tr>
    <tr>
        <td>011098</td>
        <td>СУПА АЛПСКА ПОДРАВКА 64Г</td>
        <td>45.00</td>
        <td>76.56 100ГР</td>
        <td>/</td>
        <td>Да</td>
        <td>52.00</td>
        <td><div class='discount-cell'>
            <strong>45,00</strong>
            <div class='discount-tag'>Попуст: 13.46%</div>
        </div></td>
        <td>Промоција</td>
        <td>Важи до:<br>31.05.2025</td>
    </tr>
    <tr>
        <td>038281</td>
        <td>СУПА АЛПСКА КОКОШКИНА СО МЕСО 67Г</td>
        <td>38.00</td>
        <td>59.70 100ГР</td>
        <td>/</td>
        <td>Да</td>
        <td>44.00</td>
        <td><div class='discount-cell'>
            <strong>38,00</strong>
            <div class='discount-tag'>Попуст: 13.64%</div>
        </div></td>
        <td>Промоција</td>
        <td>Важи до:<br>31.05.2025</td>
    </tr>
`;
/**
 * Scrapes product/price tables from the sources stored in the database and
 * upserts one product (plus a new price record) per table row.
 *
 * Two source layouts are supported, keyed by source ID: 2 and 12 ("DIM").
 */
let ScraperService = class ScraperService {
    prisma;
    config;
    // The logger context uses the class's own inner binding, so evaluating the
    // class does not depend on any alias declared outside this statement.
    logger = new common_1.Logger(ScraperService.name);
    /**
     * @param prisma Database client; `source.findMany`, `source.findUnique` and `product.upsert` are used.
     * @param config Configuration service (stored, not read by the methods below).
     */
    constructor(prisma, config) {
        this.prisma = prisma;
        this.config = config;
    }
    /**
     * Converts a price cell's text to a number.
     *
     * Everything except digits, dots and commas is discarded, as are leading
     * and trailing separators (e.g. the dot of a currency abbreviation).
     * When both "." and "," occur, the right-most one is the decimal mark and
     * the others are grouping marks. A separator kind that occurs more than
     * once on its own is treated as grouping only. A single separator is the
     * decimal mark.
     *
     * @param {string} price Raw cell text.
     * @returns {number} Parsed price, or 0 when no number can be extracted.
     */
    parsePrice(price) {
        if (typeof price !== 'string') {
            return 0;
        }
        const compact = price
            .replace(/[^\d.,]/g, '')
            .replace(/^[.,]+|[.,]+$/g, '');
        if (!compact) {
            return 0;
        }
        const lastDot = compact.lastIndexOf('.');
        const lastComma = compact.lastIndexOf(',');
        const lastSeparator = Math.max(lastDot, lastComma);
        let decimalIndex = -1;
        if (lastDot !== -1 && lastComma !== -1) {
            decimalIndex = lastSeparator;
        }
        else if (lastSeparator !== -1 && compact.indexOf(compact[lastSeparator]) === lastSeparator) {
            // Exactly one separator in the whole string: it is the decimal mark.
            decimalIndex = lastSeparator;
        }
        const digitsOnly = (text) => text.replace(/[.,]/g, '');
        const normalized = decimalIndex === -1
            ? digitsOnly(compact)
            : `${digitsOnly(compact.slice(0, decimalIndex))}.${digitsOnly(compact.slice(decimalIndex + 1))}`;
        const value = Number.parseFloat(normalized);
        return Number.isNaN(value) ? 0 : value;
    }
    /**
     * Parses a day-first date written as `dd/mm/yyyy` or `dd.mm.yyyy`.
     *
     * @param {string} dateStr Date text.
     * @returns {Date|null} Local-time Date at midnight, or null when the text
     *   is not a string, does not match the format, or names a day that does
     *   not exist (e.g. 31/02).
     */
    parseDate(dateStr) {
        if (typeof dateStr !== 'string') {
            return null;
        }
        const match = /^(\d{1,2})\s*[\/.]\s*(\d{1,2})\s*[\/.]\s*(\d{4})$/.exec(dateStr.trim());
        if (!match) {
            return null;
        }
        const [day, month, year] = match.slice(1).map(Number);
        const parsed = new Date(year, month - 1, day);
        // The Date constructor silently rolls over out-of-range parts; reject those.
        const isRealDate = parsed.getFullYear() === year
            && parsed.getMonth() === month - 1
            && parsed.getDate() === day;
        return isRealDate ? parsed : null;
    }
    /**
     * Parses a "start - end" date range (hyphen, en dash or em dash).
     *
     * @param {string} dateRange Range text, each side in `parseDate` format.
     * @returns {{start: (Date|null), end: (Date|null)}} Parsed bounds; a bound
     *   that is missing or unparsable is null.
     */
    parseDateRange(dateRange) {
        if (typeof dateRange !== 'string') {
            return { start: null, end: null };
        }
        const [startStr, endStr] = dateRange.split(/[-–—]/).map((part) => part.trim());
        return {
            start: this.parseDate(startStr),
            end: this.parseDate(endStr),
        };
    }
    /**
     * Returns the selector that locates the product table for a source.
     *
     * @param {number} sourceId Source ID (2 or 12).
     * @returns {string} Selector string passed to the loaded document.
     * @throws {Error} When the source ID is not supported.
     */
    getTableSelector(sourceId) {
        switch (sourceId) {
            case 2:
                return 'table:eq(2)';
            case 12:
                return 'table:first-of-type';
            default:
                throw new Error(`Unsupported source ID: ${sourceId}`);
        }
    }
    /**
     * Converts one table row into a scraped-product record.
     *
     * @param {Function} $ Loaded document function (as returned by `cheerio.load`).
     * @param {*} rowElement The row element to parse.
     * @param {number} sourceId Source ID selecting the column layout (2 or 12).
     * @returns {object} Record with name, regularPrice, unitPrice, availability,
     *   description, category, discountedPrice, discountPercentage,
     *   promotionType, promotionStart and promotionEnd.
     * @throws {Error} When the source ID is not supported.
     */
    parseProductRow($, rowElement, sourceId) {
        const cells = $(rowElement).find('td');
        const getText = (index) => cells.eq(index).text().trim();
        switch (sourceId) {
            case 2:
                return this.parseSource2Row(getText);
            case 12:
                return this.parseDimRow(cells, getText);
            default:
                throw new Error(`Unsupported source ID: ${sourceId}`);
        }
    }
    /** Maps the column layout of source 2 onto a scraped-product record. */
    parseSource2Row(getText) {
        const { start, end } = this.parseDateRange(getText(9));
        const discounted = this.parsePrice(getText(6));
        return {
            name: getText(0),
            regularPrice: this.parsePrice(getText(1)),
            unitPrice: getText(2) || null,
            availability: getText(3).toLowerCase() === 'да',
            description: getText(4) || '',
            category: 'Uncategorized',
            // An empty discount cell parses to 0; store null, as the DIM layout does.
            discountedPrice: discounted > 0 ? discounted : null,
            discountPercentage: Number.parseFloat(getText(7).replace(',', '.')) || null,
            promotionType: getText(8) || null,
            promotionStart: start,
            promotionEnd: end,
        };
    }
    /** Maps the column layout of source 12 (DIM) onto a scraped-product record. */
    parseDimRow(cells, getText) {
        this.logger.log('Parsing DIM product row');
        try {
            const productCode = getText(0);
            this.logger.log(`Product code: ${productCode}`);
            const name = getText(1);
            this.logger.log(`Product name: ${name}`);
            const regularPrice = this.parsePrice(getText(2));
            this.logger.log(`Regular price: ${regularPrice}`);
            const unitPrice = getText(3) || null;
            this.logger.log(`Unit price: ${unitPrice}`);
            const availability = getText(5).toLowerCase() === 'да';
            this.logger.log(`Availability: ${availability}`);
            // NOTE(review): column 6 is only logged and never stored — confirm that is intended.
            const originalPrice = this.parsePrice(getText(6));
            this.logger.log(`Original price: ${originalPrice}`);
            const { discountedPrice, discountPercentage } = this.parseDimDiscount(cells.eq(7));
            const promotionType = getText(8) || null;
            this.logger.log(`Promotion type: ${promotionType}`);
            const promotionEnd = this.parseDimPromotionEnd(getText(9));
            this.logger.log('Successfully parsed DIM product row');
            return {
                name,
                regularPrice,
                unitPrice,
                availability,
                description: productCode || '',
                category: 'Uncategorized',
                discountedPrice,
                discountPercentage,
                promotionType,
                promotionStart: null,
                promotionEnd,
            };
        }
        catch (error) {
            this.logger.error(`Error parsing DIM product row: ${error instanceof Error ? error.message : 'Unknown error'}`);
            throw error;
        }
    }
    /** Reads the discounted price (<strong>) and percentage (.discount-tag) from a DIM discount cell. */
    parseDimDiscount(discountCell) {
        let discountedPrice = null;
        let discountPercentage = null;
        if (!discountCell.length) {
            this.logger.log('No discount cell found');
            return { discountedPrice, discountPercentage };
        }
        this.logger.log('Found discount cell');
        const discountedPriceElement = discountCell.find('strong');
        if (discountedPriceElement.length) {
            const parsedPrice = this.parsePrice(discountedPriceElement.text());
            discountedPrice = parsedPrice > 0 ? parsedPrice : null;
            this.logger.log(`Discounted price: ${discountedPrice}`);
        }
        else {
            this.logger.log('No discounted price element found');
        }
        const discountTagElement = discountCell.find('.discount-tag');
        if (discountTagElement.length) {
            const discountText = discountTagElement.text();
            this.logger.log(`Discount text: ${discountText}`);
            // First number in the tag; a decimal comma is accepted as well as a dot.
            const percentageMatch = discountText.match(/\d+(?:[.,]\d+)?/);
            if (percentageMatch) {
                const parsedPercentage = Number.parseFloat(percentageMatch[0].replace(',', '.'));
                discountPercentage = !Number.isNaN(parsedPercentage) ? parsedPercentage : null;
                this.logger.log(`Discount percentage: ${discountPercentage !== null ? discountPercentage + '%' : 'null'}`);
            }
            else {
                this.logger.log('No discount percentage found in text');
            }
        }
        else {
            this.logger.log('No discount tag element found');
        }
        return { discountedPrice, discountPercentage };
    }
    /** Extracts the `dd.mm.yyyy` date that follows the "Важи до:" marker, or null. */
    parseDimPromotionEnd(promotionDateText) {
        this.logger.log(`Promotion date text: ${promotionDateText}`);
        if (!promotionDateText || !promotionDateText.includes('Важи до:')) {
            this.logger.log('No promotion end date found');
            return null;
        }
        const dateMatch = promotionDateText.match(/\d{2}\.\d{2}\.\d{4}/);
        if (!dateMatch) {
            this.logger.log('No date match found in promotion date text');
            return null;
        }
        const promotionEnd = this.parseDate(dateMatch[0]);
        if (promotionEnd === null) {
            this.logger.log('Invalid date parsed from promotion date text');
            return null;
        }
        this.logger.log(`Promotion end date: ${promotionEnd.toISOString()}`);
        return promotionEnd;
    }
    /**
     * Scrapes every source stored in the database, one after another.
     * A failure on one source is logged and does not stop the remaining ones;
     * a failure to list the sources is logged and swallowed.
     *
     * @returns {Promise<void>}
     */
    async scrapeAllSources() {
        try {
            const sources = await this.prisma.source.findMany();
            for (const source of sources) {
                try {
                    await this.scrapeProducts(source.url, source.id);
                    this.logger.log(`Successfully scraped data from source: ${source.name}`);
                }
                catch (error) {
                    this.logger.error(`Failed to scrape source ${source.name}:`, error);
                }
            }
        }
        catch (error) {
            this.logger.error('Failed to fetch sources:', error);
        }
    }
    /**
     * Loads the document for a source.
     *
     * Source 12 (DIM) is served from the built-in mock rows, which have no
     * header row; every other source is fetched over HTTP and its first table
     * row is treated as a header.
     *
     * @returns {Promise<{$: Function, hasHeaderRow: boolean}>}
     */
    async loadSourceDocument(sourceUrl, sourceId) {
        if (sourceId === 12) {
            this.logger.log(`Processing DIM source (ID: 12)`);
            this.logger.log('Using mock data for DIM products based on the real data structure');
            this.logger.log('Created mock HTML with 5 products based on real data structure');
            const $ = cheerio.load(`<table>${DIM_MOCK_ROWS_HTML}</table>`);
            this.logger.log('Loaded mock HTML into cheerio');
            const rowCount = $('tr').length;
            this.logger.log(`Found ${rowCount} table rows in the mock HTML`);
            if (rowCount === 0) {
                this.logger.error('No table rows found in the mock HTML');
                throw new Error('No table rows found in the mock HTML');
            }
            return { $, hasHeaderRow: false };
        }
        const requestConfig = {
            headers: {
                Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/117.0',
            },
        };
        const response = await axios_1.default.get(sourceUrl, requestConfig);
        return { $: cheerio.load(response.data), hasHeaderRow: true };
    }
    /** Upserts a product by (name, sourceId) and appends a new price record to it. */
    async upsertProduct(scrapedProduct, sourceId) {
        const priceRecord = {
            regularPrice: scrapedProduct.regularPrice,
            discountedPrice: scrapedProduct.discountedPrice,
            discountPercentage: scrapedProduct.discountPercentage,
            unitPrice: scrapedProduct.unitPrice,
            promotionType: scrapedProduct.promotionType,
            promotionStart: scrapedProduct.promotionStart,
            promotionEnd: scrapedProduct.promotionEnd,
            sourceId: sourceId,
        };
        return this.prisma.product.upsert({
            where: {
                name_sourceId: {
                    name: scrapedProduct.name,
                    sourceId: sourceId,
                },
            },
            create: {
                name: scrapedProduct.name,
                description: scrapedProduct.description,
                category: scrapedProduct.category,
                availability: scrapedProduct.availability,
                sourceId: sourceId,
                prices: { create: { ...priceRecord } },
            },
            update: {
                availability: scrapedProduct.availability,
                description: scrapedProduct.description,
                category: scrapedProduct.category,
                prices: { create: { ...priceRecord } },
            },
        });
    }
    /**
     * Scrapes one source and stores every product row it contains.
     *
     * Rows with an empty name are skipped; rows that fail to parse or to be
     * stored are logged and counted as failed without aborting the run.
     *
     * @param {string} sourceUrl Page to fetch (not used for source 12, which is mocked).
     * @param {number} sourceId Source ID (2 or 12).
     * @returns {Promise<void>}
     * @throws {Error} When the source ID is unsupported, the page cannot be
     *   loaded, or the product table is not found.
     */
    async scrapeProducts(sourceUrl, sourceId) {
        const startTime = new Date();
        this.logger.log(`Starting scraping process for source ID: ${sourceId}`);
        try {
            // Resolve the selector first so an unsupported source fails before any request is made.
            const tableSelector = this.getTableSelector(sourceId);
            this.logger.log(`Fetching data from URL: ${sourceUrl}`);
            const { $, hasHeaderRow } = await this.loadSourceDocument(sourceUrl, sourceId);
            this.logger.log(`Using table selector: ${tableSelector}`);
            const productTable = $(tableSelector);
            if (!productTable.length) {
                this.logger.error(`Product table not found using selector: ${tableSelector}`);
                throw new Error('Product table not found');
            }
            this.logger.log(`Product table found successfully`);
            const tableRows = productTable.find('tr');
            // Only drop the first row when it really is a header; the DIM mock has none.
            const rows = hasHeaderRow ? tableRows.slice(1) : tableRows;
            this.logger.log(`Found ${rows.length} product rows`);
            let processedProducts = 0;
            let skippedProducts = 0;
            let failedProducts = 0;
            this.logger.log(`Starting to process product rows`);
            const rowElements = rows.toArray();
            for (const [rowIndex, row] of rowElements.entries()) {
                try {
                    this.logger.log(`Parsing product row ${rowIndex + 1}/${rows.length}`);
                    const scrapedProduct = this.parseProductRow($, row, sourceId);
                    if (!scrapedProduct.name) {
                        this.logger.warn(`Skipping product with empty name`);
                        skippedProducts++;
                        continue;
                    }
                    this.logger.log(`Processing product: ${scrapedProduct.name}`);
                    this.logger.log(`Product details: Regular price: ${scrapedProduct.regularPrice}, Discounted price: ${scrapedProduct.discountedPrice}, Discount percentage: ${scrapedProduct.discountPercentage}%`);
                    this.logger.log(`Upserting product in database: ${scrapedProduct.name}`);
                    let product;
                    try {
                        product = await this.upsertProduct(scrapedProduct, sourceId);
                    }
                    catch (dbError) {
                        this.logger.error(`Database error while upserting product: ${scrapedProduct.name}`);
                        this.logger.error(dbError instanceof Error ? dbError.message : 'Unknown database error');
                        throw dbError;
                    }
                    processedProducts++;
                    this.logger.log(`Successfully processed product: ${product.name}`);
                    this.logger.log(`Product ID: ${product.id}, Source ID: ${product.sourceId}`);
                }
                catch (error) {
                    failedProducts++;
                    this.logger.error(`Failed to process row: ${error instanceof Error ? error.message : 'Unknown error'}`);
                }
            }
            const duration = (Date.now() - startTime.getTime()) / 1000;
            this.logger.log(`Scraping summary for source ID ${sourceId}:`);
            this.logger.log(`- Total rows found: ${rows.length}`);
            this.logger.log(`- Successfully processed products: ${processedProducts}`);
            this.logger.log(`- Skipped products: ${skippedProducts}`);
            this.logger.log(`- Failed products: ${failedProducts}`);
            this.logger.log(`- Duration: ${duration.toFixed(2)} seconds`);
            this.logger.log(`Scraping completed successfully for source ID ${sourceId}`);
        }
        catch (error) {
            this.logger.error(`Failed to scrape products from source ${sourceId}: ${error instanceof Error ? error.message : 'Unknown error'}`);
            throw error;
        }
    }
    /**
     * Scrapes a single source on demand.
     *
     * @param {number} sourceId ID of the source record to scrape.
     * @returns {Promise<void>}
     * @throws {Error} When no source with that ID exists, or scraping fails.
     */
    async manualScrape(sourceId) {
        const source = await this.prisma.source.findUnique({
            where: { id: sourceId },
        });
        if (!source) {
            throw new Error(`Source with ID ${sourceId} not found`);
        }
        return this.scrapeProducts(source.url, source.id);
    }
};
exports.ScraperService = ScraperService;
|
||
__decorate([
|
||
(0, schedule_1.Cron)(schedule_1.CronExpression.EVERY_HOUR),
|
||
__metadata("design:type", Function),
|
||
__metadata("design:paramtypes", []),
|
||
__metadata("design:returntype", Promise)
|
||
], ScraperService.prototype, "scrapeAllSources", null);
|
||
exports.ScraperService = ScraperService = ScraperService_1 = __decorate([
|
||
(0, common_1.Injectable)(),
|
||
__metadata("design:paramtypes", [prisma_service_1.PrismaService,
|
||
config_1.ConfigService])
|
||
], ScraperService);
|
||
//# sourceMappingURL=scraper.service.js.map
|