diff --git a/package.json b/package.json index 839c107..0d79070 100644 --- a/package.json +++ b/package.json @@ -35,7 +35,8 @@ }, "dependencies": { "cross-fetch": "^4.1.0", - "node-html-parser": "^7.0.2" + "node-html-parser": "^7.0.2", + "valibot": "^1.2.0" }, "devDependencies": { "@babel/preset-typescript": "^7.28.5", @@ -98,8 +99,8 @@ "types": "./dist/index.d.ts", "exports": { ".": { - "require": "./dist/index.js", - "import": "./dist/index.mjs" + "import": "./dist/index.mjs", + "require": "./dist/index.js" }, "./package.json": "./package.json" }, diff --git a/src/dto/user-ratings.schema.ts b/src/dto/user-ratings.schema.ts new file mode 100644 index 0000000..c983a2f --- /dev/null +++ b/src/dto/user-ratings.schema.ts @@ -0,0 +1,42 @@ +import * as v from 'valibot'; + +export const CSFDColorRatingSchema = v.union([ + v.literal('bad'), + v.literal('average'), + v.literal('good'), + v.literal('unknown') +]); + +export const CSFDStarsSchema = v.union([ + v.literal(0), + v.literal(1), + v.literal(2), + v.literal(3), + v.literal(4), + v.literal(5) +]); + +export const CSFDFilmTypesSchema = v.union([ + v.literal('film'), + v.literal('TV film'), + v.literal('pořad'), + v.literal('seriál'), + v.literal('divadelní záznam'), + v.literal('koncert'), + v.literal('série'), + v.literal('studentský film'), + v.literal('amatérský film'), + v.literal('hudební videoklip'), + v.literal('epizoda') +]); + +export const CSFDUserRatingsSchema = v.object({ + id: v.number(), + title: v.string(), + year: v.number(), + url: v.string(), + type: CSFDFilmTypesSchema, + colorRating: CSFDColorRatingSchema, + userRating: CSFDStarsSchema, + userDate: v.string() +}); diff --git a/src/dto/user-reviews.schema.ts b/src/dto/user-reviews.schema.ts new file mode 100644 index 0000000..2e2ee44 --- /dev/null +++ b/src/dto/user-reviews.schema.ts @@ -0,0 +1,15 @@ +import * as v from 'valibot'; +import { CSFDColorRatingSchema, CSFDFilmTypesSchema, CSFDStarsSchema } from './user-ratings.schema'; + +export const CSFDUserReviewsSchema = v.object({ + id: v.number(), + title: v.string(), + year: v.number(), + url: v.string(), + type: CSFDFilmTypesSchema, + colorRating: CSFDColorRatingSchema, + userRating: CSFDStarsSchema, + userDate: v.string(), + text: v.string(), + poster: v.nullable(v.string()) +}); diff --git a/src/dto/user-reviews.ts b/src/dto/user-reviews.ts index 46ce4e0..9632d8e 100644 --- a/src/dto/user-reviews.ts +++ b/src/dto/user-reviews.ts @@ -4,7 +4,7 @@ export interface CSFDUserReviews extends CSFDScreening { userRating: CSFDStars; userDate: string; // TODO datetime text: string; - poster: string; + poster: string | null; } export interface CSFDUserReviewsConfig { diff --git a/src/helpers/user-reviews.helper.ts b/src/helpers/user-reviews.helper.ts index 1d61118..5b60c9b 100644 --- a/src/helpers/user-reviews.helper.ts +++ b/src/helpers/user-reviews.helper.ts @@ -19,7 +19,14 @@ export const getUserReviewType = (el: HTMLElement): CSFDFilmTypes => { // Type can be in the second .info span (e.g., "(seriál)") // TODO need more tests const typeText = el.querySelectorAll('.film-title-info .info'); - return (typeText.length > 1 ? typeText[1].text.slice(1, -1) : 'film') as CSFDFilmTypes; + if (typeText.length > 1) { + const text = typeText[1].text.trim(); + if (text.startsWith('(') && text.endsWith(')')) { + return text.slice(1, -1) as CSFDFilmTypes; + } + return text as CSFDFilmTypes; + } + return 'film'; }; export const getUserReviewTitle = (el: HTMLElement): string => { diff --git a/src/services/user-ratings.service.ts b/src/services/user-ratings.service.ts index 67d665d..1dac474 100644 --- a/src/services/user-ratings.service.ts +++ b/src/services/user-ratings.service.ts @@ -1,6 +1,8 @@ import { HTMLElement, parse } from 'node-html-parser'; +import { flatten, safeParse } from 'valibot'; import { CSFDColorRating, CSFDStars } from '../dto/global'; import { CSFDUserRatingConfig, CSFDUserRatings } from '../dto/user-ratings'; +import { CSFDUserRatingsSchema } from '../dto/user-ratings.schema'; import { fetchPage } from '../fetchers'; import { sleep } from '../helpers/global.helper'; import { @@ -76,19 +78,37 @@ export class UserRatingsScraper { for (const el of movies) { const type = getUserRatingType(el); + let shouldProcess = true; + // Filtering includesOnly if (config?.includesOnly?.length) { - if (config.includesOnly.some((include) => type === include)) { - films.push(this.buildUserRatings(el)); + if (!config.includesOnly.some((include) => type === include)) { + shouldProcess = false; } // Filter excludes } else if (config?.excludes?.length) { - if (!config.excludes.some((exclude) => type === exclude)) { - films.push(this.buildUserRatings(el)); + if (config.excludes.some((exclude) => type === exclude)) { + shouldProcess = false; + } + } + + if (shouldProcess) { + try { + const item = this.buildUserRatings(el); + const result = safeParse(CSFDUserRatingsSchema, item); + if (result.success) { + films.push(result.output as CSFDUserRatings); + } else { + console.warn( + `Skipping invalid user rating. Title: ${item.title}, ID: ${item.id}`, + JSON.stringify(flatten(result.issues)) + ); + } + } catch (e) { + console.warn( + `Skipping user rating due to scraping error (DOM change?): ${(e as Error).message}` + ); } - } else { - // Without filtering - films.push(this.buildUserRatings(el)); } } return films; diff --git a/src/services/user-reviews.service.ts b/src/services/user-reviews.service.ts index 0833518..3ff9ad6 100644 --- a/src/services/user-reviews.service.ts +++ b/src/services/user-reviews.service.ts @@ -1,6 +1,8 @@ import { HTMLElement, parse } from 'node-html-parser'; +import { flatten, safeParse } from 'valibot'; import { CSFDColorRating, CSFDStars } from '../dto/global'; import { CSFDUserReviews, CSFDUserReviewsConfig } from '../dto/user-reviews'; +import { CSFDUserReviewsSchema } from '../dto/user-reviews.schema'; import { fetchPage } from '../fetchers'; import { sleep } from '../helpers/global.helper'; import { @@ -78,19 +80,37 @@ export class UserReviewsScraper { for (const el of reviews) { const type = getUserReviewType(el); + let shouldProcess = true; + // Filtering includesOnly if (config?.includesOnly?.length) { - if (config.includesOnly.some((include) => type === include)) { - films.push(this.buildUserReviews(el)); + if (!config.includesOnly.some((include) => type === include)) { + shouldProcess = false; } // Filter excludes } else if (config?.excludes?.length) { - if (!config.excludes.some((exclude) => type === exclude)) { - films.push(this.buildUserReviews(el)); + if (config.excludes.some((exclude) => type === exclude)) { + shouldProcess = false; + } + } + + if (shouldProcess) { + try { + const item = this.buildUserReviews(el); + const result = safeParse(CSFDUserReviewsSchema, item); + if (result.success) { + films.push(result.output as CSFDUserReviews); + } else { + console.warn( + `Skipping invalid user review. Title: ${item.title}, ID: ${item.id}`, + JSON.stringify(flatten(result.issues)) + ); + } + } catch (e) { + console.warn( + `Skipping user review due to scraping error (DOM change?): ${(e as Error).message}` + ); } - } else { - // Without filtering - films.push(this.buildUserReviews(el)); } } return films; diff --git a/yarn.lock b/yarn.lock index bc6ee6a..1c86bed 100644 --- a/yarn.lock +++ b/yarn.lock @@ -3156,6 +3156,11 @@ uri-js@^4.2.2: dependencies: punycode "^2.1.0" +valibot@^1.2.0: + version "1.2.0" + resolved "https://registry.yarnpkg.com/valibot/-/valibot-1.2.0.tgz#8fc720d9e4082ba16e30a914064a39619b2f1d6f" + integrity sha512-mm1rxUsmOxzrwnX5arGS+U4T25RdvpPjPN4yR0u9pUBov9+zGVtO84tif1eY4r6zWxVxu3KzIyknJy3rxfRZZg== + vary@^1.1.2: version "1.1.2" resolved "https://registry.yarnpkg.com/vary/-/vary-1.1.2.tgz#2299f02c6ded30d4a5961b0b9f74524a18f634fc"