From 9485da66e1c6a764d15572c6951a1551425617aa Mon Sep 17 00:00:00 2001 From: Inokentii Mazhara Date: Wed, 12 Jun 2024 13:55:53 +0300 Subject: [PATCH] TW-1459 Implement a minimal demo of categorizing sites with Categoraize API --- .env.dist | 2 + src/advertising/external-ads.ts | 15 +++++ src/config.ts | 4 +- src/routers/slise-ad-rules/ad-category.ts | 64 +++++++++++++++++++ src/routers/slise-ad-rules/index.ts | 2 + src/utils/PromisifiedSemaphore.ts | 7 +-- src/utils/categoraize.ts | 76 +++++++++++++++++++++++ src/utils/makeBuildQueryFn.ts | 11 +--- src/utils/schemas.ts | 13 +++- 9 files changed, 178 insertions(+), 16 deletions(-) create mode 100644 src/routers/slise-ad-rules/ad-category.ts create mode 100644 src/utils/categoraize.ts diff --git a/.env.dist b/.env.dist index 5f4b337..3e3b27e 100644 --- a/.env.dist +++ b/.env.dist @@ -10,3 +10,5 @@ THREE_ROUTE_API_AUTH_TOKEN= REDIS_URL= ADMIN_USERNAME= ADMIN_PASSWORD= +CATEGORAIZE_CATEGORY_GROUP_ID= +CATEGORAIZE_API_KEY= diff --git a/src/advertising/external-ads.ts b/src/advertising/external-ads.ts index a735339..9f42de3 100644 --- a/src/advertising/external-ads.ts +++ b/src/advertising/external-ads.ts @@ -1,6 +1,7 @@ import { satisfies as versionSatisfiesRange } from 'semver'; import { objectStorageMethodsFactory, redisClient } from '../redis'; +import { ItemStatus as CategoraizeItemStatus } from '../utils/categoraize'; /** Style properties names that are likely to be unnecessary for banners are skipped */ export const stylePropsNames = [ @@ -143,6 +144,17 @@ export interface ReplaceAdsUrlsBlacklistEntry extends ExtVersionConstraints { regexes: string[]; } +export interface SiteCategoryRequestBody { + prompt: string; + urlExtract: string; +} + +interface SiteCategoryEntry { + itemId: string; + category?: string; + status: CategoraizeItemStatus; +} + const AD_PLACES_RULES_KEY = 'ad_places_rules'; const AD_PROVIDERS_BY_SITES_KEY = 'ad_providers_by_sites'; const AD_PROVIDERS_ALL_SITES_KEY = 'ad_providers_all_sites'; @@ -150,6 +162,7 @@ const AD_PROVIDERS_LIST_KEY = 'ad_providers_list'; const PERMANENT_AD_PLACES_RULES_KEY = 'permanent_ad_places_rules'; const PERMANENT_NATIVE_AD_PLACES_RULES_KEY = 'permanent_native_ad_places_rules'; const REPLACE_ADS_URLS_BLACKLIST_KEY = 'replace_ads_urls_blacklist'; +const SITES_CATEGORIES_KEY = 'sites_categories'; export const adPlacesRulesMethods = objectStorageMethodsFactory(AD_PLACES_RULES_KEY, []); @@ -175,6 +188,8 @@ export const replaceAdsUrlsBlacklistMethods = objectStorageMethodsFactory(SITES_CATEGORIES_KEY, null); + export const getAdProvidersForAllSites = async () => redisClient.smembers(AD_PROVIDERS_ALL_SITES_KEY); export const addAdProvidersForAllSites = async (providers: string[]) => diff --git a/src/config.ts b/src/config.ts index f5f3a20..8eb8d15 100644 --- a/src/config.ts +++ b/src/config.ts @@ -13,7 +13,9 @@ export const EnvVars = { REDIS_URL: getEnv('REDIS_URL'), ADMIN_USERNAME: getEnv('ADMIN_USERNAME'), ADMIN_PASSWORD: getEnv('ADMIN_PASSWORD'), - COVALENT_API_KEY: getEnv('COVALENT_API_KEY') + COVALENT_API_KEY: getEnv('COVALENT_API_KEY'), + CATEGORAIZE_CATEGORY_GROUP_ID: getEnv('CATEGORAIZE_CATEGORY_GROUP_ID'), + CATEGORAIZE_API_KEY: getEnv('CATEGORAIZE_API_KEY') }; for (const name in EnvVars) { diff --git a/src/routers/slise-ad-rules/ad-category.ts b/src/routers/slise-ad-rules/ad-category.ts new file mode 100644 index 0000000..bf00fec --- /dev/null +++ b/src/routers/slise-ad-rules/ad-category.ts @@ -0,0 +1,64 @@ +import { Router } from 'express'; + +import { sitesCategoriesMethods } from '../../advertising/external-ads'; +import { ItemResponse, getItem, getItemCategory, insertItem } from '../../utils/categoraize'; +import { withBodyValidation, withExceptionHandler } from '../../utils/express-helpers'; +import { isDefined } from '../../utils/helpers'; +import { siteCategoryRequestBodySchema } from '../../utils/schemas'; + +export const adCategoryRouter = Router(); + +adCategoryRouter.post( + '/', + withExceptionHandler( + withBodyValidation(siteCategoryRequestBodySchema, async (req, res) => { + const { prompt, urlExtract } = req.body; + const key = urlExtract.replace(/\/$/, ''); + const categoryEntry = await sitesCategoriesMethods.getByKey(key); + + const handleItem = async (item: ItemResponse) => { + const { id: itemId, status } = item; + + if (status === 'processed') { + const category = await getItemCategory(itemId); + await sitesCategoriesMethods.upsertValues({ [key]: { category, status: 'processed', itemId } }); + + return res.status(200).send({ status: 'processed', category }); + } + + await sitesCategoriesMethods.upsertValues({ [key]: { status, itemId } }); + + return res.status(200).send({ status }); + }; + + const createAndHandleItem = async () => { + const newItemId = await insertItem(key, prompt); + const newItem = await getItem(newItemId); + + if (!isDefined(newItem)) { + return res.status(500).send({ error: 'Failed to create item' }); + } + + await handleItem(newItem); + }; + + if (isDefined(categoryEntry) && categoryEntry.status === 'pending') { + const item = await getItem(categoryEntry.itemId); + + if (!isDefined(item)) { + await sitesCategoriesMethods.removeValues([key]); + + return createAndHandleItem(); + } + + return handleItem(item); + } + + if (isDefined(categoryEntry)) { + return res.status(200).send({ status: 'processed', category: categoryEntry.category }); + } + + await createAndHandleItem(); + }) + ) +); diff --git a/src/routers/slise-ad-rules/index.ts b/src/routers/slise-ad-rules/index.ts index ea84c98..877400e 100644 --- a/src/routers/slise-ad-rules/index.ts +++ b/src/routers/slise-ad-rules/index.ts @@ -1,5 +1,6 @@ import { Router } from 'express'; +import { adCategoryRouter } from './ad-category'; import { adPlacesRulesRouter } from './ad-places'; import { adProvidersRouter } from './providers'; import { replaceUrlsBlacklistRouter } from './replace-urls-blacklist'; @@ -40,6 +41,7 @@ import { replaceUrlsBlacklistRouter } from './replace-urls-blacklist'; export const adRulesRouter = Router(); +adRulesRouter.use('/ad-category', adCategoryRouter); adRulesRouter.use('/ad-places', adPlacesRulesRouter); adRulesRouter.use('/providers', adProvidersRouter); adRulesRouter.use('/replace-urls-blacklist', replaceUrlsBlacklistRouter); diff --git a/src/utils/PromisifiedSemaphore.ts b/src/utils/PromisifiedSemaphore.ts index af05643..fdc5624 100644 --- a/src/utils/PromisifiedSemaphore.ts +++ b/src/utils/PromisifiedSemaphore.ts @@ -15,12 +15,11 @@ export default class PromisifiedSemaphore { return this.semaphore.available(n); } - exec(task: () => void | Promise, n = 1) { - return new Promise((resolve, reject) => { + exec(task: () => T | Promise, n = 1) { + return new Promise((resolve, reject) => { this.semaphore.take(n, async () => { try { - await task(); - resolve(); + resolve(await task()); } catch (e) { reject(e); } finally { diff --git a/src/utils/categoraize.ts b/src/utils/categoraize.ts new file mode 100644 index 0000000..20e9d1b --- /dev/null +++ b/src/utils/categoraize.ts @@ -0,0 +1,76 @@ +import axios, { AxiosError } from 'axios'; + +import { EnvVars } from '../config'; +import PromisifiedSemaphore from './PromisifiedSemaphore'; + +export type ItemStatus = 'processed' | 'pending' | 'failed'; + +interface InsertItemResponse { + id: string; +} + +interface ItemCategory { + id: string; + name: string; + pathName: string; + itemCount: number; +} + +interface ItemCategoriesResponse { + categories: ItemCategory[]; +} + +export interface ItemResponse { + id: string; + catGroupId: string; + extId: string; + status: ItemStatus; + manualTitle: string; + content: string; + created: string; +} + +const categoraizeApi = axios.create({ + baseURL: 'https://categoraize.io/api/v1', + headers: { + Authorization: `Bearer ${EnvVars.CATEGORAIZE_API_KEY}` + } +}); +const apiSemaphore = new PromisifiedSemaphore(5); + +export const insertItem = (extId: string, content: string) => + apiSemaphore.exec(async () => { + const response = await categoraizeApi.post( + `/category-groups/${EnvVars.CATEGORAIZE_CATEGORY_GROUP_ID}/items`, + { + extId, + manualTitle: extId, + content, + attachments: [] + } + ); + + return response.data.id; + }); + +export const getItem = (id: string) => + apiSemaphore.exec(async () => { + try { + const response = await categoraizeApi.get(`/items/${id}`); + + return response.data; + } catch (e) { + if (e instanceof AxiosError && e.response?.status === 404) { + return null; + } + + throw e; + } + }); + +export const getItemCategory = (id: string) => + apiSemaphore.exec(async () => { + const response = await categoraizeApi.get(`/items/${id}/categories`); + + return response.data.categories[0]?.name; + }); diff --git a/src/utils/makeBuildQueryFn.ts b/src/utils/makeBuildQueryFn.ts index 116a183..7a4def5 100644 --- a/src/utils/makeBuildQueryFn.ts +++ b/src/utils/makeBuildQueryFn.ts @@ -43,16 +43,7 @@ export function makeBuildQueryFn

( }; if (semaphore) { - return new Promise((resolve, reject) => { - semaphore.exec(async () => { - try { - const data = await getData(); - resolve(data); - } catch (e) { - reject(e); - } - }); - }); + return await semaphore.exec(getData); } return getData(); diff --git a/src/utils/schemas.ts b/src/utils/schemas.ts index 21e0253..645839b 100644 --- a/src/utils/schemas.ts +++ b/src/utils/schemas.ts @@ -18,7 +18,8 @@ import { StylePropName, stylePropsNames, AdProviderSelectorsRule, - ReplaceAdsUrlsBlacklistEntry + ReplaceAdsUrlsBlacklistEntry, + SiteCategoryRequestBody } from '../advertising/external-ads'; import { isValidSelectorsGroup } from '../utils/selectors.min.js'; import { isDefined } from './helpers'; @@ -231,3 +232,13 @@ export const replaceUrlsBlacklistDictionarySchema: IObjectSchema = objectSchema().shape({ + urlExtract: stringSchema() + .matches( + /^(www\.)?([a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}(\/(?:[a-zA-Z0-9\-._~%!$&'()*+,;=:@]|%[0-9a-fA-F]{2})*)*$/, + 'urlExtract must be a fragment of a valid URL without protocol, query parameters or hash' + ) + .required(), + prompt: stringSchema().min(1, 'prompt must be a non-empty string').required() +});