Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TW-1459 Implement a minimal demo of categorizing sites with Categorai… #163

Draft
wants to merge 1 commit into
base: development
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .env.dist
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,5 @@ THREE_ROUTE_API_AUTH_TOKEN=
REDIS_URL=
ADMIN_USERNAME=
ADMIN_PASSWORD=
CATEGORAIZE_CATEGORY_GROUP_ID=
CATEGORAIZE_API_KEY=
15 changes: 15 additions & 0 deletions src/advertising/external-ads.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { satisfies as versionSatisfiesRange } from 'semver';

import { objectStorageMethodsFactory, redisClient } from '../redis';
import { ItemStatus as CategoraizeItemStatus } from '../utils/categoraize';

/** Style properties names that are likely to be unnecessary for banners are skipped */
export const stylePropsNames = [
Expand Down Expand Up @@ -143,13 +144,25 @@ export interface ReplaceAdsUrlsBlacklistEntry extends ExtVersionConstraints {
regexes: string[];
}

/** Body of a site categorization request; its shape is enforced by `siteCategoryRequestBodySchema`. */
export interface SiteCategoryRequestBody {
/** Non-empty text that is forwarded to Categoraize as the content of the item */
prompt: string;
/** Fragment of a URL without protocol, query parameters or hash; identifies what is being categorized */
urlExtract: string;
}

/** Categorization state of a single site as it is kept in the `sites_categories` storage. */
interface SiteCategoryEntry {
/** Identifier of the Categoraize item that was created for the site */
itemId: string;
/** Name of the assigned category; may be absent, e.g. while the item is not processed */
category?: string;
/** Status of the Categoraize item at the moment the entry was written */
status: CategoraizeItemStatus;
}

const AD_PLACES_RULES_KEY = 'ad_places_rules';
const AD_PROVIDERS_BY_SITES_KEY = 'ad_providers_by_sites';
const AD_PROVIDERS_ALL_SITES_KEY = 'ad_providers_all_sites';
const AD_PROVIDERS_LIST_KEY = 'ad_providers_list';
const PERMANENT_AD_PLACES_RULES_KEY = 'permanent_ad_places_rules';
const PERMANENT_NATIVE_AD_PLACES_RULES_KEY = 'permanent_native_ad_places_rules';
const REPLACE_ADS_URLS_BLACKLIST_KEY = 'replace_ads_urls_blacklist';
const SITES_CATEGORIES_KEY = 'sites_categories';

export const adPlacesRulesMethods = objectStorageMethodsFactory<AdPlacesRule[]>(AD_PLACES_RULES_KEY, []);

Expand All @@ -175,6 +188,8 @@ export const replaceAdsUrlsBlacklistMethods = objectStorageMethodsFactory<Replac
[]
);

/**
 * Storage methods for site category entries under the `sites_categories` key.
 * NOTE(review): `null` is passed as the second argument, presumably the fallback value for absent keys —
 * confirm against `objectStorageMethodsFactory`.
 */
export const sitesCategoriesMethods = objectStorageMethodsFactory<SiteCategoryEntry | null>(SITES_CATEGORIES_KEY, null);

export const getAdProvidersForAllSites = async () => redisClient.smembers(AD_PROVIDERS_ALL_SITES_KEY);

export const addAdProvidersForAllSites = async (providers: string[]) =>
Expand Down
4 changes: 3 additions & 1 deletion src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ export const EnvVars = {
REDIS_URL: getEnv('REDIS_URL'),
ADMIN_USERNAME: getEnv('ADMIN_USERNAME'),
ADMIN_PASSWORD: getEnv('ADMIN_PASSWORD'),
COVALENT_API_KEY: getEnv('COVALENT_API_KEY')
COVALENT_API_KEY: getEnv('COVALENT_API_KEY'),
CATEGORAIZE_CATEGORY_GROUP_ID: getEnv('CATEGORAIZE_CATEGORY_GROUP_ID'),
CATEGORAIZE_API_KEY: getEnv('CATEGORAIZE_API_KEY')
};

for (const name in EnvVars) {
Expand Down
64 changes: 64 additions & 0 deletions src/routers/slise-ad-rules/ad-category.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import { Router } from 'express';

import { sitesCategoriesMethods } from '../../advertising/external-ads';
import { ItemResponse, getItem, getItemCategory, insertItem } from '../../utils/categoraize';
import { withBodyValidation, withExceptionHandler } from '../../utils/express-helpers';
import { isDefined } from '../../utils/helpers';
import { siteCategoryRequestBodySchema } from '../../utils/schemas';

export const adCategoryRouter = Router();

/**
 * POST / — resolves the category of a site through Categoraize and caches the outcome in
 * `sitesCategoriesMethods`.
 *
 * Request body (validated with `siteCategoryRequestBodySchema`): `{ prompt, urlExtract }`. A single trailing
 * slash is stripped from `urlExtract`; the result is used both as the storage key and as the external ID of
 * the Categoraize item.
 *
 * Responses:
 * - 200 `{ status: 'processed', category }` once the item has been processed;
 * - 200 `{ status }` while the item is pending or when its processing has failed;
 * - 500 `{ error: 'Failed to create item' }` if a freshly inserted item cannot be fetched.
 */
adCategoryRouter.post(
  '/',
  withExceptionHandler(
    withBodyValidation(siteCategoryRequestBodySchema, async (req, res) => {
      const { prompt, urlExtract } = req.body;
      // 'example.com/' and 'example.com' must share one storage entry
      const key = urlExtract.replace(/\/$/, '');
      const categoryEntry = await sitesCategoriesMethods.getByKey(key);

      /** Persists the current state of `item` and responds with it */
      const handleItem = async (item: ItemResponse) => {
        const { id: itemId, status } = item;

        if (status === 'processed') {
          const category = await getItemCategory(itemId);
          await sitesCategoriesMethods.upsertValues({ [key]: { category, status: 'processed', itemId } });

          return res.status(200).send({ status: 'processed', category });
        }

        await sitesCategoriesMethods.upsertValues({ [key]: { status, itemId } });

        return res.status(200).send({ status });
      };

      /** Inserts a new Categoraize item for `key`, then stores and reports its state */
      const createAndHandleItem = async () => {
        const newItemId = await insertItem(key, prompt);
        const newItem = await getItem(newItemId);

        if (!isDefined(newItem)) {
          return res.status(500).send({ error: 'Failed to create item' });
        }

        await handleItem(newItem);
      };

      if (isDefined(categoryEntry) && categoryEntry.status === 'pending') {
        // The stored state may be stale, so the item is refreshed from Categoraize
        const item = await getItem(categoryEntry.itemId);

        if (!isDefined(item)) {
          // The item no longer exists on the Categoraize side; start over with a new one
          await sitesCategoriesMethods.removeValues([key]);

          return createAndHandleItem();
        }

        return handleItem(item);
      }

      if (isDefined(categoryEntry)) {
        // Report the stored status as is: a failed entry has no category and must not be presented as processed
        const { status, category } = categoryEntry;

        return res.status(200).send(status === 'processed' ? { status, category } : { status });
      }

      await createAndHandleItem();
    })
  )
);
2 changes: 2 additions & 0 deletions src/routers/slise-ad-rules/index.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { Router } from 'express';

import { adCategoryRouter } from './ad-category';
import { adPlacesRulesRouter } from './ad-places';
import { adProvidersRouter } from './providers';
import { replaceUrlsBlacklistRouter } from './replace-urls-blacklist';
Expand Down Expand Up @@ -40,6 +41,7 @@ import { replaceUrlsBlacklistRouter } from './replace-urls-blacklist';

export const adRulesRouter = Router();

adRulesRouter.use('/ad-category', adCategoryRouter);
adRulesRouter.use('/ad-places', adPlacesRulesRouter);
adRulesRouter.use('/providers', adProvidersRouter);
adRulesRouter.use('/replace-urls-blacklist', replaceUrlsBlacklistRouter);
7 changes: 3 additions & 4 deletions src/utils/PromisifiedSemaphore.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,11 @@ export default class PromisifiedSemaphore {
return this.semaphore.available(n);
}

exec(task: () => void | Promise<void>, n = 1) {
return new Promise<void>((resolve, reject) => {
exec<T = void>(task: () => T | Promise<T>, n = 1) {
return new Promise<T>((resolve, reject) => {
this.semaphore.take(n, async () => {
try {
await task();
resolve();
resolve(await task());
} catch (e) {
reject(e);
} finally {
Expand Down
76 changes: 76 additions & 0 deletions src/utils/categoraize.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import axios, { AxiosError } from 'axios';

import { EnvVars } from '../config';
import PromisifiedSemaphore from './PromisifiedSemaphore';

/** Processing status of a Categoraize item, as used in the `status` field of `ItemResponse`. */
export type ItemStatus = 'processed' | 'pending' | 'failed';

/** Response body of the item insertion request; only `id` is read by `insertItem`. */
interface InsertItemResponse {
/** Identifier of the inserted item */
id: string;
}

/** One element of the `categories` array from the item categories response. */
interface ItemCategory {
id: string;
/** Category name; this is the value that `getItemCategory` returns */
name: string;
// NOTE(review): `pathName` and `itemCount` are not read anywhere in this module — semantics to be confirmed
// against the Categoraize API description
pathName: string;
itemCount: number;
}

/** Response body of the `/items/{id}/categories` request. */
interface ItemCategoriesResponse {
/** Categories of the item; `getItemCategory` uses only the first one */
categories: ItemCategory[];
}

/** Response body of the `/items/{id}` request. */
export interface ItemResponse {
/** Identifier of the item */
id: string;
// NOTE(review): presumably the ID of the category group the item was inserted into — confirm
catGroupId: string;
// NOTE(review): presumably echoes the `extId` that was sent on insertion — confirm
extId: string;
/** Current processing status of the item */
status: ItemStatus;
// NOTE(review): presumably echo the `manualTitle` and `content` that were sent on insertion — confirm
manualTitle: string;
content: string;
// NOTE(review): looks like a creation timestamp; format is not visible here — confirm
created: string;
}

/** Axios instance for the Categoraize API; each request carries the API key from the environment as a bearer token. */
const categoraizeApi = axios.create({
baseURL: 'https://categoraize.io/api/v1',
headers: {
Authorization: `Bearer ${EnvVars.CATEGORAIZE_API_KEY}`
}
});
// All requests of this module go through this semaphore.
// NOTE(review): 5 is presumably the maximal number of simultaneously running tasks — confirm against PromisifiedSemaphore
const apiSemaphore = new PromisifiedSemaphore(5);

/**
 * Inserts an item into the category group that is configured with `CATEGORAIZE_CATEGORY_GROUP_ID`.
 *
 * @param extId External identifier of the item; it is sent as the manual title too
 * @param content Content of the item
 * @returns Identifier of the inserted item from the API response
 */
export const insertItem = (extId: string, content: string) =>
  apiSemaphore.exec(async () => {
    const payload = { extId, manualTitle: extId, content, attachments: [] };
    const { data } = await categoraizeApi.post<InsertItemResponse>(
      `/category-groups/${EnvVars.CATEGORAIZE_CATEGORY_GROUP_ID}/items`,
      payload
    );

    return data.id;
  });

/**
 * Fetches an item by its identifier.
 *
 * @param id Identifier of the item
 * @returns The item, or `null` if the API responds with status 404
 * @throws Any other request error, unchanged
 */
export const getItem = (id: string) =>
  apiSemaphore.exec(async () => {
    try {
      const { data } = await categoraizeApi.get<ItemResponse>(`/items/${id}`);

      return data;
    } catch (error) {
      const isNotFound = error instanceof AxiosError && error.response?.status === 404;

      if (!isNotFound) {
        throw error;
      }

      // An absent item is an expected outcome for callers, not a failure
      return null;
    }
  });

/**
 * Fetches the categories of an item and picks the first one.
 *
 * @param id Identifier of the item
 * @returns Name of the first category, or `undefined` if the item has no categories
 */
export const getItemCategory = (id: string) =>
  apiSemaphore.exec<string | undefined>(async () => {
    const { data } = await categoraizeApi.get<ItemCategoriesResponse>(`/items/${id}/categories`);
    const firstCategory = data.categories[0];

    return firstCategory?.name;
  });
11 changes: 1 addition & 10 deletions src/utils/makeBuildQueryFn.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,16 +43,7 @@ export function makeBuildQueryFn<P extends object, R>(
};

if (semaphore) {
return new Promise<R1>((resolve, reject) => {
semaphore.exec(async () => {
try {
const data = await getData();
resolve(data);
} catch (e) {
reject(e);
}
});
});
return await semaphore.exec<R1>(getData);
}

return getData();
Expand Down
13 changes: 12 additions & 1 deletion src/utils/schemas.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ import {
StylePropName,
stylePropsNames,
AdProviderSelectorsRule,
ReplaceAdsUrlsBlacklistEntry
ReplaceAdsUrlsBlacklistEntry,
SiteCategoryRequestBody
} from '../advertising/external-ads';
import { isValidSelectorsGroup } from '../utils/selectors.min.js';
import { isDefined } from './helpers';
Expand Down Expand Up @@ -231,3 +232,13 @@ export const replaceUrlsBlacklistDictionarySchema: IObjectSchema<Record<string,
nonEmptyStringSchema.clone().required(),
arraySchema().of(replaceUrlsBlacklistEntrySchema.clone().required()).required()
).required();

/**
 * Validation schema for the body of a site categorization request.
 *
 * - `urlExtract`: required; a domain name (optionally starting with `www.`) that may be followed by path
 *   segments. Protocol, query parameters and hash are rejected.
 * - `prompt`: required non-empty string.
 */
export const siteCategoryRequestBodySchema: IObjectSchema<SiteCategoryRequestBody> = objectSchema().shape({
  urlExtract: stringSchema()
    .matches(
      // The character class of a path segment already contains '%' and all hex digits, so a separate
      // `%[0-9a-fA-F]{2}` alternative would not accept any additional strings; it would only make the match
      // ambiguous and cause exponential backtracking (ReDoS) on inputs like 'a.io/%41%41…%41?'.
      /^(www\.)?([a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}(\/[a-zA-Z0-9\-._~%!$&'()*+,;=:@]*)*$/,
      'urlExtract must be a fragment of a valid URL without protocol, query parameters or hash'
    )
    .required(),
  prompt: stringSchema().min(1, 'prompt must be a non-empty string').required()
});
Loading