Skip to content

Commit

Permalink
setting up new example to process pdf files
Browse files Browse the repository at this point in the history
  • Loading branch information
pcdeadeasy committed Feb 4, 2025
1 parent 54273c4 commit 423b6c3
Show file tree
Hide file tree
Showing 11 changed files with 386 additions and 5 deletions.
16 changes: 15 additions & 1 deletion ts/.vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,21 @@
"outFiles": [
"${workspaceFolder}/**/*.js"
]
},
},
{
"name": "Launch Document Processor",
"type": "node",
"request": "launch",
"skipFiles": [
"<node_internals>/**"
],
"program": "${workspaceFolder}/examples/docuProc/src/main.ts",
"console": "integratedTerminal",
//"preLaunchTask": "pnpm: build",
"outFiles": [
"${workspaceFolder}/**/*.js"
],
},
]
}

11 changes: 11 additions & 0 deletions ts/examples/docuProc/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
## Document processor example

This sample app is used to interactively test the document processor.

## Trademarks

This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
trademarks or logos is subject to and must follow
[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
Any use of third-party trademarks or logos are subject to those third-party's policies.
48 changes: 48 additions & 0 deletions ts/examples/docuProc/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
{
"name": "document-processor",
"version": "0.0.1",
"private": true,
"description": "Document Processing Example",
"homepage": "https://github.com/microsoft/TypeAgent#readme",
"repository": {
"type": "git",
"url": "https://github.com/microsoft/TypeAgent.git",
"directory": "ts/examples/docuProc"
},
"license": "MIT",
"author": "Microsoft",
"type": "module",
"scripts": {
"build": "npm run tsc",
"postbuild": "copyfiles -u 1 \"src/**/*Schema.ts\" \"src/**/*.txt\" dist",
"clean": "rimraf --glob dist *.tsbuildinfo *.done.build.log",
"prettier": "prettier --check . --ignore-path ../../.prettierignore",
"prettier:fix": "prettier --write . --ignore-path ../../.prettierignore",
"run": "node dist/main.js",
"tsc": "tsc -p src"
},
"dependencies": {
"aiclient": "workspace:*",
"chalk": "^5.3.0",
"code-processor": "workspace:*",
"dotenv": "^16.3.1",
"fast-xml-parser": "4.5.1",
"interactive-app": "workspace:*",
"knowledge-processor": "workspace:*",
"memory-providers": "workspace:*",
"pdf-parse": "1.1.1",
"typeagent": "workspace:*",
"typechat": "^0.1.1",
"typescript": "^5.4.2"
},
"devDependencies": {
"@types/node": "^18.18.7",
"@types/pdf-parse": "^1.1.4",
"copyfiles": "^2.4.1",
"rimraf": "^5.0.5"
},
"engines": {
"node": ">=18",
"pnpm": ">=8"
}
}
Binary file added ts/examples/docuProc/papers/2103.00112v3.pdf
Binary file not shown.
Binary file added ts/examples/docuProc/papers/2307.01189v2.pdf
Binary file not shown.
Binary file not shown.
173 changes: 173 additions & 0 deletions ts/examples/docuProc/src/docuProc.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

import { ArxivQuery } from "./docuProcSchema.js";
import { XMLParser } from "fast-xml-parser";
import { fetchWithRetry } from "aiclient";
import path from "path";
import fs from "fs";
import { fileURLToPath } from "url";

// ES modules do not provide the CommonJS `__filename` / `__dirname` globals,
// so rebuild them from this module's own URL. `__dirname` is used below as
// the base directory for downloaded papers.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/**
 * One author of an arXiv paper, as read from a parsed feed entry.
 */
interface ArxivPaperAuthor {
// Author's display name; printed as-is by printArxivPaperParsedData.
name: string;
// Optional affiliation; when present it is printed in parentheses after the name.
// NOTE(review): the parser may expose this under a namespaced key rather than
// `affiliation` - confirm against a real API response.
affiliation?: string;
}

/**
 * Shape this module expects for one entry of the arXiv query feed.
 *
 * NOTE(review): fetchArxivPapers assigns the XML parser's output to this type
 * without validation, so the declared field types are an expectation rather
 * than a guarantee - confirm against a real response before relying on them.
 */
interface ArxivPaper {
// Entry identifier. getPdfUrlFromId treats it as a "/"-separated string and
// derives the paper id from its trailing segments.
id: string; // arXiv ID
title: string;
// Declared as a list; presumably a paper with a single author may arrive from
// the parser as one object instead - TODO confirm.
author: ArxivPaperAuthor[];
summary: string;
link: string;
category?: string;
primary_category?: string;
comment?: string;
published: string;
journal_ref?: string;
}

/**
 * Queries the arXiv export API and returns the entries of the resulting feed.
 *
 * The search term is prefixed with `ti:`, `au:` or `all:` depending on
 * `query.searchField` (defaulting to `all:`). Paging and sorting fall back to
 * start=0, max_results=5, sortBy=relevance and sortOrder=descending when the
 * corresponding query fields are omitted.
 *
 * @param query Search term plus optional field, paging and sorting settings.
 * @returns The feed entries as produced by the XML parser, always as an array
 *          and with each entry's `author` wrapped in an array when the parser
 *          yielded a single object. Returns `[]` when the request or parsing
 *          fails (the error is logged) or when the response contains no
 *          entries, and `undefined` only if the response body yields no text.
 */
export async function fetchArxivPapers(
    query: ArxivQuery,
): Promise<any[] | undefined> {
    const apiUrl = "https://export.arxiv.org/api/query";

    let searchPrefix = "";
    switch (query.searchField) {
        case "title":
            searchPrefix = "ti:";
            break;
        case "author":
            searchPrefix = "au:";
            break;
        case "all":
        default:
            searchPrefix = "all:";
            break;
    }

    const queryParams = new URLSearchParams({
        search_query: `${searchPrefix}${query.searchTerm}`,
        start: String(query.start ?? 0),
        max_results: String(query.maxResults ?? 5),
        sortBy: query.sortBy ?? "relevance",
        sortOrder: query.sortOrder ?? "descending",
    });

    try {
        const options: RequestInit = {
            method: "GET",
            headers: {
                Accept: "application/xml",
            },
        };
        const response = await fetchWithRetry(
            `${apiUrl}?${queryParams}`,
            options,
        );

        if (!response.success) {
            throw new Error(`HTTP error! Status: ${response.message}`);
        }
        const xmlData = await response.data.text();
        if (xmlData === undefined) {
            return undefined;
        }

        const parser = new XMLParser({ ignoreAttributes: false });
        const parsedXml = parser.parse(xmlData);

        // A body without a <feed> root (empty or non-Atom response) simply has
        // no entries; do not let it surface as a TypeError.
        const entries = parsedXml?.feed?.entry || [];
        const entryList: any[] = Array.isArray(entries) ? entries : [entries];

        // The parser yields a bare object instead of a one-element array when
        // a tag occurs once. That is handled for <entry> above; do the same
        // for <author> so the result matches ArxivPaper.author.
        const papers: ArxivPaper[] = entryList.map((entry: any) =>
            entry !== null &&
            typeof entry === "object" &&
            entry.author !== undefined &&
            !Array.isArray(entry.author)
                ? { ...entry, author: [entry.author] }
                : entry,
        );

        return papers;
    } catch (error) {
        console.error("Error fetching arXiv papers:", error);
        return [];
    }
}

/**
 * Prints a short, human-readable summary of each paper to the console.
 *
 * For every paper this logs a numbered header, the id, the title, the summary
 * and the author list (each author followed by its affiliation in
 * parentheses when one is present). Logs "No papers found." for an empty list.
 *
 * @param papers Papers to print, typically the result of fetchArxivPapers.
 */
export function printArxivPaperParsedData(papers: ArxivPaper[]) {
    if (papers.length === 0) {
        console.log("No papers found.");
        return;
    }

    papers.forEach((paper, index) => {
        console.log(`Paper #${index + 1}`);
        console.log("------------");

        console.log(`ID: ${paper.id}`);
        console.log(`Title: ${paper.title || "No title available"}`);
        console.log(`Summary: ${paper.summary || "No summary available"}`);

        // Parsed feed data may carry a single author as a bare object rather
        // than a one-element array; accept both so that such papers are not
        // reported as having no authors.
        const authorList: { name: string; affiliation?: string }[] =
            Array.isArray(paper.author)
                ? paper.author
                : paper.author
                  ? [paper.author]
                  : [];

        if (authorList.length > 0) {
            // NOTE(review): if the parser exposes affiliation under a
            // namespaced key, `author.affiliation` will always be empty here.
            const authors = authorList
                .map((author) => {
                    const affiliation = author.affiliation
                        ? ` (${author.affiliation})`
                        : "";
                    return `${author.name}${affiliation}`;
                })
                .join(", ");
            console.log(`Authors: ${authors}`);
        } else {
            console.log("Authors: No authors available");
        }
    });
}

/**
 * Converts a paper id into a string usable as a single file name by
 * replacing every "/" with "__".
 *
 * @param paperId Paper id, possibly containing "/" separators.
 * @returns The id with each "/" replaced by "__".
 */
export function getValidFilename(paperId: string): string {
    const segments = paperId.split("/");
    return segments.join("__");
}

/**
 * Reverses getValidFilename's substitution by replacing every "__" in a file
 * name with "/".
 *
 * @param filename File name in which "__" stands for "/".
 * @returns The name with each "__" replaced by "/".
 */
export function getPaperIdFromFilename(filename: string): string {
    const segments = filename.split("__");
    return segments.join("/");
}

/**
 * Derives the bare paper id and the PDF download URL from an entry id.
 *
 * The id is normally an abstract URL of the form `<scheme>://<host>/abs/<id>`;
 * everything after `/abs/` is the paper id (which may itself contain a "/").
 * If the marker is absent, the original positional rule (drop the first four
 * "/"-separated segments) is applied, and if that leaves nothing the input
 * itself is treated as the paper id, so a bare id never produces an empty
 * download URL.
 *
 * @param id Entry id, usually an abstract URL.
 * @returns The paper id and the matching `https://arxiv.org/pdf/...` URL.
 */
function getPdfUrlFromId(id: string): { paperId: string; downloadUrl: string } {
    const absMarker = "/abs/";
    const trimmed = id.trim();
    const markerAt = trimmed.indexOf(absMarker);

    const pid =
        markerAt >= 0
            ? trimmed.slice(markerAt + absMarker.length)
            : trimmed.split("/").slice(4).join("/") || trimmed;

    return { paperId: pid, downloadUrl: `https://arxiv.org/pdf/${pid}` };
}

/**
 * Downloads the PDF of a paper into the `papers` directory next to this
 * module and returns the path of the written file.
 *
 * The file name is the paper id with "/" replaced by "__", plus ".pdf".
 * Failures (unusable id, unsuccessful request, file-system error) are logged
 * and reported as `null` rather than thrown.
 *
 * NOTE(review): the directory is resolved from this module's runtime
 * location, so when running the compiled output the files presumably land
 * under the build output directory rather than the source tree - confirm
 * this is intended.
 *
 * @param paper Paper whose `id` identifies the PDF to download.
 * @returns The path of the saved PDF, or `null` if the download failed.
 */
export async function downloadArxivPaper(
    paper: ArxivPaper,
): Promise<string | null> {
    const arxivInfo = getPdfUrlFromId(paper.id);

    const outputDir = path.join(__dirname, "papers");
    const filePath = path.join(
        outputDir,
        `${getValidFilename(arxivInfo.paperId)}.pdf`,
    );

    try {
        // Without an id the request would target the bare PDF root and the
        // result would be written to a file named ".pdf".
        if (arxivInfo.paperId === "") {
            throw new Error(`Cannot derive a paper id from "${paper.id}"`);
        }

        // `recursive: true` makes mkdir a no-op when the directory already
        // exists, so no separate existence check is needed. The promise-based
        // API keeps concurrent downloads from blocking each other.
        await fs.promises.mkdir(outputDir, { recursive: true });

        const options: RequestInit = {
            method: "GET",
            headers: {
                Accept: "application/pdf",
            },
        };

        const response = await fetchWithRetry(arxivInfo.downloadUrl, options);
        if (!response.success) {
            throw new Error(`Failed to download paper: ${response.message}`);
        }

        const pdfBlob = await response.data.blob();
        const buffer = Buffer.from(await pdfBlob.arrayBuffer());
        await fs.promises.writeFile(filePath, buffer);

        console.log(`Downloaded paper: ${filePath}`);
        return filePath;
    } catch (error) {
        console.error("Error downloading paper:", error);
        return null;
    }
}
11 changes: 11 additions & 0 deletions ts/examples/docuProc/src/docuProcSchema.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

// Describes a search against the arXiv query API, as consumed by fetchArxivPapers.
export interface ArxivQuery {
// Text to search for; it is appended after the field prefix chosen by searchField.
searchTerm: string;
// Which field to search: "title" maps to the "ti:" prefix, "author" to "au:",
// and "all" (or omitting the field) to "all:".
searchField?: "title" | "author" | "all";
// Value sent as the "start" parameter; fetchArxivPapers sends 0 when omitted.
start?: number;
// Value sent as the "max_results" parameter; fetchArxivPapers sends 5 when omitted.
maxResults?: number;
// Sort key; fetchArxivPapers sends "relevance" when omitted.
sortBy?: "relevance" | "lastUpdatedDate" | "submittedDate";
// Sort direction; fetchArxivPapers sends "descending" when omitted.
sortOrder?: "ascending" | "descending";
}
33 changes: 33 additions & 0 deletions ts/examples/docuProc/src/main.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

import dotenv from "dotenv";
import {
downloadArxivPaper,
fetchArxivPapers,
printArxivPaperParsedData,
} from "./docuProc.js";

// Load environment settings from the .env file three directories above this module.
const envPath = new URL("../../../.env", import.meta.url);
dotenv.config({ path: envPath });

// Example run: search arXiv by title, print what was found, then download the PDFs.
console.log("Lets start processing your documents ...");
const papers: any[] | undefined = await fetchArxivPapers({
    searchTerm: "transformer",
    searchField: "title",
    maxResults: 3,
});
if (papers !== undefined && papers.length > 0) {
    console.log(`Found ${papers.length} papers`);
    console.log("Downloading papers ...");
    console.log("---------------------------------");

    printArxivPaperParsedData(papers);

    // forEach ignores the promises returned by an async callback, so the
    // downloads were never awaited. Map to promises and await them all; the
    // downloads still run concurrently, and each one handles its own failure
    // so a single bad paper does not abort the rest.
    await Promise.all(
        papers.map(async (paper) => {
            try {
                await downloadArxivPaper(paper);
            } catch (error) {
                console.error("Error downloading paper:", error);
            }
        }),
    );
}
6 changes: 6 additions & 0 deletions ts/examples/docuProc/src/tsconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"extends": "../../../tsconfig.base.json",
"compilerOptions": {
"outDir": "../dist"
}
}
Loading

0 comments on commit 423b6c3

Please sign in to comment.