-
Notifications
You must be signed in to change notification settings - Fork 28
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
setting up new example to process pdf files
- Loading branch information
1 parent
54273c4
commit 423b6c3
Showing
11 changed files
with
386 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
## Document processor example | ||
|
||
This sample app is used to interactively test the document processor. | ||
|
||
## Trademarks | ||
|
||
This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft | ||
trademarks or logos is subject to and must follow | ||
[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). | ||
Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. | ||
Any use of third-party trademarks or logos are subject to those third-party's policies. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
{ | ||
"name": "document-processor", | ||
"version": "0.0.1", | ||
"private": true, | ||
"description": "Document Processing Example", | ||
"homepage": "https://github.com/microsoft/TypeAgent#readme", | ||
"repository": { | ||
"type": "git", | ||
"url": "https://github.com/microsoft/TypeAgent.git", | ||
"directory": "ts/examples/docuProc" | ||
}, | ||
"license": "MIT", | ||
"author": "Microsoft", | ||
"type": "module", | ||
"scripts": { | ||
"build": "npm run tsc", | ||
"postbuild": "copyfiles -u 1 \"src/**/*Schema.ts\" \"src/**/*.txt\" dist", | ||
"clean": "rimraf --glob dist *.tsbuildinfo *.done.build.log", | ||
"prettier": "prettier --check . --ignore-path ../../.prettierignore", | ||
"prettier:fix": "prettier --write . --ignore-path ../../.prettierignore", | ||
"run": "node dist/main.js", | ||
"tsc": "tsc -p src" | ||
}, | ||
"dependencies": { | ||
"aiclient": "workspace:*", | ||
"chalk": "^5.3.0", | ||
"code-processor": "workspace:*", | ||
"dotenv": "^16.3.1", | ||
"fast-xml-parser": "4.5.1", | ||
"interactive-app": "workspace:*", | ||
"knowledge-processor": "workspace:*", | ||
"memory-providers": "workspace:*", | ||
"pdf-parse": "1.1.1", | ||
"typeagent": "workspace:*", | ||
"typechat": "^0.1.1", | ||
"typescript": "^5.4.2" | ||
}, | ||
"devDependencies": { | ||
"@types/node": "^18.18.7", | ||
"@types/pdf-parse": "^1.1.4", | ||
"copyfiles": "^2.4.1", | ||
"rimraf": "^5.0.5" | ||
}, | ||
"engines": { | ||
"node": ">=18", | ||
"pnpm": ">=8" | ||
} | ||
} |
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,173 @@ | ||
// Copyright (c) Microsoft Corporation. | ||
// Licensed under the MIT License. | ||
|
||
import { ArxivQuery } from "./docuProcSchema.js"; | ||
import { XMLParser } from "fast-xml-parser"; | ||
import { fetchWithRetry } from "aiclient"; | ||
import path from "path"; | ||
import fs from "fs"; | ||
import { fileURLToPath } from "url"; | ||
|
||
// Rebuild __filename/__dirname from this module's URL; downloadArxivPaper uses
// __dirname to place its "papers" output folder next to this module.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
|
||
/** A single author of an arXiv paper, as read from a parsed feed entry. */
interface ArxivPaperAuthor {
    /** Author name; printed as-is by printArxivPaperParsedData. */
    name: string;
    /** Optional affiliation; when present it is printed in parentheses after the name. */
    affiliation?: string;
}
|
||
/**
 * Shape of one feed entry returned by fetchArxivPapers. The entries are the
 * parser's output passed through unchanged, so these fields mirror the
 * elements of the XML entry rather than a cleaned-up model.
 */
interface ArxivPaper {
    /**
     * arXiv ID. getPdfUrlFromId splits this on "/" and keeps everything from
     * the fifth segment onwards as the paper id.
     */
    id: string;
    title: string;
    /**
     * Authors of the paper.
     * NOTE(review): the parser may yield a single object instead of an array
     * when an entry has exactly one author - confirm against real responses.
     */
    author: ArxivPaperAuthor[];
    summary: string;
    link: string;
    category?: string;
    primary_category?: string;
    comment?: string;
    published: string;
    journal_ref?: string;
}
|
||
/**
 * Queries the arXiv export API and returns the parsed feed entries.
 *
 * The search term is prefixed with "ti:", "au:" or "all:" according to
 * `query.searchField`. Paging and sorting fall back to start 0, 5 results,
 * "relevance" and "descending" when not supplied.
 *
 * @param query Search term plus optional field, paging and sorting settings.
 * @returns The feed entries, always as an array (a lone entry is wrapped).
 *          An empty array is returned when the feed has no entries or when
 *          any error occurs (the error is logged, not rethrown). `undefined`
 *          is returned only if the response body text is undefined.
 */
export async function fetchArxivPapers(
    query: ArxivQuery,
): Promise<any[] | undefined> {
    const apiUrl = "https://export.arxiv.org/api/query";

    // Anything other than "title" or "author" (including an omitted field)
    // searches across all fields.
    let searchPrefix: string;
    if (query.searchField === "title") {
        searchPrefix = "ti:";
    } else if (query.searchField === "author") {
        searchPrefix = "au:";
    } else {
        searchPrefix = "all:";
    }

    const queryParams = new URLSearchParams({
        search_query: `${searchPrefix}${query.searchTerm}`,
        start: String(query.start ?? 0),
        max_results: String(query.maxResults ?? 5),
        sortBy: query.sortBy ?? "relevance",
        sortOrder: query.sortOrder ?? "descending",
    });

    try {
        const requestInit: RequestInit = {
            method: "GET",
            headers: {
                Accept: "application/xml",
            },
        };
        const requestUrl = `${apiUrl}?${queryParams}`;
        const response = await fetchWithRetry(requestUrl, requestInit);

        if (!response.success) {
            throw new Error(`HTTP error! Status: ${response.message}`);
        }

        const xmlData = await response.data.text();
        if (xmlData === undefined) {
            return undefined;
        }

        // Attributes are kept so attribute-only elements survive parsing.
        const feedDocument = new XMLParser({ ignoreAttributes: false }).parse(
            xmlData,
        );

        // A feed with one entry parses to a bare object, so wrap it.
        const entries = feedDocument.feed.entry || [];
        const papers: ArxivPaper[] = Array.isArray(entries)
            ? entries
            : [entries];
        return papers;
    } catch (error) {
        console.error("Error fetching arXiv papers:", error);
        return [];
    }
}
|
||
/**
 * Logs a readable summary of each paper to the console: a numbered header,
 * then ID, title, summary and the author list.
 *
 * Missing titles and summaries are replaced by placeholder text. Authors are
 * joined with ", " and an affiliation, when present, follows the name in
 * parentheses.
 *
 * @param papers Parsed feed entries, e.g. the result of fetchArxivPapers.
 *               An empty array prints "No papers found." and nothing else.
 */
export function printArxivPaperParsedData(papers: ArxivPaper[]) {
    if (papers.length === 0) {
        console.log("No papers found.");
        return;
    }

    papers.forEach((paper, index) => {
        console.log(`Paper #${index + 1}`);
        console.log("------------");

        console.log(`ID: ${paper.id}`);
        console.log(`Title: ${paper.title || "No title available"}`);
        console.log(`Summary: ${paper.summary || "No summary available"}`);

        // A paper with exactly one author can arrive with `author` as a bare
        // object instead of an array - the same single-vs-array shape that
        // fetchArxivPapers already handles for feed entries. Normalise it so
        // a sole author is printed rather than reported as missing.
        const rawAuthor: unknown = paper.author;
        const authorList = (
            Array.isArray(rawAuthor)
                ? rawAuthor
                : rawAuthor == null
                  ? []
                  : [rawAuthor]
        ) as typeof paper.author;

        if (authorList.length > 0) {
            const authors = authorList
                .map((author) => {
                    const affiliation = author.affiliation
                        ? ` (${author.affiliation})`
                        : "";
                    return `${author.name}${affiliation}`;
                })
                .join(", ");
            console.log(`Authors: ${authors}`);
        } else {
            console.log("Authors: No authors available");
        }
    });
}
|
||
/**
 * Turns a paper id into a name usable as a file name by replacing every "/"
 * with "__".
 *
 * @param paperId Paper id, possibly containing "/" separators.
 * @returns The id with each "/" replaced by a double underscore.
 */
export function getValidFilename(paperId: string): string {
    const segments = paperId.split("/");
    return segments.join("__");
}
|
||
/**
 * Reverses getValidFilename by replacing every "__" with "/".
 *
 * Matches are taken left to right without overlap, so "a___b" becomes "a/_b".
 * An id that itself contained "__" will not round-trip.
 *
 * @param filename File name (without directory) produced from a paper id.
 * @returns The name with each double underscore replaced by "/".
 */
export function getPaperIdFromFilename(filename: string): string {
    const segments = filename.split("__");
    return segments.join("/");
}
|
||
/**
 * Derives the paper id and PDF download URL from an entry id.
 *
 * The id is split on "/" and the first four segments are discarded; whatever
 * remains, re-joined with "/", is the paper id. An id with four or fewer
 * segments therefore yields an empty paper id.
 *
 * @param id Entry id as found on a parsed paper.
 * @returns The paper id and the matching https://arxiv.org/pdf/ URL.
 */
function getPdfUrlFromId(id: string): { paperId: string; downloadUrl: string } {
    const [, , , , ...idSegments] = id.split("/");
    const paperId = idSegments.join("/");
    return { paperId, downloadUrl: `https://arxiv.org/pdf/${paperId}` };
}
|
||
/**
 * Downloads the PDF for a paper into the "papers" folder next to this module.
 *
 * The file is named after the paper id with "/" replaced by "__" and a ".pdf"
 * extension. The folder is created on first use.
 *
 * @param paper Parsed feed entry; only its `id` is used.
 * @returns The path of the written file, or `null` if anything fails (the
 *          error is logged, not rethrown).
 */
export async function downloadArxivPaper(paper: ArxivPaper) {
    const { paperId, downloadUrl } = getPdfUrlFromId(paper.id);

    const outputDir = path.join(__dirname, "papers");
    const pdfName = `${getValidFilename(paperId)}.pdf`;
    const filePath = path.join(outputDir, pdfName);

    try {
        if (!fs.existsSync(outputDir)) {
            fs.mkdirSync(outputDir, { recursive: true });
        }

        const requestInit: RequestInit = {
            method: "GET",
            headers: {
                Accept: "application/pdf",
            },
        };

        const response = await fetchWithRetry(downloadUrl, requestInit);
        if (!response.success) {
            throw new Error(`Failed to download paper: ${response.message}`);
        }

        // Read the whole body into memory, then write it out in one go.
        const pdfBlob = await response.data.blob();
        const pdfBytes = await pdfBlob.arrayBuffer();
        fs.writeFileSync(filePath, Buffer.from(pdfBytes));

        console.log(`Downloaded paper: ${filePath}`);
        return filePath;
    } catch (error) {
        console.error("Error downloading paper:", error);
        return null;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
// Copyright (c) Microsoft Corporation. | ||
// Licensed under the MIT License. | ||
|
||
/** Parameters for an arXiv search, as consumed by fetchArxivPapers. */
export interface ArxivQuery {
    /** Text to search for. */
    searchTerm: string;
    /**
     * Field to search in: "title" and "author" restrict the search to that
     * field; "all" (also used when omitted) searches every field.
     */
    searchField?: "title" | "author" | "all";
    /** Index of the first result to return; 0 when omitted. */
    start?: number;
    /** Maximum number of results to return; 5 when omitted. */
    maxResults?: number;
    /** Sort key; "relevance" when omitted. */
    sortBy?: "relevance" | "lastUpdatedDate" | "submittedDate";
    /** Sort direction; "descending" when omitted. */
    sortOrder?: "ascending" | "descending";
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
// Copyright (c) Microsoft Corporation. | ||
// Licensed under the MIT License. | ||
|
||
import dotenv from "dotenv"; | ||
import { | ||
downloadArxivPaper, | ||
fetchArxivPapers, | ||
printArxivPaperParsedData, | ||
} from "./docuProc.js"; | ||
|
||
// Entry point: search arXiv for a fixed sample query, print the results and
// download each paper's PDF.

// Load environment settings from the .env file three directories above this module.
const envPath = new URL("../../../.env", import.meta.url);
dotenv.config({ path: envPath });

console.log("Lets start processing your documents ...");
const papers: any[] | undefined = await fetchArxivPapers({
    searchTerm: "transformer",
    searchField: "title",
    maxResults: 3,
});
if (papers !== undefined && papers.length > 0) {
    console.log(`Found ${papers.length} papers`);
    console.log("Downloading papers ...");
    console.log("---------------------------------");

    printArxivPaperParsedData(papers);

    // forEach ignores the promises returned by an async callback, so the
    // downloads were never awaited. Promise.all keeps them concurrent while
    // making the script wait for every one to settle before continuing.
    await Promise.all(
        papers.map(async (paper) => {
            try {
                await downloadArxivPaper(paper);
            } catch (error) {
                console.error("Error downloading paper:", error);
            }
        }),
    );
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
{ | ||
"extends": "../../../tsconfig.base.json", | ||
"compilerOptions": { | ||
"outDir": "../dist" | ||
} | ||
} |
Oops, something went wrong.