diff --git a/Cargo.lock b/Cargo.lock index 7a30139e5..78c7326d6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -113,7 +113,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "rand 0.8.5", - "sha1", + "sha1 0.10.4", "smallvec", "tracing", "zstd", @@ -334,6 +334,15 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234" +[[package]] +name = "adobe-cmap-parser" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3aaf5066d68c8ec9656cfd3a96bc9de83d4883f183d6c6b8d742e36a4819dda" +dependencies = [ + "pom 1.1.0", +] + [[package]] name = "ahash" version = "0.7.6" @@ -395,9 +404,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.64" +version = "1.0.66" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9a8f622bcf6ff3df478e9deba3e03e4e04b300f8e6a139e192c05fa3490afc7" +checksum = "216261ddc8289130e551ddcd5ce8a064710c0d064a4d2895c67151c92b5443f6" [[package]] name = "arc-swap" @@ -488,6 +497,7 @@ dependencies = [ "actix-web-static-files", "assert_cmd", "atomic_lib", + "atomizer", "base64 0.13.0", "chrono", "clap 4.0.26", @@ -566,6 +576,16 @@ dependencies = [ "urlencoding", ] +[[package]] +name = "atomizer" +version = "0.1.0" +dependencies = [ + "atomic_lib", + "kamadak-exif", + "mime_guess", + "pdf-extract", +] + [[package]] name = "attohttpc" version = "0.19.1" @@ -601,6 +621,12 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +[[package]] +name = "base-x" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cbbc9d0964165b47557570cce6c952866c2678457aca742aafc9fb771d30270" + [[package]] name = "base64" version = "0.13.0" @@ -1012,6 +1038,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "const_fn" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbdcdcb6d86f71c5e97409ad45898af11cbc995b4ee8112d59095a28d376c935" + [[package]] name = "convert_case" version = "0.4.0" @@ -1353,9 +1385,9 @@ checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" [[package]] name = "digest" -version = "0.10.3" +version = "0.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2fb860ca6fafa5552fb6d0e816a69c8e49f0908bf524e30a90d97c85892d506" +checksum = "8168378f4e5023e7218c89c891c0fd8ecdb5e5e4f18cb78f38cf245dd021e76f" dependencies = [ "block-buffer", "crypto-common", @@ -1411,6 +1443,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "discard" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "212d0f5754cb6769937f4501cc0e67f4f4483c8d2c3e1e922ee9edbe4ab4c7c0" + [[package]] name = "dispatch" version = "0.2.0" @@ -1491,6 +1529,70 @@ version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" +[[package]] +name = "encoding" +version = "0.2.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" +dependencies = [ + "encoding-index-japanese", + "encoding-index-korean", + "encoding-index-simpchinese", + "encoding-index-singlebyte", + "encoding-index-tradchinese", +] + +[[package]] +name = "encoding-index-japanese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-korean" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-simpchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-singlebyte" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-tradchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding_index_tests" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" + [[package]] name = "encoding_rs" version = "0.8.31" @@ -1537,6 +1639,15 @@ dependencies = [ "str-buf", ] +[[package]] +name = "euclid" +version = "0.20.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bb7ef65b3777a325d1eeefefab5b6d4959da54747e33bd6258e789640f307ad" +dependencies = [ + "num-traits", +] + [[package]] name = "fail" version = "0.5.0" @@ -2493,6 +2604,15 @@ dependencies = [ "treediff", ] +[[package]] +name = "kamadak-exif" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef4fc70d0ab7e5b6bafa30216a6b48705ea964cdfc29c050f2412295eba58077" +dependencies = [ + "mutate_once", +] + [[package]] name = "kuchiki" version = "0.8.1" @@ -2587,6 +2707,12 @@ dependencies = [ "safemem", ] +[[package]] +name = "linked-hash-map" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8dd5a6d5999d9907cda8ed67bbd137d3af8085216c2ac62de5be860bd41f304a" + [[package]] name = "linux-raw-sys" version = "0.0.46" @@ -2665,6 +2791,23 @@ dependencies = [ "tracing-subscriber", ] +[[package]] +name = "lopdf" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b49a0272112719d0037ab63d4bb67f73ba659e1e90bc38f235f163a457ac16f3" +dependencies = [ + "dtoa", + "encoding", + "flate2", + "itoa 0.4.8", + "linked-hash-map", + "log", + "lzw", + "pom 3.2.0", + "time 0.2.27", +] + [[package]] name = "lru" version = "0.7.8" @@ -2680,6 +2823,12 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a8cbbb2831780bc3b9c15a41f5b49222ef756b6730a95f3decfdd15903eb5a3" +[[package]] +name = "lzw" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d947cbb889ed21c2a84be6ffbaebf5b4e0f4340638cba0444907e38b56be084" + [[package]] name = "mac" version = "0.1.1" @@ -2835,6 +2984,12 @@ dependencies = [ "byteorder", ] +[[package]] +name = "mutate_once" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16cf681a23b4d0a43fc35024c176437f9dcd818db34e0f42ab456a0ee5ad497b" + [[package]] name = "native-tls" version = "0.2.10" @@ -3398,6 +3553,22 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8835116a5c179084a830efb3adc117ab007512b535bc1a21c991d3b32a6b44dd" +[[package]] +name = "pdf-extract" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7875466ea3ecc4b763c4946993d5dcdf4f6e3a67e2f293e506a4a9ec551759" +dependencies = [ + "adobe-cmap-parser", + "encoding", + "euclid", + "linked-hash-map", + "lopdf", + "postscript", + "type1-encoding-parser", + "unicode-normalization", +] + [[package]] name = "pem" version = "1.1.1" @@ -3625,6 +3796,24 @@ dependencies = [ "miniz_oxide", ] +[[package]] +name = "pom" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60f6ce597ecdcc9a098e7fddacb1065093a3d66446fa16c675e7e71d1b5c28e6" + +[[package]] +name = "pom" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07e2192780e9f8e282049ff9bffcaa28171e1cb0844f49ed5374e518ae6024ec" + +[[package]] +name = "postscript" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78451badbdaebaf17f053fd9152b3ffb33b516104eacb45e7864aaa9c712f306" + [[package]] name = "ppv-lite86" version = "0.2.16" @@ -4013,6 +4202,15 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +[[package]] +name = "rustc_version" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" +dependencies = [ + "semver 0.9.0", +] + [[package]] name = "rustc_version" version = "0.3.3" @@ -4214,13 +4412,22 @@ dependencies = [ "thin-slice", ] +[[package]] +name = "semver" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" +dependencies = [ + "semver-parser 0.7.0", +] + [[package]] name = "semver" version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f301af10236f6df4160f7c3f04eec6dbc70ace82d23326abad5edee88801c6b6" dependencies = [ - "semver-parser", + "semver-parser 0.10.2", ] [[package]] @@ -4232,6 +4439,12 @@ dependencies = [ "serde", ] +[[package]] +name = "semver-parser" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" + [[package]] name = "semver-parser" version = "0.10.2" @@ -4263,9 +4476,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.85" +version = "1.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e55a28e3aaef9d5ce0506d0a14dbba8054ddc7e499ef522dd8b26859ec9d4a44" +checksum = "020ff22c755c2ed3f8cf162dbb41a7268d934702f3ed3631656ea597e08fc3db" dependencies = [ "itoa 1.0.3", "ryu", @@ -4358,6 +4571,15 @@ dependencies = [ "stable_deref_trait", ] +[[package]] +name = "sha1" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1da05c97445caa12d05e848c4a4fcbbea29e748ac28f7e80e9b010392063770" +dependencies = [ + "sha1_smol", +] + [[package]] name = "sha1" version = "0.10.4" @@ -4369,6 +4591,12 @@ dependencies = [ "digest", ] +[[package]] +name = "sha1_smol" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012" + [[package]] name = "sha2" version = "0.10.5" @@ -4447,9 +4675,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fd0db749597d91ff862fd1d55ea87f7855a744a8425a64695b6fca237d1dad1" +checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" [[package]] name = "socket2" @@ -4501,6 +4729,15 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +[[package]] +name = "standback" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e113fb6f3de07a243d434a56ec6f186dfd51cb08448239fe7bcae73f87ff28ff" +dependencies = [ + "version_check", +] + [[package]] name = "state" version = "0.5.3" @@ -4521,6 +4758,55 @@ dependencies = [ "path-slash", ] +[[package]] +name = "stdweb" +version = "0.4.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d022496b16281348b52d0e30ae99e01a73d737b2f45d38fed4edf79f9325a1d5" +dependencies = [ + "discard", + "rustc_version 0.2.3", + "stdweb-derive", + "stdweb-internal-macros", + "stdweb-internal-runtime", + "wasm-bindgen", +] + +[[package]] +name = "stdweb-derive" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c87a60a40fccc84bef0652345bbbbbe20a605bf5d0ce81719fc476f5c03b50ef" +dependencies = [ + "proc-macro2", + "quote", + "serde", + "serde_derive", + "syn", +] + +[[package]] +name = "stdweb-internal-macros" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58fa5ff6ad0d98d1ffa8cb115892b6e69d67799f6763e162a1c9db421dc22e11" +dependencies = [ + "base-x", + "proc-macro2", + "quote", + "serde", + "serde_derive", + "serde_json", + "sha1 0.6.1", + "syn", +] + +[[package]] +name = "stdweb-internal-runtime" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "213701ba3370744dcd1a12960caa4843b3d68b4d1c0a5d575e0d65b2ee9d16c0" + [[package]] name = "str-buf" version = "1.0.6" @@ -5025,18 +5311,18 @@ checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c" [[package]] name = "thiserror" -version = "1.0.34" +version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c1b05ca9d106ba7d2e31a9dab4a64e7be2cce415321966ea3132c49a656e252" +checksum = "10deb33631e3c9018b9baf9dcbbc4f737320d2b576bac10f6aefa048fa407e3e" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.34" +version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8f2591983642de85c921015f3f070c665a197ed69e417af436115e3a1407487" +checksum = "982d17546b47146b28f7c22e3d08465f6b8903d0ea13c1660d9d84a6e7adcdbb" dependencies = [ "proc-macro2", "quote", @@ -5085,6 +5371,21 @@ dependencies = [ "winapi", ] +[[package]] +name = "time" +version = "0.2.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4752a97f8eebd6854ff91f1c1824cd6160626ac4bd44287f7f4ea2035a02a242" +dependencies = [ + "const_fn", + "libc", + "standback", + "stdweb", + "time-macros 0.1.1", + "version_check", + "winapi", +] + [[package]] name = "time" version = "0.3.14" @@ -5095,7 +5396,17 @@ dependencies = [ "libc", "num_threads", "serde", - "time-macros", + "time-macros 0.2.4", +] + +[[package]] +name = "time-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "957e9c6e26f12cb6d0dd7fc776bb67a706312e7299aed74c8dd5b17ebb27e2f1" +dependencies = [ + "proc-macro-hack", + "time-macros-impl", ] [[package]] @@ -5104,6 +5415,19 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42657b1a6f4d817cda8e7a0ace261fe0cc946cf3a80314390b22cc61ae080792" +[[package]] +name = "time-macros-impl" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd3c141a1b43194f3f56a1411225df8646c55781d5f26db825b3d98507eb482f" +dependencies = [ + "proc-macro-hack", + "proc-macro2", + "quote", + "standback", + "syn", +] + [[package]] name = "tinytemplate" version = "1.2.1" @@ -5131,9 +5455,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] name = "tokio" -version = "1.21.0" +version = "1.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89797afd69d206ccd11fb0ea560a44bbb87731d020670e79416d442919257d42" +checksum = "d76ce4a75fb488c605c54bf610f221cea8b0dafb53333c1a67e8ee199dcd2ae3" dependencies = [ "autocfg", "bytes", @@ -5141,7 +5465,6 @@ dependencies = [ "memchr", "mio", "num_cpus", - "once_cell", "parking_lot 0.12.1", "pin-project-lite", "signal-hook-registry", @@ -5360,6 +5683,15 @@ dependencies = [ "unchecked-index", ] +[[package]] +name = "type1-encoding-parser" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3d6cc09e1a99c7e01f2afe4953789311a1c50baebbdac5b477ecf78e2e92a5b" +dependencies = [ + "pom 1.1.0", +] + [[package]] name = "typenum" version = "1.15.0" diff --git a/Cargo.toml b/Cargo.toml index 278d69296..ce86c4845 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,6 @@ [workspace] members = [ + "atomizer", "server", "cli", "lib", diff --git a/atomizer/Cargo.toml b/atomizer/Cargo.toml new file mode 100644 index 000000000..191a44561 --- /dev/null +++ b/atomizer/Cargo.toml @@ -0,0 +1,13 @@ +[package] +description = "Turn files into Atomic Data." +edition = "2021" +name = "atomizer" +version = "0.1.0" + +[dependencies] +atomic_lib = {version = "0.34.0", path = "../lib"} +kamadak-exif = "0.5.5" +mime_guess = "2.0.4" +# Preferably use the OG version, but we're waiting for a +# https://github.com/jrmuizel/pdf-extract/pull/48 +pdf-extract = {repository = "https://github.com/joepio/pdf-extract/"} diff --git a/atomizer/src/file.rs b/atomizer/src/file.rs new file mode 100644 index 000000000..5b6182223 --- /dev/null +++ b/atomizer/src/file.rs @@ -0,0 +1,58 @@ +use std::{collections::HashMap, error::Error, io::Read}; + +use atomic_lib::resources::PropVals; +use mime_guess::Mime; + +pub struct File { + filename: String, + mime: Mime, + bytes: Vec, +} + +impl File { + pub fn open(filename: &str) -> Result> { + let file = std::fs::File::open(filename)?; + let bytes = std::io::BufReader::new(file) + .bytes() + .collect::, _>>()?; + let mime = mime_guess::from_path(filename).first_or_octet_stream(); + + Ok(File { + filename: filename.to_string(), + mime, + bytes, + }) + } + + pub fn from_filename_bytes(filename: &str, bytes: Vec) -> Result> { + let mime = mime_guess::from_path(filename).first_or_octet_stream(); + + Ok(File { + filename: filename.to_string(), + mime, + bytes, + }) + } + + /// Creates property-value combinations based on the file's contents. + /// Defaults to an empty HashMap if the file type is not supported. + pub fn to_propvals(self) -> PropVals { + match self.mime.to_string().as_str() { + "application/pdf" => crate::pdf::atomize(self), + "image/jpeg" => crate::image::atomize(self), + _ => HashMap::new(), + } + } + + pub fn bytes(&mut self) -> Vec { + self.bytes.clone() + } + + pub fn mime(&self) -> &Mime { + &self.mime + } + + pub fn filename(&self) -> &str { + &self.filename + } +} diff --git a/atomizer/src/image.rs b/atomizer/src/image.rs new file mode 100644 index 000000000..77bf52be1 --- /dev/null +++ b/atomizer/src/image.rs @@ -0,0 +1,62 @@ +use atomic_lib::resources::PropVals; +use exif::{In, Tag}; + +const DATE_TIME: &str = "date_time"; + +// These should map to Atomic Data Properties +fn map_tag(tag: Tag) -> String { + match tag { + Tag::PixelXDimension => "pixel_x_dimension", + Tag::XResolution => "x_resolution", + Tag::ImageDescription => "image_description", + Tag::DateTime => DATE_TIME, + _ => "unknown", + } + .to_string() +} + +/// Extracts the location from an image file's EXIF data. +pub fn atomize(mut file: crate::file::File) -> PropVals { + let mut props = PropVals::new(); + + println!("Reading EXIF data from {}", file.filename()); + + let mut buf_reader = std::io::BufReader::new(std::io::Cursor::new(file.bytes())); + let exif = exif::Reader::new() + .read_from_container(&mut buf_reader) + .unwrap(); + + let tag_list = [ + Tag::PixelXDimension, + Tag::XResolution, + Tag::ImageDescription, + Tag::DateTime, + ]; + + for tag in tag_list { + if let Some(field) = exif.get_field(tag, In::PRIMARY) { + props.insert( + map_tag(tag), + atomic_lib::Value::String(field.display_value().to_string()), + ); + println!("{}: {}", field.tag, field.display_value().with_unit(&exif)); + } + } + + props +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::file::File; + + #[test] + fn load_image() { + let f = File::open("./test/image.jpg").unwrap(); + let propvals = f.to_propvals(); + let dt = propvals.get(DATE_TIME).unwrap(); + println!("Date: {}", dt); + assert!(dt.to_string().contains("2008")); + } +} diff --git a/atomizer/src/lib.rs b/atomizer/src/lib.rs new file mode 100644 index 000000000..8c8b1c161 --- /dev/null +++ b/atomizer/src/lib.rs @@ -0,0 +1,3 @@ +pub mod file; +mod image; +mod pdf; diff --git a/atomizer/src/pdf.rs b/atomizer/src/pdf.rs new file mode 100644 index 000000000..88e370c0d --- /dev/null +++ b/atomizer/src/pdf.rs @@ -0,0 +1,26 @@ +use atomic_lib::resources::PropVals; + +const CONTENT_PROP: &str = atomic_lib::urls::DESCRIPTION; + +/// Extracts the text from a PDF file. +pub fn atomize(mut file: crate::file::File) -> PropVals { + let mut props = PropVals::new(); + let bytes = file.bytes(); + let text = pdf_extract::extract_text_from_mem(&bytes).unwrap(); + props.insert(CONTENT_PROP.into(), atomic_lib::Value::Markdown(text)); + props +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::file::File; + + #[test] + fn load_pdf() { + let f = File::open("./test/docs-demo.pdf").unwrap(); + let propvals = f.to_propvals(); + let content = propvals.get(CONTENT_PROP).unwrap(); + assert!(content.to_string().contains("Atomic Data")); + } +} diff --git a/atomizer/test/docs-demo.pdf b/atomizer/test/docs-demo.pdf new file mode 100644 index 000000000..115c5c7e8 Binary files /dev/null and b/atomizer/test/docs-demo.pdf differ diff --git a/atomizer/test/image.jpg b/atomizer/test/image.jpg new file mode 100644 index 000000000..06171f8c1 Binary files /dev/null and b/atomizer/test/image.jpg differ diff --git a/atomizer/test/simple.pdf b/atomizer/test/simple.pdf new file mode 100644 index 000000000..dbf091df9 Binary files /dev/null and b/atomizer/test/simple.pdf differ diff --git a/lib/src/client.rs b/lib/src/client.rs index 6250c64c0..650493914 100644 --- a/lib/src/client.rs +++ b/lib/src/client.rs @@ -1,6 +1,4 @@ //! Functions for interacting with an Atomic Server -use url::Url; - use crate::{ agents::Agent, commit::sign_message, diff --git a/lib/src/populate.rs b/lib/src/populate.rs index c8d99a7bc..45ecfbb9e 100644 --- a/lib/src/populate.rs +++ b/lib/src/populate.rs @@ -9,7 +9,7 @@ use crate::{ parse::ParseOpts, schema::{Class, Property}, storelike::Query, - urls, Storelike, Value, + urls, Resource, Storelike, Value, }; /// Populates a store with some of the most fundamental Properties and Classes needed to bootstrap the whole. diff --git a/server/Cargo.toml b/server/Cargo.toml index 0324ee32d..ed47fa5a3 100644 --- a/server/Cargo.toml +++ b/server/Cargo.toml @@ -20,6 +20,7 @@ actix-cors = "0.6" actix-files = "0.6" actix-multipart = "0.4" actix-web-actors = "4" +atomizer = { version = "0.1.0", path = "../atomizer" } base64 = "0.13" chrono = "0.4" colored = "2" diff --git a/server/src/handlers/upload.rs b/server/src/handlers/upload.rs index 9a9d8fb12..7b98f8b84 100644 --- a/server/src/handlers/upload.rs +++ b/server/src/handlers/upload.rs @@ -67,7 +67,7 @@ pub async fn upload_handler( let mut file_path = appstate.config.uploads_path.clone(); file_path.push(&file_id); - let mut file = std::fs::File::create(file_path)?; + let mut file = std::fs::File::create(file_path.clone())?; // Field in turn is stream of *Bytes* object while let Some(chunk) = field.next().await { @@ -87,17 +87,20 @@ pub async fn upload_handler( let download_url = format!("{}/download/{}", store.get_server_url(), subject_path); let mut resource = atomic_lib::Resource::new_instance(urls::FILE, store)?; + let mime = guess_mime_for_filename(filename); resource.set_subject(new_subject); resource.set_propval_string(urls::PARENT.into(), &query.parent, store)?; resource.set_propval_string(urls::INTERNAL_ID.into(), &file_id, store)?; resource.set_propval(urls::FILESIZE.into(), Value::Integer(byte_count), store)?; - resource.set_propval_string( - urls::MIMETYPE.into(), - &guess_mime_for_filename(filename), - store, - )?; + resource.set_propval_string(urls::MIMETYPE.into(), &mime, store)?; resource.set_propval_string(urls::FILENAME.into(), filename, store)?; resource.set_propval_string(urls::DOWNLOAD_URL.into(), &download_url, store)?; + + // Extract data from files, turn into JSON-AD + for (prop, val) in atomizer::file::File::open(&file_path.to_string_lossy())?.to_propvals() { + resource.set_propval(prop, val, store)?; + } + commit_responses.push(resource.save(store)?); created_resources.push(resource); }