Merged
4 changes: 2 additions & 2 deletions .github/workflows/rust.yml
@@ -36,7 +36,7 @@ jobs:
target:
- aarch64-unknown-linux-gnu
- i686-unknown-linux-gnu
- mips64-unknown-linux-gnuabi64
- powerpc-unknown-linux-gnu
steps:
- uses: actions/checkout@v1
- uses: actions-rs/toolchain@v1
@@ -90,7 +90,7 @@ jobs:
- uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: 1.54.0
toolchain: 1.66.0
override: true
- run: rustup component add clippy
- uses: actions-rs/cargo@v1
1 change: 1 addition & 0 deletions .gitignore
@@ -1,3 +1,4 @@
.*
*.bk
target
/Cargo.lock
25 changes: 11 additions & 14 deletions Cargo.toml
@@ -1,46 +1,43 @@
[package]
name = "finalfusion"
version = "0.17.1"
edition = "2018"
authors = ["Daniël de Kok <[email protected]>", "Sebastian Pütz <[email protected]>"]
version = "0.18.0"
edition = "2021"
rust-version = "1.66"
description = "Reader and writer for common word embedding formats"
documentation = "https://docs.rs/finalfusion/"
keywords = ["embeddings", "word2vec", "glove", "finalfusion", "fasttext"]
homepage = "https://github.com/finalfusion/finalfusion-rust"
repository = "https://github.com/finalfusion/finalfusion-rust"
license = "MIT OR Apache-2.0"
readme = "README.md"
exclude = [
".gitignore",
".travis.yml"
]
exclude = [".gitignore"]

[dependencies]
byteorder = "1"
fnv = "1"
itertools = "0.10"
itertools = "0.11"
murmur3 = "0.5"
ndarray = "0.15"
ordered-float = "2"
ndarray = { version = "0.15", features = ["approx-0_5"] }
ordered-float = "4"
rand = "0.8"
rand_chacha = "0.3"
reductive = "0.9"
serde = { version = "1", features = ["derive"] }
smallvec = "1.7"
thiserror = "1"
toml = "0.5"
toml = "0.8"

[dependencies.memmap2]
version = "0.5"
version = "0.9"
optional = true

[features]
default = ["memmap"]
memmap = ["memmap2"]

[dev-dependencies]
approx = "0.4"
criterion = "0.3"
approx = "0.5"
criterion = "0.5"
lazy_static = "1"
maplit = "1"
tempfile = "3"
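Note on the Cargo.toml changes above: the crate moves to the 2021 edition with an explicit rust-version = "1.66" (matching the clippy toolchain pinned in CI), and the dependency bumps are mostly mechanical. One functional addition is pulling in ndarray with its approx-0_5 feature, which, together with the approx = "0.5" dev-dependency, implements the approx 0.5 comparison traits for ndarray types, so tests can compare whole arrays within a tolerance. A minimal sketch of what this enables; the arrays and epsilon are illustrative, not taken from the test suite:

```rust
use approx::assert_abs_diff_eq;
use ndarray::arr1;

fn main() {
    let a = arr1(&[1.0_f32, 2.0, 3.0]);
    let b = arr1(&[1.0_f32, 2.0, 3.0 + 1e-7]);
    // With ndarray's `approx-0_5` feature enabled, `AbsDiffEq` is
    // implemented for `ArrayBase`, so arrays compare element-wise
    // within the given tolerance.
    assert_abs_diff_eq!(a, b, epsilon = 1e-5);
}
```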
12 changes: 6 additions & 6 deletions benches/array.rs
@@ -25,18 +25,18 @@ fn allround_iter() -> impl Iterator<Item = String> + Clone {
corpus.into_iter()
}

fn known_iter<'a>(
embeds: &'a Embeddings<VocabWrap, StorageWrap>,
) -> impl 'a + Iterator<Item = String> + Clone {
fn known_iter(
embeds: &Embeddings<VocabWrap, StorageWrap>,
) -> impl '_ + Iterator<Item = String> + Clone {
allround_iter().filter_map(move |w| match embeds.vocab().idx(&w) {
Some(WordIndex::Word(_)) => Some(w),
_ => None,
})
}

fn unknown_iter<'a>(
embeds: &'a Embeddings<VocabWrap, StorageWrap>,
) -> impl 'a + Iterator<Item = String> + Clone {
fn unknown_iter(
embeds: &Embeddings<VocabWrap, StorageWrap>,
) -> impl '_ + Iterator<Item = String> + Clone {
allround_iter().filter_map(move |w| match embeds.vocab().idx(&w) {
Some(WordIndex::Subword(_)) => Some(w),
_ => None,
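The signature rewrite here (and the identical one in benches/quantized.rs below) replaces a named lifetime with the anonymous '_ lifetime: the single reference parameter already determines what the returned iterator borrows, so spelling out 'a added noise without adding information. A standalone sketch of the same idiom, with illustrative types:

```rust
// Before: the returned iterator's borrow is tied to `data` via `'a`.
fn evens_explicit<'a>(data: &'a [u32]) -> impl 'a + Iterator<Item = u32> {
    data.iter().copied().filter(|n| n % 2 == 0)
}

// After: `'_` infers the same bound from the lone reference parameter.
fn evens(data: &[u32]) -> impl '_ + Iterator<Item = u32> {
    data.iter().copied().filter(|n| n % 2 == 0)
}

fn main() {
    let v = vec![1, 2, 3, 4];
    assert_eq!(
        evens(&v).collect::<Vec<_>>(),
        evens_explicit(&v).collect::<Vec<_>>()
    );
}
```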
12 changes: 6 additions & 6 deletions benches/quantized.rs
@@ -25,18 +25,18 @@ fn allround_iter() -> impl Iterator<Item = String> + Clone {
corpus.into_iter()
}

fn known_iter<'a>(
embeds: &'a Embeddings<VocabWrap, StorageWrap>,
) -> impl 'a + Iterator<Item = String> + Clone {
fn known_iter(
embeds: &Embeddings<VocabWrap, StorageWrap>,
) -> impl '_ + Iterator<Item = String> + Clone {
allround_iter().filter_map(move |w| match embeds.vocab().idx(&w) {
Some(WordIndex::Word(_)) => Some(w),
_ => None,
})
}

fn unknown_iter<'a>(
embeds: &'a Embeddings<VocabWrap, StorageWrap>,
) -> impl 'a + Iterator<Item = String> + Clone {
fn unknown_iter(
embeds: &Embeddings<VocabWrap, StorageWrap>,
) -> impl '_ + Iterator<Item = String> + Clone {
allround_iter().filter_map(move |w| match embeds.vocab().idx(&w) {
Some(WordIndex::Subword(_)) => Some(w),
_ => None,
1 change: 0 additions & 1 deletion benches/subword.rs
@@ -13,7 +13,6 @@ fn subwords(string: &str, min_n: usize, max_n: usize, indexer: &impl Indexer) ->
// evaluates them.
string
.subword_indices(min_n, max_n, indexer)
.into_iter()
.fold(0, |sum, v| sum.wrapping_add(v))
}

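The deleted .into_iter() was a no-op: subword_indices already returns an iterator, and IntoIterator is implemented for every Iterator as the identity conversion, so the call only added noise (this is the pattern clippy's useless_conversion lint flags). A tiny illustration:

```rust
fn main() {
    let it = (0..4).map(|v| v * 2);
    // `into_iter()` on a value that is already an `Iterator` is the
    // identity conversion; the call can simply be dropped.
    let sum: u32 = it.into_iter().fold(0, |sum, v| sum.wrapping_add(v));
    assert_eq!(sum, 12);
}
```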
1 change: 0 additions & 1 deletion src/chunks/io.rs
@@ -1,4 +1,3 @@
use std::convert::TryFrom;
use std::fmt::{self, Display};
use std::fs::File;
use std::io::{BufReader, Read, Seek, Write};
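Dropping use std::convert::TryFrom; here (and the matching TryFrom/TryInto imports removed in the files below) follows directly from the edition bump: the Rust 2021 prelude exports TryFrom, TryInto, and FromIterator, making the explicit imports redundant. For example, this compiles on edition 2021 with no convert imports:

```rust
fn main() {
    // `TryFrom`/`TryInto` come from the 2021 prelude.
    let too_big = u8::try_from(300u32);
    assert!(too_big.is_err());

    let n: u8 = 42u32.try_into().unwrap();
    assert_eq!(n, 42);
}
```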
18 changes: 9 additions & 9 deletions src/chunks/metadata.rs
@@ -5,7 +5,7 @@ use std::mem;
use std::ops::{Deref, DerefMut};

use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use toml::Value;
use toml::Table;

use crate::chunks::io::{ChunkIdentifier, Header, ReadChunk, WriteChunk};
use crate::error::{Error, Result};
@@ -16,18 +16,18 @@ use crate::io::ReadMetadata;
/// finalfusion metadata in TOML format.
#[derive(Clone, Debug, PartialEq)]
pub struct Metadata {
inner: Value,
inner: Table,
}

impl Metadata {
/// Construct new `Metadata`.
pub fn new(data: Value) -> Self {
Metadata { inner: data }
pub fn new(inner: Table) -> Self {
Metadata { inner }
}
}

impl Deref for Metadata {
type Target = Value;
type Target = Table;

fn deref(&self) -> &Self::Target {
&self.inner
@@ -40,9 +40,9 @@ impl DerefMut for Metadata {
}
}

impl From<Value> for Metadata {
fn from(value: Value) -> Self {
Metadata { inner: value }
impl From<Table> for Metadata {
fn from(inner: Table) -> Self {
Metadata { inner }
}
}

@@ -69,7 +69,7 @@ impl ReadChunk for Metadata {

Ok(Metadata::new(
buf_str
.parse::<Value>()
.parse::<Table>()
.map_err(|e| Error::Format(format!("Cannot deserialize TOML metadata: {}", e)))
.map_err(Error::from)?,
))
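Switching Metadata from toml::Value to toml::Table matches the toml 0.8 API, where Table (a map from String keys to Values) is the natural type for a whole TOML document and implements both FromStr and Display. A rough sketch of the round trip the chunk performs, with made-up keys:

```rust
use toml::Table;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Parse a TOML document straight into a `Table`, as the
    // `ReadChunk` implementation above does for the metadata chunk.
    let table: Table = "corpus = \"wiki\"\ndims = 300".parse()?;
    assert_eq!(table["dims"].as_integer(), Some(300));

    // `Display` on `Table` serializes back to TOML text.
    assert!(table.to_string().contains("corpus"));
    Ok(())
}
```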
7 changes: 3 additions & 4 deletions src/chunks/norms.rs
@@ -1,6 +1,5 @@
//! Norms chunk

use std::convert::TryInto;
use std::io::{Read, Seek, SeekFrom, Write};
use std::mem;
use std::mem::size_of;
@@ -71,7 +70,7 @@ impl ReadChunk for NdNorms {
f32::ensure_data_type(read)?;

let n_padding =
padding::<f32>(read.seek(SeekFrom::Current(0)).map_err(|e| {
padding::<f32>(read.stream_position().map_err(|e| {
Error::read_error("Cannot get file position for computing padding", e)
})?);
read.seek(SeekFrom::Current(n_padding as i64))
@@ -109,12 +108,12 @@ impl WriteChunk for NdNorms {
write
.write_u32::<LittleEndian>(ChunkIdentifier::NdNorms as u32)
.map_err(|e| Error::write_error("Cannot write norms chunk identifier", e))?;
let n_padding = padding::<f32>(write.seek(SeekFrom::Current(0)).map_err(|e| {
let n_padding = padding::<f32>(write.stream_position().map_err(|e| {
Error::write_error("Cannot get file position for computing padding", e)
})?);

let remaining_chunk_len =
self.chunk_len(write.seek(SeekFrom::Current(0)).map_err(|e| {
self.chunk_len(write.stream_position().map_err(|e| {
Error::read_error("Cannot get file position for computing padding", e)
})?) - (size_of::<u32>() + size_of::<u64>()) as u64;

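The recurring substitution in this and the following files replaces seek(SeekFrom::Current(0)) with Seek::stream_position(), stable since Rust 1.51: same behavior, clearer intent, and it is what clippy's seek_from_current lint suggests. A small demonstration with an in-memory cursor:

```rust
use std::io::{Cursor, Read, Seek, SeekFrom};

fn main() -> std::io::Result<()> {
    let mut cursor = Cursor::new(vec![0u8; 16]);
    let mut buf = [0u8; 4];
    cursor.read_exact(&mut buf)?;

    // Two equivalent ways to query the current offset; the second
    // is the idiomatic one since Rust 1.51.
    let via_seek = cursor.seek(SeekFrom::Current(0))?;
    let via_position = cursor.stream_position()?;
    assert_eq!(via_seek, 4);
    assert_eq!(via_seek, via_position);
    Ok(())
}
```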
14 changes: 6 additions & 8 deletions src/chunks/storage/array.rs
@@ -1,4 +1,3 @@
use std::convert::TryInto;
use std::io::{Read, Seek, SeekFrom, Write};
use std::mem;
use std::mem::size_of;
@@ -13,7 +12,6 @@ use crate::util::padding;

#[cfg(feature = "memmap")]
mod mmap {
use std::convert::TryInto;
use std::fs::File;
#[cfg(target_endian = "little")]
use std::io::Write;
@@ -134,15 +132,15 @@ mod mmap {
// The components of the embedding matrix should be of type f32.
f32::ensure_data_type(read)?;

let n_padding = padding::<f32>(read.seek(SeekFrom::Current(0)).map_err(|e| {
let n_padding = padding::<f32>(read.stream_position().map_err(|e| {
Error::read_error("Cannot get file position for computing padding", e)
})?);
read.seek(SeekFrom::Current(n_padding as i64))
.map_err(|e| Error::read_error("Cannot skip padding", e))?;

// Set up memory mapping.
let matrix_len = shape.size() * size_of::<f32>();
let offset = read.seek(SeekFrom::Current(0)).map_err(|e| {
let offset = read.stream_position().map_err(|e| {
Error::read_error(
"Cannot get file position for memory mapping embedding matrix",
e,
@@ -153,7 +151,7 @@ mod mmap {
mmap_opts
.offset(offset)
.len(matrix_len)
.map(&*read.get_ref())
.map(read.get_ref())
.map_err(|e| Error::read_error("Cannot memory map embedding matrix", e))?
};

@@ -218,13 +216,13 @@ impl NdArray {
write
.write_u32::<LittleEndian>(ChunkIdentifier::NdArray as u32)
.map_err(|e| Error::write_error("Cannot write embedding matrix chunk identifier", e))?;
let n_padding = padding::<f32>(write.seek(SeekFrom::Current(0)).map_err(|e| {
let n_padding = padding::<f32>(write.stream_position().map_err(|e| {
Error::write_error("Cannot get file position for computing padding", e)
})?);

let remaining_chunk_len = Self::chunk_len(
data.view(),
write.seek(SeekFrom::Current(0)).map_err(|e| {
write.stream_position().map_err(|e| {
Error::read_error("Cannot get file position for computing padding", e)
})?,
) - (size_of::<u32>() + size_of::<u64>()) as u64;
@@ -346,7 +344,7 @@ impl ReadChunk for NdArray {
f32::ensure_data_type(read)?;

let n_padding =
padding::<f32>(read.seek(SeekFrom::Current(0)).map_err(|e| {
padding::<f32>(read.stream_position().map_err(|e| {
Error::read_error("Cannot get file position for computing padding", e)
})?);
read.seek(SeekFrom::Current(n_padding as i64))
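Besides the stream_position() cleanup, this file drops a redundant reborrow: BufReader::get_ref already yields &File, which MmapOptions::map accepts directly, so &*read.get_ref() simplifies to read.get_ref(). A condensed sketch of the mapping pattern used here; the file name and sizes are illustrative:

```rust
use std::fs::File;
use std::io::{BufReader, Seek, Write};

use memmap2::MmapOptions;

fn main() -> std::io::Result<()> {
    // Illustrative setup: write a small "matrix" to a scratch file.
    let path = std::env::temp_dir().join("mmap-sketch.bin");
    File::create(&path)?.write_all(&[1u8, 2, 3, 4, 5, 6, 7, 8])?;

    let mut read = BufReader::new(File::open(&path)?);
    let offset = read.stream_position()?; // 0 here; a chunk offset in finalfusion
    let matrix_len = 8;

    // memmap2: `map` takes `&File` directly, hence `read.get_ref()`.
    let mmap = unsafe {
        MmapOptions::new()
            .offset(offset)
            .len(matrix_len)
            .map(read.get_ref())?
    };
    assert_eq!(&mmap[..4], &[1, 2, 3, 4]);
    Ok(())
}
```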
11 changes: 5 additions & 6 deletions src/chunks/storage/quantized.rs
@@ -1,4 +1,3 @@
use std::convert::TryInto;
use std::io::{Read, Seek, SeekFrom, Write};
use std::mem;
use std::mem::size_of;
@@ -117,7 +116,7 @@ impl QuantizedArray {
f32::ensure_data_type(read)?;

let n_padding =
padding::<f32>(read.seek(SeekFrom::Current(0)).map_err(|e| {
padding::<f32>(read.stream_position().map_err(|e| {
Error::read_error("Cannot get file position for computing padding", e)
})?);
read.seek(SeekFrom::Current(n_padding as i64))
@@ -171,12 +170,12 @@ impl QuantizedArray {
quantizer,
quantized.view(),
norms,
write.seek(SeekFrom::Current(0)).map_err(|e| {
write.stream_position().map_err(|e| {
Error::read_error("Cannot get file position for computing padding", e)
})?,
) - (size_of::<u32>() + size_of::<u64>()) as u64;

let n_padding = padding::<f32>(write.seek(SeekFrom::Current(0)).map_err(|e| {
let n_padding = padding::<f32>(write.stream_position().map_err(|e| {
Error::write_error("Cannot get file position for computing padding", e)
})?);

@@ -562,7 +561,7 @@ mod mmap {
n_embeddings: usize,
quantized_len: usize,
) -> Result<Mmap> {
let offset = read.seek(SeekFrom::Current(0)).map_err(|e| {
let offset = read.stream_position().map_err(|e| {
Error::read_error(
"Cannot get file position for memory mapping embedding matrix",
e,
@@ -574,7 +573,7 @@
mmap_opts
.offset(offset)
.len(matrix_len)
.map(&*read.get_ref())
.map(read.get_ref())
.map_err(|e| {
Error::read_error("Cannot memory map quantized embedding matrix", e)
})?
9 changes: 4 additions & 5 deletions src/chunks/storage/wrappers.rs
@@ -1,4 +1,3 @@
use std::convert::TryFrom;
#[cfg(feature = "memmap")]
use std::fs::File;
#[cfg(feature = "memmap")]
@@ -126,7 +125,7 @@ impl ReadChunk for StorageWrap {
R: Read + Seek,
{
let chunk_start_pos = read
.seek(SeekFrom::Current(0))
.stream_position()
.map_err(|e| Error::read_error("Cannot get storage chunk start position", e))?;

let chunk_id = read
@@ -156,7 +155,7 @@ impl ReadChunk for StorageWrap {
impl MmapChunk for StorageWrap {
fn mmap_chunk(read: &mut BufReader<File>) -> Result<Self> {
let chunk_start_pos = read
.seek(SeekFrom::Current(0))
.stream_position()
.map_err(|e| Error::read_error("Cannot get storage chunk start position", e))?;

let chunk_id = read
@@ -306,7 +305,7 @@ impl ReadChunk for StorageViewWrap {
R: Read + Seek,
{
let chunk_start_pos = read
.seek(SeekFrom::Current(0))
.stream_position()
.map_err(|e| Error::read_error("Cannot get storage chunk start position", e))?;

let chunk_id = read
@@ -361,7 +360,7 @@ impl WriteChunk for StorageViewWrap {
impl MmapChunk for StorageViewWrap {
fn mmap_chunk(read: &mut BufReader<File>) -> Result<Self> {
let chunk_start_pos = read
.seek(SeekFrom::Current(0))
.stream_position()
.map_err(|e| Error::read_error("Cannot get storage chunk start position", e))?;

let chunk_id = read
5 changes: 2 additions & 3 deletions src/chunks/vocab/simple.rs
@@ -1,6 +1,5 @@
use std::collections::HashMap;
use std::convert::TryInto;
use std::io::{Read, Seek, SeekFrom, Write};
use std::io::{Read, Seek, Write};
use std::mem::size_of;

use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
@@ -103,7 +102,7 @@ impl WriteChunk for SimpleVocab {
.map_err(|e| Error::write_error("Cannot write vocabulary chunk identifier", e))?;

let remaining_chunk_len =
self.chunk_len(write.seek(SeekFrom::Current(0)).map_err(|e| {
self.chunk_len(write.stream_position().map_err(|e| {
Error::read_error("Cannot get file position for computing padding", e)
})?) - (size_of::<u32>() + size_of::<u64>()) as u64;
