11 changes: 11 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

17 changes: 13 additions & 4 deletions README.md
@@ -4,6 +4,7 @@ Cydonia is a library based on [candle][candle] for developing modern AI applications

```rust
use cydonia::Model;

fn main() {
    let model = Model::new("gemma2").tag("latest");
    let response = model.invoke("Hello, world!");
@@ -13,12 +14,20 @@ fn main() {

We currently support only quantized models derived from the `gemma` and `llama` families.

## Special Thanks
## TODOs

- [candle][candle]
- [ollama][ollama]
- [x] Support chat interface (history prompts)
- [ ] Function encoder for llama3 tools (static)
- [ ] Cydonia as service
- [ ] RPC support for llama3 tools (remote)
- [ ] GraphQL support for llama3 tools (remote)
- [ ] RAG support
- [ ] Agent interface
- [ ] Multi-agent support (single-node)
- [ ] An application based on the tools
- [ ] p2p for the decentralized cydonia network (multi-node)
- [ ] Test GPU support

<!-- links -->

[candle]: https://github.com/huggingface/candle
[ollama]: https://github.com/ollama/ollama
12 changes: 12 additions & 0 deletions crates/candle/Cargo.toml
@@ -15,3 +15,15 @@ rand.workspace = true
serde.workspace = true
tokenizers.workspace = true
tracing.workspace = true

[features]
accelerate = [
"candle-core/accelerate",
"candle-nn/accelerate",
"candle-transformers/accelerate",
]

[target.aarch64-apple-darwin.dependencies]
candle-core = { workspace = true, features = ["accelerate"] }
candle-nn = { workspace = true, features = ["accelerate"] }
candle-transformers = { workspace = true, features = ["accelerate"] }
38 changes: 38 additions & 0 deletions crates/candle/examples/llama.rs
@@ -0,0 +1,38 @@
use ccore::{Message, Release};
use cydonia_candle::{Llama, ProcessorConfig};
use std::io::Write;

fn main() {
    let mut model = Llama::new(ProcessorConfig::default(), Release::default()).unwrap();
    let mut init = true;
    loop {
        print!("> ");
        std::io::stdout().flush().unwrap();

        // Read input
        let mut input = String::new();
        std::io::stdin().read_line(&mut input).unwrap();
        if input.ends_with('\n') {
            input.pop();
            if input.ends_with('\r') {
                input.pop();
            }
        }

        // Generate response
        let mut response = String::new();
        let message = Message::user(input);
        let stream = model
            .complete(&[message], init)
            .expect("failed to generate response");
        for token in stream {
            response.push_str(&token);

            print!("{}", token);
            std::io::stdout().flush().unwrap();
        }
        println!();

        init = false;
    }
}
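Assuming the package name matches the `cydonia_candle` import (e.g. `cydonia-candle`), this interactive loop should be runnable with something like `cargo run --example llama` from the crate directory.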
28 changes: 24 additions & 4 deletions crates/candle/src/inference.rs
@@ -1,15 +1,33 @@
//! Cydonia inference interface
use std::fs::File;

use anyhow::Result;
use candle_core::{quantized::gguf_file::Content, Device, Tensor};
use candle_transformers::models::quantized_llama;
use ccore::{chat, Message};
use std::fs::File;

/// The inference interface for language models
pub trait Inference: Sized {
    /// The max sequence length
    const MAX_SEQ_LEN: usize;

    /// The formatter for the model
    type Formatter: chat::Formatter;

    /// The end of stream token
    fn eos_token() -> &'static str {
        <Self::Formatter as chat::Formatter>::EOS_TOKEN
    }

    /// Format the messages into a prompt
    fn prompt(messages: &[Message]) -> Result<String> {
        <Self::Formatter as chat::Formatter>::format(messages)
    }

    /// Complete the messages
    fn complete(messages: &[Message]) -> Result<String> {
        <Self::Formatter as chat::Formatter>::complete(messages)
    }

    /// Load model from gguf file
    fn gguf(device: &Device, file: &mut File) -> Result<Self>;

@@ -20,14 +38,16 @@ pub trait Inference: Sized {
impl Inference for quantized_llama::ModelWeights {
    const MAX_SEQ_LEN: usize = quantized_llama::MAX_SEQ_LEN;

    type Formatter = chat::Llama3;

    fn gguf(device: &Device, file: &mut File) -> Result<Self> {
        let content = Content::read(file)?;
        let model = Self::from_gguf(content, file, device)?;
        Ok(model)
    }

    fn forward(&mut self, input: &Tensor, squeeze: usize) -> Result<Tensor> {
        quantized_llama::ModelWeights::forward(self, input, squeeze)
    fn forward(&mut self, input: &Tensor, pos: usize) -> Result<Tensor> {
        quantized_llama::ModelWeights::forward(self, input, pos)
            .map_err(|e| anyhow::anyhow!("failed to forward: {e}"))
    }
}
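As a quick illustration of the chat-formatting hooks the trait now exposes, here is a minimal sketch that renders a one-turn conversation through the `Llama3` formatter selected by the `ModelWeights` impl above. The `cydonia_candle`/`ccore` imports and the exact shape of `Message::user` are assumptions carried over from `examples/llama.rs`.

```rust
use candle_transformers::models::quantized_llama::ModelWeights;
use ccore::Message;
use cydonia_candle::Inference;

fn show_prompt() -> anyhow::Result<()> {
    // One user turn; `Message::user` mirrors its use in examples/llama.rs.
    let messages = [Message::user("Hello, world!".to_string())];

    // `prompt` delegates to the Llama 3 chat template via the associated
    // `Formatter`; `eos_token` exposes the template's stop token.
    let prompt = <ModelWeights as Inference>::prompt(&messages)?;
    println!("{prompt}");
    println!("eos = {}", <ModelWeights as Inference>::eos_token());
    Ok(())
}
```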
4 changes: 2 additions & 2 deletions crates/candle/src/lib.rs
@@ -5,15 +5,15 @@ mod inference;
mod loader;
mod model;
mod processor;
mod stream;
mod token;

pub use {
    device::detect as device,
    inference::Inference,
    loader::Loader,
    model::Model,
    processor::{Processor, ProcessorConfig, SampleBuilder},
    stream::TokenStream,
    token::{TokenStream, Tokenizer},
};

/// The Llama model
21 changes: 10 additions & 11 deletions crates/candle/src/loader.rs
@@ -1,12 +1,11 @@
//! Model loader

use crate::{Inference, TokenStream};
use crate::{Inference, Tokenizer};
use anyhow::Result;
use candle_core::Device;
use ccore::{Manifest, TOKENIZER};
use ccore::{Release, TOKENIZER};
use hf_hub::api::sync::Api;
use std::fs::File;
use tokenizers::Tokenizer;

/// Huggingface model loader
///
@@ -16,30 +15,30 @@ pub struct Loader {
    api: Api,

    /// The manifest of the model
    manifest: Manifest,
    release: Release,
}

impl Loader {
    /// Load the model
    pub fn new(manifest: Manifest) -> Result<Self> {
    pub fn new(release: Release) -> Result<Self> {
        Ok(Self {
            manifest,
            release,
            api: Api::new()?,
        })
    }

    /// Load the tokenizer
    pub fn tokenizer(&self) -> Result<TokenStream> {
    pub fn tokenizer<I: Inference>(&self) -> Result<Tokenizer> {
        let trepo = self.api.model(TOKENIZER.into());
        let tokenizer = Tokenizer::from_file(trepo.get(self.manifest.release.tokenizer())?)
        let tokenizer = tokenizers::Tokenizer::from_file(trepo.get(self.release.tokenizer())?)
            .map_err(|e| anyhow::anyhow!("failed to load tokenizer: {e}"))?;
        Ok(TokenStream::new(tokenizer))
        Tokenizer::new::<I>(tokenizer)
    }

    /// Load the model
    pub fn model<M: Inference>(&self, device: &Device) -> Result<M> {
        let mrepo = self.api.model(self.manifest.release.repo()?.into());
        let model = mrepo.get(&self.manifest.release.model(self.manifest.quantization))?;
        let mrepo = self.api.model(self.release.repo().into());
        let model = mrepo.get(&self.release.model())?;
        let mut file = File::open(model)?;
        let model = M::gguf(device, &mut file)?;
        Ok(model)
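For orientation, a minimal sketch of driving the updated `Loader` by hand, assuming `Release::default()` points at a valid quantized Llama release as in `examples/llama.rs` (normally `Model::new` does all of this for you):

```rust
use candle_core::Device;
use candle_transformers::models::quantized_llama::ModelWeights;
use ccore::Release;
use cydonia_candle::Loader;

fn load() -> anyhow::Result<()> {
    // Fetch the tokenizer and the quantized GGUF weights from the Hugging Face Hub.
    let loader = Loader::new(Release::default())?;
    let tokenizer = loader.tokenizer::<ModelWeights>()?;

    // Load the weights onto the CPU; GPU selection normally goes through
    // the processor's detected device instead.
    let weights: ModelWeights = loader.model(&Device::Cpu)?;

    let _ = (tokenizer, weights);
    Ok(())
}
```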
75 changes: 19 additions & 56 deletions crates/candle/src/model.rs
@@ -1,14 +1,13 @@
//! Model interface

use crate::{Inference, Loader, Processor, ProcessorConfig, TokenStream};
use crate::{Inference, Loader, Processor, ProcessorConfig, TokenStream, Tokenizer};
use anyhow::Result;
use ccore::{Manifest, Message};
use std::io::Write;
use ccore::{Message, Release};

/// Language Model interface
pub struct Model<I: Inference> {
    /// The tokenizer of the model
    tokenizer: TokenStream,
    tokenizer: Tokenizer,

    /// The weights of the model
    weights: I,
@@ -19,10 +18,10 @@ pub struct Model<I: Inference> {

impl<I: Inference> Model<I> {
    /// Create a new model
    pub fn new(config: ProcessorConfig, manifest: Manifest) -> Result<Self> {
        let loader = Loader::new(manifest)?;
        let tokenizer = loader.tokenizer()?;
    pub fn new(config: ProcessorConfig, release: Release) -> Result<Self> {
        let processor = config.build();
        let loader = Loader::new(release)?;
        let tokenizer = loader.tokenizer::<I>()?;
        let weights = loader.model::<I>(&processor.device)?;

        Ok(Self {
@@ -33,54 +32,18 @@ impl<I: Inference> Model<I> {
    }

    /// Complete the chat
    pub fn complete(&mut self, messages: &mut [Message]) -> Result<String> {
        let message = messages
            .first()
            .ok_or_else(|| anyhow::anyhow!("no messages"))?;

        let to_sample = self.processor.sample_len.saturating_sub(1);
        let prompt_tokens = self
            .tokenizer
            .prompt(&message.content)?
            .sample_len(to_sample)
            .max_seq_len::<I>()
            .encode()?;

        // process the prompt tokens
        let mut next_token = self
            .processor
            .sample_tokens(&prompt_tokens)
            .sample(&mut self.weights)?;

        // process the tokens
        let mut all_tokens = vec![next_token];
        let eos_token = self
            .tokenizer
            .token("</s>")
            .ok_or_else(|| anyhow::anyhow!("eos token not found"))?;

        let response = String::new();
        let pos = prompt_tokens.len();
        for index in 0..to_sample {
            next_token = self
                .processor
                .sample_tokens(&[next_token])
                .all_tokens(&all_tokens)
                .pos(pos + index)
                .sample(&mut self.weights)?;

            all_tokens.push(next_token);
            if let Some(t) = self.tokenizer.next_token(next_token)? {
                print!("{t}");
                std::io::stdout().flush()?;
            }

            if next_token == eos_token {
                break;
            }
        }

        println!();
        Ok(response)
    pub fn complete<'ts>(
        &'ts mut self,
        messages: &[Message],
        init: bool,
    ) -> Result<TokenStream<'ts, I>> {
        let formatted = if init {
            I::prompt(messages)?
        } else {
            I::complete(messages)?
        };

        self.tokenizer
            .stream(&mut self.weights, &mut self.processor, formatted)
    }
}
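To make the new `init` flag concrete, here is a hedged two-turn sketch: the first call renders a fresh prompt via `Inference::prompt`, later calls continue the conversation via `Inference::complete`. It assumes, as in `examples/llama.rs`, that follow-up turns only pass the newest message and that the returned `TokenStream` yields printable tokens.

```rust
use ccore::{Message, Release};
use cydonia_candle::{Llama, ProcessorConfig};

fn chat() -> anyhow::Result<()> {
    let mut model = Llama::new(ProcessorConfig::default(), Release::default())?;
    let mut reply = String::new();

    // First turn: `init = true` formats the messages as a fresh prompt.
    for token in model.complete(&[Message::user("Hi there!".to_string())], true)? {
        reply.push_str(&token);
    }

    // Follow-up turn: `init = false` appends to the running conversation.
    for token in model.complete(&[Message::user("And then?".to_string())], false)? {
        reply.push_str(&token);
    }

    println!("{reply}");
    Ok(())
}
```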
6 changes: 4 additions & 2 deletions crates/candle/src/processor/config.rs
@@ -102,6 +102,8 @@ impl ProcessorConfig {
    }

    /// Set the sample length
    ///
    /// TODO: is there a way to embed the sample length in the system prompt?
    pub fn sample_len(mut self, sample_len: usize) -> Self {
        self.sample_len = sample_len;
        self
@@ -112,11 +114,11 @@ impl Default for ProcessorConfig {
    fn default() -> Self {
        Self {
            gpu: false,
            seed: Some(1_024_243_212),
            seed: None,
            temperature: Some(0.6),
            top_p: Some(0.9),
            top_k: Some(50),
            sample_len: 256,
            sample_len: 1024,
            repeat_penalty: 1.0,
            repeat_last_n: 64,
        }
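A small sketch of the builder in use, combining the new defaults with an explicit `sample_len` (only `sample_len` is shown in this diff; the `Llama`/`Release` setup is carried over from `examples/llama.rs`):

```rust
use ccore::Release;
use cydonia_candle::{Llama, ProcessorConfig};

fn build() -> anyhow::Result<()> {
    // Start from the defaults (temperature 0.6, top_p 0.9, top_k 50, unset seed)
    // and allow longer completions than the 1024-token default.
    let config = ProcessorConfig::default().sample_len(2048);
    let _model = Llama::new(config, Release::default())?;
    Ok(())
}
```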
4 changes: 2 additions & 2 deletions crates/candle/src/processor/mod.rs
@@ -40,8 +40,8 @@ impl Processor {
    }

    /// Sample tokens
    pub fn sample_tokens<'s>(&'s mut self, tokens: &'s [u32]) -> SampleBuilder<'s> {
        SampleBuilder::new(self, tokens)
    pub fn sample_token(&mut self, token: u32) -> SampleBuilder<'_> {
        SampleBuilder::new(self, token)
    }

    /// Apply repeat penalty