Skip to content

Commit cf69690

Browse files
Merge branch 'release/1.0' into cherry-pick-14194-by-pytorch_bot_bot_
2 parents ec2d067 + b0294e2 commit cf69690

File tree

6 files changed

+177
-105
lines changed

6 files changed

+177
-105
lines changed

examples/models/llava/main.cpp

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -75,24 +75,20 @@ void load_image(const std::string& image_path, Image& image) {
7575
new_height,
7676
0,
7777
channels);
78-
// transpose to CHW
79-
image.data.resize(channels * new_width * new_height);
78+
std::vector<uint8_t> chw_data(channels * new_width * new_height);
8079
for (int i = 0; i < new_width * new_height; ++i) {
8180
for (int c = 0; c < channels; ++c) {
82-
image.data[c * new_width * new_height + i] =
83-
resized_data[i * channels + c];
81+
chw_data[c * new_width * new_height + i] = resized_data[i * channels + c];
8482
}
8583
}
86-
image.width = new_width;
87-
image.height = new_height;
88-
image.channels = channels;
84+
image = Image(std::move(chw_data), new_width, new_height, channels);
8985
// convert to tensor
9086
ET_LOG(
9187
Info,
9288
"image Channels: %" PRId32 ", Height: %" PRId32 ", Width: %" PRId32,
93-
image.channels,
94-
image.height,
95-
image.width);
89+
image.channels(),
90+
image.height(),
91+
image.width());
9692
stbi_image_free(data);
9793
}
9894

examples/models/llava/runner/llava_image_prefiller.h

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@
1616

1717
namespace example {
1818

19-
using executorch::extension::llm::kImageEncoderMethod;
2019
using executorch::extension::llm::kTextModelMethod;
20+
using executorch::extension::llm::kVisionEncoderMethod;
2121

2222
class ET_EXPERIMENTAL LlavaImagePrefiller {
2323
public:
@@ -34,12 +34,12 @@ class ET_EXPERIMENTAL LlavaImagePrefiller {
3434
::executorch::extension::llm::Image& image,
3535
int64_t& start_pos) {
3636
auto image_tensor = executorch::extension::from_blob(
37-
image.data.data(),
38-
{3, image.height, image.width},
37+
image.get_uint8_data().data(),
38+
{3, image.height(), image.width()},
3939
::executorch::aten::ScalarType::Byte);
4040
// Run image encoder
4141
auto image_encoder_outputs =
42-
ET_UNWRAP(module_->execute(kImageEncoderMethod, image_tensor));
42+
ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor));
4343

4444
// inputs:[start_pos, embeds]
4545
auto start_pos_tensor = executorch::extension::from_blob(
@@ -67,7 +67,7 @@ class ET_EXPERIMENTAL LlavaImagePrefiller {
6767
if (is_method_loaded()) {
6868
return ::executorch::runtime::Error::Ok;
6969
}
70-
ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kImageEncoderMethod));
70+
ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kVisionEncoderMethod));
7171
ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTextModelMethod));
7272
return ::executorch::runtime::Error::Ok;
7373
}
@@ -83,7 +83,7 @@ class ET_EXPERIMENTAL LlavaImagePrefiller {
8383
ET_CHECK_MSG(false, "Failed to get method names");
8484
}
8585
std::unordered_set<std::string> methods = methods_res.get();
86-
bool methods_exist = methods.find(kImageEncoderMethod) != methods.end() &&
86+
bool methods_exist = methods.find(kVisionEncoderMethod) != methods.end() &&
8787
methods.find(kTextModelMethod) != methods.end();
8888
if (!methods_exist) {
8989
for (const auto& method : methods) {
@@ -92,10 +92,10 @@ class ET_EXPERIMENTAL LlavaImagePrefiller {
9292
ET_CHECK_MSG(
9393
methods_exist,
9494
"Missing required methods (%s, %s) in the model",
95-
kImageEncoderMethod,
95+
kVisionEncoderMethod,
9696
kTextModelMethod);
9797
}
98-
bool methods_loaded = module_->is_method_loaded(kImageEncoderMethod) &&
98+
bool methods_loaded = module_->is_method_loaded(kVisionEncoderMethod) &&
9999
module_->is_method_loaded(kTextModelMethod);
100100
return methods_loaded;
101101
}

extension/android/jni/jni_layer_llama.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
268268
for (int i = 0; i < image_size; i++) {
269269
image_data[i] = image_data_jint[i];
270270
}
271-
llm::Image image_runner{image_data, width, height, channels};
271+
llm::Image image_runner{std::move(image_data), width, height, channels};
272272
prefill_inputs_.emplace_back(
273273
llm::MultimodalInput{std::move(image_runner)});
274274
}

extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -172,12 +172,12 @@ - (BOOL)generate:(NSArray<ExecuTorchLLMMultimodalInput *> *)inputs
172172
case ExecuTorchLLMMultimodalInputTypeImage: {
173173
ExecuTorchLLMImage *image = input.image;
174174
std::vector<uint8_t> data((uint8_t *)image.data.bytes, (uint8_t *)image.data.bytes + image.data.length);
175-
nativeInputs.emplace_back(llm::MultimodalInput(llm::Image{
176-
.data = std::move(data),
177-
.width = (int32_t)image.width,
178-
.height = (int32_t)image.height,
179-
.channels = (int32_t)image.channels
180-
}));
175+
nativeInputs.emplace_back(llm::MultimodalInput(llm::Image(
176+
std::move(data),
177+
(int32_t)image.width,
178+
(int32_t)image.height,
179+
(int32_t)image.channels
180+
)));
181181
break;
182182
}
183183
default: {

extension/llm/runner/image.h

Lines changed: 98 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,19 +10,112 @@
1010

1111
#pragma once
1212
#include <executorch/runtime/platform/compiler.h>
13+
#include <cstddef>
1314
#include <cstdint>
15+
#include <variant>
1416
#include <vector>
1517

18+
#include <executorch/extension/tensor/tensor.h>
19+
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
20+
1621
namespace executorch {
1722
namespace extension {
1823
namespace llm {
1924

20-
struct ET_EXPERIMENTAL Image {
25+
// An image buffer plus its dimensions. The pixel storage is either a
// std::vector<uint8_t> or a std::vector<float>; exactly one is held at a time.
// No constructor checks that the buffer length matches
// width * height * channels.
class ET_EXPERIMENTAL Image {
26+
public:
27+
// Default constructor: all dimensions are 0. data_ is default-initialized,
// so it holds the first variant alternative, an empty std::vector<uint8_t>.
28+
Image() : width_(0), height_(0), channels_(0) {}
29+
30+
// Constructor for uint8_t data. Takes ownership of `data` by moving it into
// the image; the dimensions are stored as given.
31+
Image(
32+
std::vector<uint8_t>&& data,
33+
int32_t width,
34+
int32_t height,
35+
int32_t channels)
36+
: data_(std::move(data)),
37+
width_(width),
38+
height_(height),
39+
channels_(channels) {}
40+
41+
// Constructor for float data. Takes ownership of `data` by moving it into
// the image; the dimensions are stored as given.
42+
Image(
43+
std::vector<float>&& data,
44+
int32_t width,
45+
int32_t height,
46+
int32_t channels)
47+
: data_(std::move(data)),
48+
width_(width),
49+
height_(height),
50+
channels_(channels) {}
51+
52+
// Getters for the stored dimensions.
53+
int32_t width() const {
54+
return width_;
55+
}
56+
int32_t height() const {
57+
return height_;
58+
}
59+
int32_t channels() const {
60+
return channels_;
61+
}
62+
63+
// Data access. is_uint8() / is_float() report which element type the image
// currently stores; call them before the matching get_*_data() accessor.
64+
bool is_uint8() const {
65+
return std::holds_alternative<std::vector<uint8_t>>(data_);
66+
}
67+
68+
bool is_float() const {
69+
return std::holds_alternative<std::vector<float>>(data_);
70+
}
71+
72+
// Returns the uint8_t buffer. Uses std::get, which fails with
// std::bad_variant_access when the image holds float data instead.
const std::vector<uint8_t>& get_uint8_data() const& {
73+
return std::get<std::vector<uint8_t>>(data_);
74+
}
75+
76+
// Mutable overload of the accessor above; same precondition.
std::vector<uint8_t>& get_uint8_data() & {
77+
return std::get<std::vector<uint8_t>>(data_);
78+
}
79+
80+
// Returns the float buffer. Uses std::get, which fails with
// std::bad_variant_access when the image holds uint8_t data instead.
const std::vector<float>& get_float_data() const& {
81+
return std::get<std::vector<float>>(data_);
82+
}
83+
84+
// Mutable overload of the accessor above; same precondition.
std::vector<float>& get_float_data() & {
85+
return std::get<std::vector<float>>(data_);
86+
}
87+
88+
// Wraps the stored buffer in a tensor of the matching scalar type
// (Float for float data, Byte for uint8_t data).
// NOTE(review): from_blob receives a pointer into this Image's vector, so the
// tensor presumably aliases that storage rather than copying it; if so, the
// Image must outlive the returned tensor. Confirm against from_blob's
// documentation.
executorch::runtime::Result<executorch::extension::TensorPtr> toTensor(
89+
bool with_batch = false) const {
90+
// By default the sizes are 3D: {channels, height, width} (CHW). When
91+
// with_batch is true a leading dimension of 1 is prepended (1, C, H, W).
92+
std::vector<executorch::aten::SizesType> sizes = {
93+
channels(), height(), width()};
94+
if (with_batch) {
95+
sizes.insert(sizes.begin(), 1);
96+
}
97+
if (is_float()) {
98+
return executorch::extension::from_blob(
99+
// const_cast: this method is const, but a non-const pointer is passed.
const_cast<float*>(get_float_data().data()),
100+
sizes,
101+
::executorch::aten::ScalarType::Float);
102+
} else if (is_uint8()) {
103+
return executorch::extension::from_blob(
104+
// const_cast: this method is const, but a non-const pointer is passed.
const_cast<uint8_t*>(get_uint8_data().data()),
105+
sizes,
106+
::executorch::aten::ScalarType::Byte);
107+
}
108+
// Only reached if data_ holds neither alternative.
ET_LOG(
109+
Error, "Image data is not initialized with uint8_t or float vector.");
110+
return ::executorch::runtime::Error::NotSupported;
111+
}
112+
113+
private:
21114
// Pixel storage. toTensor() reports it with sizes {channels, height, width},
// so the buffer is assumed to be laid out as CHW (no batch dimension).
22-
std::vector<uint8_t> data;
23-
int32_t width;
24-
int32_t height;
25-
int32_t channels;
115+
std::variant<std::vector<uint8_t>, std::vector<float>> data_;
116+
int32_t width_;
117+
int32_t height_;
118+
int32_t channels_;
26119
};
27120

28121
} // namespace llm

0 commit comments

Comments
 (0)