-
Notifications
You must be signed in to change notification settings - Fork 111
Qwen 3 vl #359
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Conversation
… main (adding bfactory fork library)
examples/qwen_3_vl/main.zig
Outdated
| }; | ||
|
|
||
| // Initialize ZML platform | ||
| const create_opts_json = cli.args.@"create-options" orelse "{\"cpu\": {\"device_count\": 1}}"; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
dude, you broke github
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(this was a joke)
but I don't think that device count 1 should be the default.
Also json is not really nice to type on CLI, it's better if you can switch to zon.
zml/exe.zig
Outdated
| // ev.await(self.platform.pjrt_api) catch unreachable; | ||
| // } | ||
| // } | ||
| for (events[0..sharding.num_partitions]) |e| { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We prefer to keep calls asynchronous; you may need to add a few await() calls in your main to make it work the way you want.
| std.ascii.toLower(ext[2]) == 'p'); | ||
|
|
||
| var height: u32 = undefined; | ||
| var width: u32 = undefined; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
avoid this pattern.
you can do:
const image: RgbImage = if (is_bmp) img: {
...
break :img img;
} else img: {
...
break :img img;
};
examples/qwen_3_vl/main.zig
Outdated
| } | ||
| } | ||
| pub fn preprocessor(allocator: std.mem.Allocator, tokenizer: zml.tokenizer.Tokenizer, prompt: []const u8, config: qwen.Qwen.Config, preprocessor_config: PreprocessorConfig, image_path: []const u8, max_seq_len: u32, max_side: u32) !struct { image_buffer_chw: zml.HostBuffer, prompt_tokens: zml.HostBuffer, prompt_shape: zml.HostBuffer, image_dim: zml.HostBuffer, token_index: zml.HostBuffer, h_resized: u32, w_resized: u32 } { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you give a name to this struct ? eg Input ? and put the deinit inside
examples/qwen_3_vl/qwen_3_vl.zig
Outdated
| const merge_size = self.qwen.config.vision_config.spatial_merge_size; | ||
|
|
||
| // Reshape the image to the correct shape for the vision transformer | ||
| image_chw_rescaled_normalized = image_chw_rescaled_normalized.reshape(.{ .temporal_patch_size = temporal_patch_size, .c = 3, .h_div = @divExact(grid_h, merge_size), .m1 = merge_size, .patch1 = patch_size, .w_div = @divExact(grid_w, merge_size), .m2 = merge_size, .patch2 = patch_size }); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
in general reshape( divisions ) is better served by a .splitAxis: no room for errors
examples/qwen_3_vl/qwen_3_vl.zig
Outdated
|
|
||
| // Reshape the image to the correct shape for the vision transformer | ||
| image_chw_rescaled_normalized = image_chw_rescaled_normalized.reshape(.{ .temporal_patch_size = temporal_patch_size, .c = 3, .h_div = @divExact(grid_h, merge_size), .m1 = merge_size, .patch1 = patch_size, .w_div = @divExact(grid_w, merge_size), .m2 = merge_size, .patch2 = patch_size }); | ||
| image_chw_rescaled_normalized = image_chw_rescaled_normalized.transpose(.{ 2, 5, 3, 6, 1, 0, 4, 7 }); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
please use the names rather than the offsets
|
|
||
| if (args.steps != 1) { | ||
| res = res.scale(args.steps); | ||
| res = res.scale(range / @as(f64, @floatFromInt(args.steps - 1))); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
thanks for the fix
|
Thanks for adding this ! |
examples/qwen_3_vl/qwen_3_vl.zig
Outdated
| const w = @divExact(image_grid_thw[2], 2); | ||
|
|
||
| // Repeat the index along the 3 dimensions based on grid size (after the text (+4 tokens according to the chat template)) | ||
| const t_index = zml.Tensor.iota(Shape.init(.{ .t = t }, .i32), .t).reshape(.{ .t = -1, .hw = 1 }).repeat1d(1, h * w).flatten().addConstant(4); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is what I have:
/// Build the 3D (temporal, height, width) positional ids for the vision transformer.
///
/// `position_ids` is the base tensor the image indices are scattered into, and
/// `image_grid_thw` is the image grid given as `{ t, h, w }`.
/// Returns one tensor per dimension, each obtained by scattering the corresponding
/// index tensor into `position_ids` at `.seq = 4`.
fn buildVisionPositionIds(position_ids: Tensor, image_grid_thw: [3]u32) struct { temporal: Tensor, height: Tensor, width: Tensor } {
const t = image_grid_thw[0];
// NOTE(review): the divisor 2 is presumably the spatial merge size, hard-coded here — confirm and read it from the config instead.
const h = @divExact(image_grid_thw[1], 2);
const w = @divExact(image_grid_thw[2], 2);
// Offset of the first image token: the image starts after the text (+4 tokens according to the chat template).
const img_position = zml.Tensor.scalar(4, .i32);
// Repeat the index along the 3 dimensions based on grid size (after the text (+4 tokens according to the chat template))
// Each index is an iota along one axis, shifted by `img_position`, with its axes merged into a single `.seq` axis.
const t_index = zml.Tensor.iota(.init(.{ .t = t, .h = h, .w = w }, .i32), .t).add(img_position).merge(.{ .seq = .{ .t, .h, .w }});
// NOTE(review): h_index and w_index are built from a { .h, .w } shape while t_index uses { .t, .h, .w },
// so their merged `.seq` lengths differ whenever t != 1 — confirm which shape the reference implementation expects.
const h_index = zml.Tensor.iota(.init(.{ .h = h, .w = w }, .i32), .h).add(img_position).merge(.{ .seq = .{ .h, .w }});
const w_index = zml.Tensor.iota(.init(.{ .h = h, .w = w }, .i32), .w).add(img_position).merge(.{ .seq = .{ .h, .w }});
// Update the position ids with the 3D positional ids
// scatterSlices is used rather than dynamicUpdateSlice; presumably this is what lets it work with or without a batch axis — verify.
const temporal_ids = position_ids.scatterSlices(.{ .seq = img_position}, t_index, .{});
const height_ids = position_ids.scatterSlices(.{ .seq = img_position}, h_index, .{});
const width_ids = position_ids.scatterSlices(.{ .seq = img_position}, w_index, .{});
// For each dim, the output is the position ids (from 4 to grid size / merge size) repeated to reach the grid size
return .{
.temporal = temporal_ids,
.height = height_ids,
.width = width_ids,
};
}
But note that something is fishy: h_index and w_index don't have the same shape as t_index. If I understand the Python code correctly, t_index has the correct shape.
By using scatterSlices instead of dynamicUpdateSlice it works with/without the batch axis, so no need for unsqueeze.
examples/qwen_3_vl/qwen_3_vl.zig
Outdated
| // Build the 3D positional ids for the vision transformer | ||
| fn buildVisionPositionIds(position_ids: Tensor, image_grid_thw: [3]u32) struct { temporal: Tensor, height: Tensor, width: Tensor } { | ||
| const t = image_grid_thw[0]; | ||
| const h = @divExact(image_grid_thw[1], 2); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why div by 2 ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's divided by the spatial merge size (I hardcoded it; I'll fix it). It is the side length of the square block of vision patches merged into a single visual token (that's why the number of tokens is time * height * width / spatial_merge_size^2). So each side, grid_h and grid_w, is divided by it to match the number of image tokens.
examples/qwen_3_vl/qwen_3_vl.zig
Outdated
| const w = @divExact(image_grid_thw[2], 2); | ||
|
|
||
| // Repeat the index along the 3 dimensions based on grid size (after the text (+4 tokens according to the chat template)) | ||
| const t_index = zml.Tensor.iota(Shape.init(.{ .t = t }, .i32), .t).reshape(.{ .t = -1, .hw = 1 }).repeat1d(1, h * w).flatten().addConstant(4); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
But even beyond that I don't think the code is correct. Here is the output I have for temporal_ids:
info(zml): Device buffer: Buffer({1,512,i32})@738100e800: {
{0,1,2,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408}
},
I think it should read .{ 0, 1, 2, 3, 4, 4, 4, ..., 4, 4, 5, 6, 7, ... } but instead we have this cliff: 4, 4, 4, 21.
Edit:
we checked together the reference implem and it is indeed correct.
examples/qwen3_vl/qwen3_vl.zig
Outdated
| position_ids = position_ids.dynamicUpdateSlice(.{ .seq = zml.Tensor.scalar(0, .i32) }, before_image_positions); | ||
|
|
||
| // Repeat the index along the 3 dimensions based on grid size (after the text (+4 tokens according to the chat template)) | ||
| const t_index = zml.Tensor.iota(.init(.{ .bs = input_ids.dim(.bs), .t = t, .hw = h * w }, .i32), .t).reshape(.{ .bs = input_ids.dim(.bs), .seq = -1 }).add(text_before_image); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use an intermediate variable to avoid repetition: t_index, h_index and w_index can all use the same shape for the iota and the reshape.
| return .{ stacked_position_ids, mrope_position_deltas }; | ||
| } | ||
|
|
||
| test "buildVisionPositionIds" { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is way too complicated for a simple test, here is a refactoring (assuming text_with_image is replaced by an integer)
// Checks buildVisionPositionIds on a single 79-token prompt with a { 1, 16, 16 } image grid
// by comparing the first returned tensor against the expected temporal / height / width ids.
test "buildVisionPositionIds" {
    std.debug.print("buildVisionPositionIds test started\n", .{});
    const platform = zml.testing.env();
    const allocator = std.testing.allocator;

    // Parameters
    const batch_size: u32 = 1;
    const seq_len: u32 = 79;

    // Create input buffers.
    // The slice itself is never reassigned (only its elements are written), so it must be
    // `const`: recent Zig compilers reject an unmutated `var`.
    const input_ids_data = try allocator.alloc(i32, batch_size * seq_len);
    defer allocator.free(input_ids_data);
    for (input_ids_data, 0..) |*token, i| {
        token.* = @intCast(i % seq_len);
    }
    const input_ids_d = try zml.Buffer.fromSlice(platform, .{ .bs = batch_size, .seq = seq_len }, input_ids_data);
    defer input_ids_d.deinit();

    // Presumably { text tokens before the image, image tokens, text tokens after the image };
    // the three values sum to seq_len (4 + 64 + 11 = 79) — confirm against buildVisionPositionIds.
    const prompt_shape_d = try zml.Buffer.fromSlice(platform, .{ .seq = 3 }, &[_]i32{ 4, 64, 11 });
    defer prompt_shape_d.deinit();

    // Compile and execute buildVisionPositionIds
    const Local = struct {
        pub fn positionIds(input_ids: zml.Tensor, prompt_shape: zml.Tensor) zml.Tensor {
            // Only the first element of the returned tuple (the position ids) is checked here.
            return buildVisionPositionIds(2, input_ids, seq_len, prompt_shape, .{ 1, 16, 16 })[0];
        }
    };
    const result = try zml.testing.compileAndCall(
        platform,
        Local.positionIds,
        .{ input_ids_d, prompt_shape_d },
    );
    defer result.deinit();

    const expected = [3][79]i32{
        // temporal
        .{ 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22 },
        // height
        .{ 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22 },
        // width
        .{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 4, 5, 6, 7, 8, 9, 10, 11, 4, 5, 6, 7, 8, 9, 10, 11, 4, 5, 6, 7, 8, 9, 10, 11, 4, 5, 6, 7, 8, 9, 10, 11, 4, 5, 6, 7, 8, 9, 10, 11, 4, 5, 6, 7, 8, 9, 10, 11, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22 },
    };
    try std.testing.expectEqual(expected, try result.getValue([3][79]i32));
}
examples/qwen3_vl/qwen3_vl.zig
Outdated
| } | ||
|
|
||
| pub fn applyRotaryPositionalEmbedding(q: Tensor, k: Tensor, cos: Tensor, sin: Tensor) struct { Tensor, Tensor } { | ||
| const cos_q_unsqueezed = zml.torch.unsqueeze(cos, -2).broad(q.shape()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why do you need unsqueeze before broadcasting ? broadcasting should insert the axis if needed. See zml.nn.rope
examples/qwen3_vl/qwen3_vl.zig
Outdated
|
|
||
| // Interleaved mrope | ||
| // Slice the frequency tensor to get the frequency for the temporal, height and width dimensions | ||
| var freqs_t = freqs.slice1d(0, .{ .start = 0, .end = 1 }).squeeze(0).withTags(.{ .bs, .seq, .dh }); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
chunkExact
examples/qwen3_vl/qwen3_vl.zig
Outdated
| // Repeat 3 times (t, h, w) on dim 0, pos id 1dim on dim 1 (number of images I think) (3, batch size, dim_head//2, 1) | ||
| const inv_freq_expanded = inv_freq.reshape(.{ -1, 1 }); | ||
| const position_ids_expanded = position_ids.reshape(.{ 3, @as(u32, @intCast(position_ids.dim(1))), 1, -1 }).convert(.f32); // (3, bs, 1, seq len) | ||
| var freqs = inv_freq_expanded.matmul(position_ids_expanded).transpose(.{ 0, 1, 3, 2 }); // (3, bs, dim_head//2, seq len) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please don't use matmul; use dot instead and specify the axis you want to contract over. This should avoid the reshapes/transpose.
examples/qwen3_vl/qwen3_vl.zig
Outdated
| // Gather scatter the frequencies to build the tensor such as [t,h,w,t,h,w,...,t,h,w,t,t,t,t] | ||
| const h_input = freqs_h.gather(.{ .dh = h_indices }, .{ .indices_are_sorted = true }); | ||
| const w_input = freqs_w.gather(.{ .dh = w_indices }, .{ .indices_are_sorted = true }); | ||
| freqs_t = freqs_t.transpose(.{ 2, 0, 1 }); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
avoid axis as integer, use the names
zml/hostbuffer.zig
Outdated
| const sliced_self = self.slice1d(0, .{ .start = di, .end = di + 1 }).squeeze(0); | ||
| try sliced_self.prettyPrintIndented(writer, num_rows, indent_level + 2, options); | ||
| } | ||
| // if (n < num_rows) return; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
pls revert
zml/testing.zig
Outdated
| log.info("all good for {s} !", .{name}); | ||
| } | ||
|
|
||
| pub fn testLayerWithoutInput( |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would expect testLayer to work with an empty argument list. Is there a bug there ? can we fix that instead of introducing a small variant ?
| image_grid_thw: [3]u32, | ||
| ) struct { Tensor, Tensor } { | ||
| // Get the number of text tokens before the image, the number of image tokens and the number of text tokens after the image | ||
| const text_before_image = prompt_shape.choose1d(0, 0).convert(.i32); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
the content of prompt_shape should be documented.
And I guess this should also be a chunkExact(3)
examples/qwen3_vl/qwen3_vl.zig
Outdated
| const t = grid_thw[0]; | ||
| const h = grid_thw[1]; | ||
| const w = grid_thw[2]; | ||
| var hpos_ids = zml.torch.unsqueeze(zml.Tensor.arange(.{ .start = 0, .end = h, .step = 1 }, .f32), 1).repeat1d(1, @as(u63, @intCast(w))); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
iota
examples/qwen3_vl/qwen3_vl.zig
Outdated
| const h = grid_thw[1]; | ||
| const w = grid_thw[2]; | ||
| var hpos_ids = zml.torch.unsqueeze(zml.Tensor.arange(.{ .start = 0, .end = h, .step = 1 }, .f32), 1).repeat1d(1, @as(u63, @intCast(w))); | ||
| hpos_ids = hpos_ids.reshape(.{ .h_div = @divExact(h, m_size), .m1 = m_size, .w_div = @divExact(w, m_size), .m2 = m_size }); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
splitAxis
examples/qwen3_vl/qwen3_vl.zig
Outdated
|
|
||
| pub const VisionRotaryEmbedding = struct { | ||
| rope_opts: zml.nn.RopeOpts, | ||
| dim: u32, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
what is the dim used for ?
examples/qwen3_vl/qwen3_vl.zig
Outdated
| rope_opts: zml.nn.RopeOpts, | ||
| dim: u32, | ||
|
|
||
| pub fn init(allocator: std.mem.Allocator, dim: u32, theta: f32) !VisionRotaryEmbedding { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why allocator?
my feeling is that this struct should not exist, put the options directly into the parent struct,
and the forward can be inlined or maybe a helper function.
examples/qwen3_vl/main.zig
Outdated
| const prompt_buffer = try allocator.alloc(u32, max_seq_len); | ||
|
|
||
| // Create the HostBuffers for the prompt, prompt shape, image size and token index | ||
| @memcpy(prompt_buffer[0..prompt_encoded.len], prompt_encoded); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why copy here ? you could reuse the already allocated memory ?
examples/qwen3_vl/main.zig
Outdated
| const number_image_pad_tokens = 1 * (@as(u32, @intFromFloat(h_resized)) / patch_size) * (@as(u32, @intFromFloat(w_resized)) / patch_size) / std.math.pow(u32, config.vision_config.spatial_merge_size, 2); | ||
|
|
||
| // Apply the chat template to the prompt | ||
| const result = try applyChatTemplate(allocator, tokenizer, prompt, number_image_pad_tokens); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
result name is reserved for the value you intend to return, so this is misleading.
examples/qwen3_vl/main.zig
Outdated
| // Apply the chat template to the prompt | ||
| const result = try applyChatTemplate(allocator, tokenizer, prompt, number_image_pad_tokens); | ||
| const prompt_encoded = result.prompt_tokens; | ||
| const prompt_shape = result.prompt_shape; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
avoid this alias only used once or twice.
examples/qwen3_vl/main.zig
Outdated
| const prompt_shape_buffer = try zml.HostBuffer.empty(allocator, zml.Shape.init(.{ .chw = 3 }, .i32)); | ||
| @memcpy(prompt_shape_buffer.mutItems(i32), &prompt_shape); | ||
| const image_size_buffer = try zml.HostBuffer.empty(allocator, zml.Shape.init(.{ .chw = 3 }, .i32)); | ||
| @memcpy(image_size_buffer.mutItems(i32), &image_size); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When a variable (like image_size) is only used once, prefer inlining its definition where it's used.
In any case keep the definition just above where it's used, so I don't have to scroll up to understand the code.
| @memcpy(prompt_shape_buffer.mutItems(i32), &prompt_shape); | ||
| const image_size_buffer = try zml.HostBuffer.empty(allocator, zml.Shape.init(.{ .chw = 3 }, .i32)); | ||
| @memcpy(image_size_buffer.mutItems(i32), &image_size); | ||
| const token_index_buffer = try zml.HostBuffer.empty(allocator, zml.Shape.init(.{}, .i64)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Feels like we should have a fromArray that allocates and copies the array content, and rename the current fromArray to fromArrayPtr.
Qwen3VL bf16 example implementation based on the implementation of transformers for Qwen3VL 4B. Compile with a known seq length and a known resize shape for the image.
Supports BMP (own implementation), and JPG and PNG via the zignal library (fork, added in third_party).