/*
 * Copyright (c) 2015 Manojkumar Bhosale ([email protected])
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
#define AVUTIL_MIPS_GENERIC_MACROS_MSA_H

#include <stdint.h>
#include <msa.h>
#include <config.h>

/* MSA vector registers are 128 bits wide, i.e. 16 bytes. */
#define ALIGNMENT 16
/* Align buffers to twice the requested alignment. */
#define ALLOC_ALIGNED(align) __attribute__ ((aligned((align) << 1)))
/* Load one whole MSA vector of the given element type from 'psrc'. */
#define LD_V(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UB(...) LD_V(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_V(v16i8, __VA_ARGS__)
#define LD_UH(...) LD_V(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_V(v8i16, __VA_ARGS__)
#define LD_UW(...) LD_V(v4u32, __VA_ARGS__)
#define LD_SW(...) LD_V(v4i32, __VA_ARGS__)

/* Store one whole MSA vector 'in' of the given element type to 'pdst'. */
#define ST_V(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UB(...) ST_V(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_V(v16i8, __VA_ARGS__)
#define ST_UH(...) ST_V(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_V(v8i16, __VA_ARGS__)
#define ST_UW(...) ST_V(v4u32, __VA_ARGS__)
#define ST_SW(...) ST_V(v4i32, __VA_ARGS__)
#if HAVE_MIPS32R6 || HAVE_MIPS64R6
/* MIPS R6 handles unaligned scalar accesses in hardware, so plain C
 * dereferences suffice for the unaligned load/store helpers. */
#define LH(psrc)                              \
( {                                           \
    uint16_t val_lh_m = *(uint16_t *)(psrc);  \
    val_lh_m;                                 \
} )

#define LW(psrc)                              \
( {                                           \
    uint32_t val_lw_m = *(uint32_t *)(psrc);  \
    val_lw_m;                                 \
} )

#if (__mips == 64)
#define LD(psrc)                              \
( {                                           \
    uint64_t val_ld_m = *(uint64_t *)(psrc);  \
    val_ld_m;                                 \
} )
#else  // !(__mips == 64)
/* 32-bit target: build the 64-bit value from two 32-bit loads
 * (low word first, then high word shifted into place). */
#define LD(psrc)                                                    \
( {                                                                 \
    uint8_t *psrc_ld_m = (uint8_t *) (psrc);                        \
    uint32_t val0_ld_m, val1_ld_m;                                  \
    uint64_t val_ld_m = 0;                                          \
                                                                    \
    val0_ld_m = LW(psrc_ld_m);                                      \
    val1_ld_m = LW(psrc_ld_m + 4);                                  \
                                                                    \
    val_ld_m = (uint64_t) (val1_ld_m);                              \
    val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000);  \
    val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m);        \
                                                                    \
    val_ld_m;                                                       \
} )
#endif  // (__mips == 64)

#define SH(val, pdst) *(uint16_t *)(pdst) = (val);
#define SW(val, pdst) *(uint32_t *)(pdst) = (val);
#define SD(val, pdst) *(uint64_t *)(pdst) = (val);

#else  // !HAVE_MIPS32R6 && !HAVE_MIPS64R6
/* Pre-R6: use explicit unaligned load/store instruction sequences. */
#define LH(psrc)                                 \
( {                                              \
    uint8_t *psrc_lh_m = (uint8_t *) (psrc);     \
    uint16_t val_lh_m;                           \
                                                 \
    __asm__ volatile (                           \
        "ulh  %[val_lh_m],  %[psrc_lh_m]  \n\t"  \
                                                 \
        : [val_lh_m] "=r" (val_lh_m)             \
        : [psrc_lh_m] "m" (*psrc_lh_m)           \
    );                                           \
                                                 \
    val_lh_m;                                    \
} )

/* Unaligned 32-bit load via the lwr/lwl pair. */
#define LW(psrc)                                     \
( {                                                  \
    uint8_t *psrc_lw_m = (uint8_t *) (psrc);         \
    uint32_t val_lw_m;                               \
                                                     \
    __asm__ volatile (                               \
        "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t"      \
        "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t"      \
                                                     \
        : [val_lw_m] "=&r"(val_lw_m)                 \
        : [psrc_lw_m] "r"(psrc_lw_m)                 \
    );                                               \
                                                     \
    val_lw_m;                                        \
} )

#if (__mips == 64)
/* Unaligned 64-bit load via the ldr/ldl pair (64-bit ISA only). */
#define LD(psrc)                                     \
( {                                                  \
    uint8_t *psrc_ld_m = (uint8_t *) (psrc);         \
    uint64_t val_ld_m = 0;                           \
                                                     \
    __asm__ volatile (                               \
        "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t"      \
        "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t"      \
                                                     \
        : [val_ld_m] "=&r" (val_ld_m)                \
        : [psrc_ld_m] "r" (psrc_ld_m)                \
    );                                               \
                                                     \
    val_ld_m;                                        \
} )
#else  // !(__mips == 64)
/* 32-bit target: build the 64-bit value from two unaligned 32-bit loads. */
#define LD(psrc)                                                    \
( {                                                                 \
    uint8_t *psrc_ld_m = (uint8_t *) (psrc);                        \
    uint32_t val0_ld_m, val1_ld_m;                                  \
    uint64_t val_ld_m = 0;                                          \
                                                                    \
    val0_ld_m = LW(psrc_ld_m);                                      \
    val1_ld_m = LW(psrc_ld_m + 4);                                  \
                                                                    \
    val_ld_m = (uint64_t) (val1_ld_m);                              \
    val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000);  \
    val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m);        \
                                                                    \
    val_ld_m;                                                       \
} )
#endif  // (__mips == 64)

/* Unaligned 16-bit store. */
#define SH(val, pdst)                            \
{                                                \
    uint8_t *pdst_sh_m = (uint8_t *) (pdst);     \
    uint16_t val_sh_m = (val);                   \
                                                 \
    __asm__ volatile (                           \
        "ush  %[val_sh_m],  %[pdst_sh_m]  \n\t"  \
                                                 \
        : [pdst_sh_m] "=m" (*pdst_sh_m)          \
        : [val_sh_m] "r" (val_sh_m)              \
    );                                           \
}

/* Unaligned 32-bit store. */
#define SW(val, pdst)                            \
{                                                \
    uint8_t *pdst_sw_m = (uint8_t *) (pdst);     \
    uint32_t val_sw_m = (val);                   \
                                                 \
    __asm__ volatile (                           \
        "usw  %[val_sw_m],  %[pdst_sw_m]  \n\t"  \
                                                 \
        : [pdst_sw_m] "=m" (*pdst_sw_m)          \
        : [val_sw_m] "r" (val_sw_m)              \
    );                                           \
}

/* Unaligned 64-bit store as two unaligned 32-bit stores (low word first). */
#define SD(val, pdst)                                              \
{                                                                  \
    uint8_t *pdst_sd_m = (uint8_t *) (pdst);                       \
    uint32_t val0_sd_m, val1_sd_m;                                 \
                                                                   \
    val0_sd_m = (uint32_t) ((val) & 0x00000000FFFFFFFF);           \
    val1_sd_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF);   \
                                                                   \
    SW(val0_sd_m, pdst_sd_m);                                      \
    SW(val1_sd_m, pdst_sd_m + 4);                                  \
}
#endif // HAVE_MIPS32R6 || HAVE_MIPS64R6
/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc (source pointer to load from)
                         - stride
                 Outputs - out0, out1, out2, out3
   Details     : Loads word in 'out0' from (psrc)
                 Loads word in 'out1' from (psrc + stride)
                 Loads word in 'out2' from (psrc + 2 * stride)
                 Loads word in 'out3' from (psrc + 3 * stride)
*/
#define LW4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    out0 = LW((psrc));                             \
    out1 = LW((psrc) + stride);                    \
    out2 = LW((psrc) + 2 * stride);                \
    out3 = LW((psrc) + 3 * stride);                \
}

#define LW2(psrc, stride, out0, out1)  \
{                                      \
    out0 = LW((psrc));                 \
    out1 = LW((psrc) + stride);        \
}

/* Description : Load double words with stride
   Arguments   : Inputs  - psrc (source pointer to load from)
                         - stride
                 Outputs - out0, out1
   Details     : Loads double word in 'out0' from (psrc)
                 Loads double word in 'out1' from (psrc + stride)
*/
#define LD2(psrc, stride, out0, out1)  \
{                                      \
    out0 = LD((psrc));                 \
    out1 = LD((psrc) + stride);        \
}
#define LD4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    LD2((psrc), stride, out0, out1);               \
    LD2((psrc) + 2 * stride, stride, out2, out3);  \
}
/* Description : Store 4 words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Stores word from 'in0' to (pdst)
                 Stores word from 'in1' to (pdst + stride)
                 Stores word from 'in2' to (pdst + 2 * stride)
                 Stores word from 'in3' to (pdst + 3 * stride)
*/
#define SW4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SW(in0, (pdst));                           \
    SW(in1, (pdst) + stride);                  \
    SW(in2, (pdst) + 2 * stride);              \
    SW(in3, (pdst) + 3 * stride);              \
}

/* Description : Store 4 double words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Stores double word from 'in0' to (pdst)
                 Stores double word from 'in1' to (pdst + stride)
                 Stores double word from 'in2' to (pdst + 2 * stride)
                 Stores double word from 'in3' to (pdst + 3 * stride)
*/
#define SD4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SD(in0, (pdst));                           \
    SD(in1, (pdst) + stride);                  \
    SD(in2, (pdst) + 2 * stride);              \
    SD(in3, (pdst) + 3 * stride);              \
}
/* Description : Load vector elements with stride
   Arguments   : Inputs  - psrc (source pointer to load from)
                         - stride
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Loads elements in 'out0' from (psrc)
                 Loads elements in 'out1' from (psrc + stride)
*/
#define LD_V2(RTYPE, psrc, stride, out0, out1)  \
{                                               \
    out0 = LD_V(RTYPE, (psrc));                 \
    out1 = LD_V(RTYPE, (psrc) + stride);        \
}
#define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__)
#define LD_UH2(...) LD_V2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__)
#define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__)

#define LD_V3(RTYPE, psrc, stride, out0, out1, out2)  \
{                                                     \
    LD_V2(RTYPE, (psrc), stride, out0, out1);         \
    out2 = LD_V(RTYPE, (psrc) + 2 * stride);          \
}
#define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__)
#define LD_SB3(...) LD_V3(v16i8, __VA_ARGS__)

#define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3)  \
{                                                           \
    LD_V2(RTYPE, (psrc), stride, out0, out1);               \
    LD_V2(RTYPE, (psrc) + 2 * stride , stride, out2, out3); \
}
#define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__)
#define LD_UH4(...) LD_V4(v8u16, __VA_ARGS__)
#define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__)
#define LD_SW4(...) LD_V4(v4i32, __VA_ARGS__)

#define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4)  \
{                                                                 \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);         \
    out4 = LD_V(RTYPE, (psrc) + 4 * stride);                      \
}
#define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__)

#define LD_V6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
{                                                                       \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_V2(RTYPE, (psrc) + 4 * stride, stride, out4, out5);              \
}
#define LD_UB6(...) LD_V6(v16u8, __VA_ARGS__)
#define LD_SB6(...) LD_V6(v16i8, __VA_ARGS__)
#define LD_UH6(...) LD_V6(v8u16, __VA_ARGS__)
#define LD_SH6(...) LD_V6(v8i16, __VA_ARGS__)

#define LD_V7(RTYPE, psrc, stride,                               \
              out0, out1, out2, out3, out4, out5, out6)          \
{                                                                \
    LD_V5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);  \
    LD_V2(RTYPE, (psrc) + 5 * stride, stride, out5, out6);       \
}
#define LD_UB7(...) LD_V7(v16u8, __VA_ARGS__)
#define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__)

#define LD_V8(RTYPE, psrc, stride,                                      \
              out0, out1, out2, out3, out4, out5, out6, out7)           \
{                                                                       \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_V4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
}
#define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__)
#define LD_UH8(...) LD_V8(v8u16, __VA_ARGS__)
#define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__)
#define LD_SW8(...) LD_V8(v4i32, __VA_ARGS__)

#define LD_V16(RTYPE, psrc, stride,                                   \
               out0, out1, out2, out3, out4, out5, out6, out7,        \
               out8, out9, out10, out11, out12, out13, out14, out15)  \
{                                                                     \
    LD_V8(RTYPE, (psrc), stride,                                      \
          out0, out1, out2, out3, out4, out5, out6, out7);            \
    LD_V8(RTYPE, (psrc) + 8 * stride, stride,                         \
          out8, out9, out10, out11, out12, out13, out14, out15);      \
}
#define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__)
/* Description : Store vectors with stride
   Arguments   : Inputs  - in0, in1, stride
                 Outputs - pdst (destination pointer to store to)
   Details     : Stores elements from 'in0' to (pdst)
                 Stores elements from 'in1' to (pdst + stride)
*/
#define ST_V2(RTYPE, in0, in1, pdst, stride)  \
{                                             \
    ST_V(RTYPE, in0, (pdst));                 \
    ST_V(RTYPE, in1, (pdst) + stride);        \
}
#define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__)
#define ST_SB2(...) ST_V2(v16i8, __VA_ARGS__)
#define ST_UH2(...) ST_V2(v8u16, __VA_ARGS__)
#define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__)
#define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__)

#define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride)     \
{                                                          \
    ST_V2(RTYPE, in0, in1, (pdst), stride);                \
    ST_V2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);   \
}
#define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_V4(v16i8, __VA_ARGS__)
#define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__)
#define ST_SW4(...) ST_V4(v4i32, __VA_ARGS__)

#define ST_V6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride)  \
{                                                                 \
    ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride);             \
    ST_V2(RTYPE, in4, in5, (pdst) + 4 * stride, stride);          \
}
#define ST_SH6(...) ST_V6(v8i16, __VA_ARGS__)

#define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride);                       \
    ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);          \
}
#define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__)
#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__)
#define ST_SW8(...) ST_V8(v4i32, __VA_ARGS__)
/* Description : Store half word elements of vector with stride
 * Arguments   : Inputs - in source vector
 *                      - pdst (destination pointer to store to)
 *                      - stride
 * Details     : Stores half word 'idx0' from 'in' to (pdst)
 *               Stores half word 'idx1' from 'in' to (pdst + stride)
 *               Similar for other elements
 */
#define ST_H1(in, idx, pdst)                       \
{                                                  \
    uint16_t out0_m;                               \
    out0_m = __msa_copy_u_h((v8i16) in, idx);      \
    SH(out0_m, (pdst));                            \
}
#define ST_H2(in, idx0, idx1, pdst, stride)        \
{                                                  \
    uint16_t out0_m, out1_m;                       \
    out0_m = __msa_copy_u_h((v8i16) in, idx0);     \
    out1_m = __msa_copy_u_h((v8i16) in, idx1);     \
    SH(out0_m, (pdst));                            \
    SH(out1_m, (pdst) + stride);                   \
}
#define ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                        \
    uint16_t out0_m, out1_m, out2_m, out3_m;             \
    out0_m = __msa_copy_u_h((v8i16) in, idx0);           \
    out1_m = __msa_copy_u_h((v8i16) in, idx1);           \
    out2_m = __msa_copy_u_h((v8i16) in, idx2);           \
    out3_m = __msa_copy_u_h((v8i16) in, idx3);           \
    SH(out0_m, (pdst));                                  \
    SH(out1_m, (pdst) + stride);                         \
    SH(out2_m, (pdst) + 2 * stride);                     \
    SH(out3_m, (pdst) + 3 * stride);                     \
}
#define ST_H8(in, idx0, idx1, idx2, idx3, idx4, idx5,          \
              idx6, idx7, pdst, stride)                        \
{                                                              \
    ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride)            \
    ST_H4(in, idx4, idx5, idx6, idx7, (pdst) + 4*stride, stride) \
}
/* Description : Store word elements of vector with stride
 * Arguments   : Inputs - in source vector
 *                      - pdst (destination pointer to store to)
 *                      - stride
 * Details     : Stores word 'idx0' from 'in' to (pdst)
 *               Stores word 'idx1' from 'in' to (pdst + stride)
 *               Similar for other elements
 */
#define ST_W1(in, idx, pdst)                      \
{                                                 \
    uint32_t out0_m;                              \
    out0_m = __msa_copy_u_w((v4i32) in, idx);     \
    SW(out0_m, (pdst));                           \
}
#define ST_W2(in, idx0, idx1, pdst, stride)       \
{                                                 \
    uint32_t out0_m, out1_m;                      \
    out0_m = __msa_copy_u_w((v4i32) in, idx0);    \
    out1_m = __msa_copy_u_w((v4i32) in, idx1);    \
    SW(out0_m, (pdst));                           \
    SW(out1_m, (pdst) + stride);                  \
}
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                        \
    uint32_t out0_m, out1_m, out2_m, out3_m;             \
    out0_m = __msa_copy_u_w((v4i32) in, idx0);           \
    out1_m = __msa_copy_u_w((v4i32) in, idx1);           \
    out2_m = __msa_copy_u_w((v4i32) in, idx2);           \
    out3_m = __msa_copy_u_w((v4i32) in, idx3);           \
    SW(out0_m, (pdst));                                  \
    SW(out1_m, (pdst) + stride);                         \
    SW(out2_m, (pdst) + 2*stride);                       \
    SW(out3_m, (pdst) + 3*stride);                       \
}
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3,          \
              idx4, idx5, idx6, idx7, pdst, stride)      \
{                                                        \
    ST_W4(in0, idx0, idx1, idx2, idx3, pdst, stride)     \
    ST_W4(in1, idx4, idx5, idx6, idx7, pdst + 4*stride, stride) \
}
/* Description : Store double word elements of vector with stride
 * Arguments   : Inputs - in source vector
 *                      - pdst (destination pointer to store to)
 *                      - stride
 * Details     : Stores double word 'idx0' from 'in' to (pdst)
 *               Stores double word 'idx1' from 'in' to (pdst + stride)
 *               Similar for other elements
 */
#define ST_D1(in, idx, pdst)                      \
{                                                 \
    uint64_t out0_m;                              \
    out0_m = __msa_copy_u_d((v2i64) in, idx);     \
    SD(out0_m, (pdst));                           \
}
#define ST_D2(in, idx0, idx1, pdst, stride)       \
{                                                 \
    uint64_t out0_m, out1_m;                      \
    out0_m = __msa_copy_u_d((v2i64) in, idx0);    \
    out1_m = __msa_copy_u_d((v2i64) in, idx1);    \
    SD(out0_m, (pdst));                           \
    SD(out1_m, (pdst) + stride);                  \
}
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                              \
    uint64_t out0_m, out1_m, out2_m, out3_m;                   \
    out0_m = __msa_copy_u_d((v2i64) in0, idx0);                \
    out1_m = __msa_copy_u_d((v2i64) in0, idx1);                \
    out2_m = __msa_copy_u_d((v2i64) in1, idx2);                \
    out3_m = __msa_copy_u_d((v2i64) in1, idx3);                \
    SD(out0_m, (pdst));                                        \
    SD(out1_m, (pdst) + stride);                               \
    SD(out2_m, (pdst) + 2 * stride);                           \
    SD(out3_m, (pdst) + 3 * stride);                           \
}
#define ST_D8(in0, in1, in2, in3, idx0, idx1, idx2, idx3,            \
              idx4, idx5, idx6, idx7, pdst, stride)                  \
{                                                                    \
    ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)            \
    ST_D4(in2, in3, idx4, idx5, idx6, idx7, pdst + 4 * stride, stride) \
}
/* Description : Store as 12x8 byte block to destination memory from
                 input vectors
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Index 0 double word element from input vector 'in0' is copied
                 and stored to destination memory at (pblk_12x8_m) followed by
                 index 2 word element from same input vector 'in0' at
                 (pblk_12x8_m + 8)
                 Similar to remaining lines
*/
#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                        \
    uint64_t out0_m, out1_m, out2_m, out3_m;                             \
    uint64_t out4_m, out5_m, out6_m, out7_m;                             \
    uint32_t out8_m, out9_m, out10_m, out11_m;                           \
    uint32_t out12_m, out13_m, out14_m, out15_m;                         \
    uint8_t *pblk_12x8_m = (uint8_t *) (pdst);                           \
                                                                         \
    out0_m = __msa_copy_u_d((v2i64) in0, 0);                             \
    out1_m = __msa_copy_u_d((v2i64) in1, 0);                             \
    out2_m = __msa_copy_u_d((v2i64) in2, 0);                             \
    out3_m = __msa_copy_u_d((v2i64) in3, 0);                             \
    out4_m = __msa_copy_u_d((v2i64) in4, 0);                             \
    out5_m = __msa_copy_u_d((v2i64) in5, 0);                             \
    out6_m = __msa_copy_u_d((v2i64) in6, 0);                             \
    out7_m = __msa_copy_u_d((v2i64) in7, 0);                             \
                                                                         \
    out8_m =  __msa_copy_u_w((v4i32) in0, 2);                            \
    out9_m =  __msa_copy_u_w((v4i32) in1, 2);                            \
    out10_m = __msa_copy_u_w((v4i32) in2, 2);                            \
    out11_m = __msa_copy_u_w((v4i32) in3, 2);                            \
    out12_m = __msa_copy_u_w((v4i32) in4, 2);                            \
    out13_m = __msa_copy_u_w((v4i32) in5, 2);                            \
    out14_m = __msa_copy_u_w((v4i32) in6, 2);                            \
    out15_m = __msa_copy_u_w((v4i32) in7, 2);                            \
                                                                         \
    SD(out0_m, pblk_12x8_m);                                             \
    SW(out8_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += stride;                                               \
    SD(out1_m, pblk_12x8_m);                                             \
    SW(out9_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += stride;                                               \
    SD(out2_m, pblk_12x8_m);                                             \
    SW(out10_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out3_m, pblk_12x8_m);                                             \
    SW(out11_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out4_m, pblk_12x8_m);                                             \
    SW(out12_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out5_m, pblk_12x8_m);                                             \
    SW(out13_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out6_m, pblk_12x8_m);                                             \
    SW(out14_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out7_m, pblk_12x8_m);                                             \
    SW(out15_m, pblk_12x8_m + 8);                                        \
}
/* Description : average with rounding (in0 + in1 + 1) / 2.
   Arguments   : Inputs  - in0, in1, in2, in3,
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each byte element from 'in0' vector is added with each byte
                 element from 'in1' vector. The addition of the elements plus 1
                 (for rounding) is done unsigned with full precision,
                 i.e. the result has one extra bit. Unsigned division by 2
                 (or logical shift right by one bit) is performed before writing
                 the result to vector 'out0'
                 Similar for the pair of 'in2' and 'in3'
*/
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)       \
{                                                             \
    out0 = (RTYPE) __msa_aver_u_b((v16u8) in0, (v16u8) in1);  \
    out1 = (RTYPE) __msa_aver_u_b((v16u8) in2, (v16u8) in3);  \
}
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)

#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)              \
    AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3)              \
}
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
/* Description : Immediate number of columns to slide
   Arguments   : Inputs  - s, d, slide_val
                 Outputs - out
                 Return Type - as per RTYPE
   Details     : Byte elements from 'd' vector are slide into 's' by
                 number of elements specified by 'slide_val'
*/
#define SLDI_B(RTYPE, d, s, slide_val, out)                        \
{                                                                  \
    out = (RTYPE) __msa_sldi_b((v16i8) d, (v16i8) s, slide_val);   \
}

#define SLDI_B2(RTYPE, d0, s0, d1, s1, slide_val, out0, out1)  \
{                                                              \
    SLDI_B(RTYPE, d0, s0, slide_val, out0)                     \
    SLDI_B(RTYPE, d1, s1, slide_val, out1)                     \
}
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
#define SLDI_B2_SW(...) SLDI_B2(v4i32, __VA_ARGS__)

#define SLDI_B3(RTYPE, d0, s0, d1, s1, d2, s2, slide_val,  \
                out0, out1, out2)                          \
{                                                          \
    SLDI_B2(RTYPE, d0, s0, d1, s1, slide_val, out0, out1)  \
    SLDI_B(RTYPE, d2, s2, slide_val, out2)                 \
}
#define SLDI_B3_UB(...) SLDI_B3(v16u8, __VA_ARGS__)
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)

#define SLDI_B4(RTYPE, d0, s0, d1, s1, d2, s2, d3, s3,     \
                slide_val, out0, out1, out2, out3)         \
{                                                          \
    SLDI_B2(RTYPE, d0, s0, d1, s1, slide_val, out0, out1)  \
    SLDI_B2(RTYPE, d2, s2, d3, s3, slide_val, out2, out3)  \
}
#define SLDI_B4_UB(...) SLDI_B4(v16u8, __VA_ARGS__)
#define SLDI_B4_SB(...) SLDI_B4(v16i8, __VA_ARGS__)
#define SLDI_B4_SH(...) SLDI_B4(v8i16, __VA_ARGS__)
/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective byte elements from in0 & in1 are copied to out0 as
                 per control vector mask0
                 Selective byte elements from in2 & in3 are copied to out1 as
                 per control vector mask1
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2);  \
}
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)

#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
    out2 = (RTYPE) __msa_vshf_b((v16i8) mask2, (v16i8) in5, (v16i8) in4);  \
}
#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)

#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3,       \
                out0, out1, out2, out3)                            \
{                                                                  \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);  \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);  \
}
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)

/* Description : Shuffle halfword vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective halfword elements from in0 & in1 are copied to out0
                 as per control vector mask0
                 Selective halfword elements from in2 & in3 are copied to out1
                 as per control vector mask1
*/
#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_h((v8i16) mask0, (v8i16) in1, (v8i16) in0);  \
    out1 = (RTYPE) __msa_vshf_h((v8i16) mask1, (v8i16) in3, (v8i16) in2);  \
}
#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)

#define VSHF_H3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
    out2 = (RTYPE) __msa_vshf_h((v8i16) mask2, (v8i16) in5, (v8i16) in4);  \
}
#define VSHF_H3_SH(...) VSHF_H3(v8i16, __VA_ARGS__)

/* Description : Shuffle word vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective word elements from in0 & in1 are copied to out0 as
                 per control vector mask0
                 Selective word elements from in2 & in3 are copied to out1 as
                 per control vector mask1
*/
#define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_w((v4i32) mask0, (v4i32) in1, (v4i32) in0);  \
    out1 = (RTYPE) __msa_vshf_w((v4i32) mask1, (v4i32) in3, (v4i32) in2);  \
}
#define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)
/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from mult0 are multiplied with
                 unsigned byte elements from cnst0 producing a result
                 twice the size of input i.e. unsigned halfword.
                 Then this multiplication results of adjacent odd-even elements
                 are added together and stored to the out vector
                 (2 unsigned halfword results)
*/
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_u_h((v16u8) mult0, (v16u8) cnst0);  \
    out1 = (RTYPE) __msa_dotp_u_h((v16u8) mult1, (v16u8) cnst1);  \
}
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)

#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3,          \
                 cnst0, cnst1, cnst2, cnst3,                 \
                 out0, out1, out2, out3)                     \
{                                                            \
    DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)

/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements from mult0 are multiplied with
                 signed byte elements from cnst0 producing a result
                 twice the size of input i.e. signed halfword.
                 Then this multiplication results of adjacent odd-even elements
                 are added together and stored to the out vector
                 (2 signed halfword results)
*/
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0);  \
    out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1);  \
}
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

#define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2,  \
                 out0, out1, out2)                                 \
{                                                                  \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);       \
    out2 = (RTYPE) __msa_dotp_s_h((v16i8) mult2, (v16i8) cnst2);   \
}
#define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)

#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
                 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                     \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)

/* Description : Dot product of halfword vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from mult0 are multiplied with
                 signed halfword elements from cnst0 producing a result
                 twice the size of input i.e. signed word.
                 Then this multiplication results of adjacent odd-even elements
                 are added together and stored to the out vector
                 (2 signed word results)
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_s_w((v8i16) mult0, (v8i16) cnst0);  \
    out1 = (RTYPE) __msa_dotp_s_w((v8i16) mult1, (v8i16) cnst1);  \
}
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)

#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,           \
                 cnst0, cnst1, cnst2, cnst3,                  \
                 out0, out1, out2, out3)                      \
{                                                             \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
}
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements from mult0 are multiplied with
                 signed byte elements from cnst0 producing a result
                 twice the size of input i.e. signed halfword.
                 Then this multiplication results of adjacent odd-even elements
                 are added to the out vector
                 (2 signed halfword results)
*/
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
{                                                                 \
    out0 = (RTYPE) __msa_dpadd_s_h((v8i16) out0,                  \
                                   (v16i8) mult0, (v16i8) cnst0); \
    out1 = (RTYPE) __msa_dpadd_s_h((v8i16) out1,                  \
                                   (v16i8) mult1, (v16i8) cnst1); \
}
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)

#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                      \
    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)

/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from mult0 are multiplied with
                 unsigned byte elements from cnst0 producing a result
                 twice the size of input i.e. unsigned halfword.
                 Then this multiplication results of adjacent odd-even elements
                 are added to the out vector
                 (2 unsigned halfword results)
*/
#define DPADD_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
{                                                                 \
    out0 = (RTYPE) __msa_dpadd_u_h((v8u16) out0,                  \
                                   (v16u8) mult0, (v16u8) cnst0); \
    out1 = (RTYPE) __msa_dpadd_u_h((v8u16) out1,                  \
                                   (v16u8) mult1, (v16u8) cnst1); \
}
#define DPADD_UB2_UH(...) DPADD_UB2(v8u16, __VA_ARGS__)

/* Description : Dot product & addition of halfword vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from mult0 are multiplied with
                 signed halfword elements from cnst0 producing a result
                 twice the size of input i.e. signed word.
                 Then this multiplication results of adjacent odd-even elements
                 are added to the out vector
                 (2 signed word results)
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
{                                                                 \
    out0 = (RTYPE) __msa_dpadd_s_w((v4i32) out0,                  \
                                   (v8i16) mult0, (v8i16) cnst0); \
    out1 = (RTYPE) __msa_dpadd_s_w((v4i32) out1,                  \
                                   (v8i16) mult1, (v8i16) cnst1); \
}
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)

#define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3,                   \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                      \
    DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)
/* Description : Minimum values between unsigned elements of
                 either vector are copied to the output vector
   Arguments   : Inputs  - in0, in1, min_vec
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Minimum of unsigned halfword element values from 'in0' and
                 'min_vec' are written to output vector 'in0'
*/
#define MIN_UH2(RTYPE, in0, in1, min_vec)               \
{                                                       \
    in0 = (RTYPE) __msa_min_u_h((v8u16) in0, min_vec);  \
    in1 = (RTYPE) __msa_min_u_h((v8u16) in1, min_vec);  \
}
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)

/* 4-vector variant: applies MIN_UH2 twice */
#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec)  \
{                                                    \
    MIN_UH2(RTYPE, in0, in1, min_vec);               \
    MIN_UH2(RTYPE, in2, in3, min_vec);               \
}
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
/* Description : Clips all halfword elements of input vector between min & max
                 out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
   Arguments   : Inputs  - in    (input vector)
                         - min   (min threshold)
                         - max   (max threshold)
                 Outputs - in    (output vector with clipped elements, in place)
                 Return Type - signed halfword
*/
#define CLIP_SH(in, min, max)                     \
{                                                 \
    in = __msa_max_s_h((v8i16) min, (v8i16) in);  \
    in = __msa_min_s_h((v8i16) max, (v8i16) in);  \
}
/* Description : Clips all signed halfword elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in  (input vector)
                 Outputs - in  (output vector with clipped elements, in place)
                 Return Type - signed halfwords
   Details     : Negative values are first clamped to 0, then the result is
                 saturated to 8 bits (max 255) while keeping halfword width
*/
#define CLIP_SH_0_255(in)                       \
{                                               \
    in = __msa_maxi_s_h((v8i16) in, 0);         \
    in = (v8i16) __msa_sat_u_h((v8u16) in, 7);  \
}

/* 2-, 4- and 8-vector variants of CLIP_SH_0_255 */
#define CLIP_SH2_0_255(in0, in1)  \
{                                 \
    CLIP_SH_0_255(in0);           \
    CLIP_SH_0_255(in1);           \
}
#define CLIP_SH4_0_255(in0, in1, in2, in3)  \
{                                           \
    CLIP_SH2_0_255(in0, in1);               \
    CLIP_SH2_0_255(in2, in3);               \
}
#define CLIP_SH8_0_255(in0, in1, in2, in3,  \
                       in4, in5, in6, in7)  \
{                                           \
    CLIP_SH4_0_255(in0, in1, in2, in3);     \
    CLIP_SH4_0_255(in4, in5, in6, in7);     \
}
/* Description : Clips all signed word elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in  (input vector)
                 Outputs - in  (output vector with clipped elements, in place)
                 Return Type - signed word
   Details     : Negative values are first clamped to 0, then the result is
                 saturated to 8 bits (max 255) while keeping word width
*/
#define CLIP_SW_0_255(in)                       \
{                                               \
    in = __msa_maxi_s_w((v4i32) in, 0);         \
    in = (v4i32) __msa_sat_u_w((v4u32) in, 7);  \
}

/* 2-, 4- and 8-vector variants of CLIP_SW_0_255 */
#define CLIP_SW2_0_255(in0, in1)  \
{                                 \
    CLIP_SW_0_255(in0);           \
    CLIP_SW_0_255(in1);           \
}
#define CLIP_SW4_0_255(in0, in1, in2, in3)  \
{                                           \
    CLIP_SW2_0_255(in0, in1);               \
    CLIP_SW2_0_255(in2, in3);               \
}
#define CLIP_SW8_0_255(in0, in1, in2, in3,  \
                       in4, in5, in6, in7)  \
{                                           \
    CLIP_SW4_0_255(in0, in1, in2, in3);     \
    CLIP_SW4_0_255(in4, in5, in6, in7);     \
}
/* Description : Addition of 4 signed word elements
                 4 signed word elements of input vector are added together and
                 the resulting integer sum is returned
   Arguments   : Inputs  - in     (signed word vector)
                 Outputs - sum_m  (i32 sum)
                 Return Type - signed word
   Details     : Pairwise horizontal add widens to doublewords, the upper
                 doubleword is splatted and added, then the low word of the
                 total is extracted as the scalar result
*/
#define HADD_SW_S32(in)                               \
( {                                                   \
    v2i64 res0_m, res1_m;                             \
    int32_t sum_m;                                    \
                                                      \
    res0_m = __msa_hadd_s_d((v4i32) in, (v4i32) in);  \
    res1_m = __msa_splati_d(res0_m, 1);               \
    res0_m += res1_m;                                 \
    sum_m = __msa_copy_s_w((v4i32) res0_m, 0);        \
    sum_m;                                            \
} )
/* Description : Addition of 8 unsigned halfword elements
                 8 unsigned halfword elements of input vector are added
                 together and the resulting integer sum is returned
   Arguments   : Inputs  - in     (unsigned halfword vector)
                 Outputs - sum_m  (u32 sum)
                 Return Type - unsigned word
   Details     : Two stages of widening pairwise adds (halfword->word,
                 word->doubleword), then the two doubleword partial sums are
                 combined and the low word extracted as the scalar result
*/
#define HADD_UH_U32(in)                                  \
( {                                                      \
    v4u32 res_m;                                         \
    v2u64 res0_m, res1_m;                                \
    uint32_t sum_m;                                      \
                                                         \
    res_m = __msa_hadd_u_w((v8u16) in, (v8u16) in);      \
    res0_m = __msa_hadd_u_d(res_m, res_m);               \
    res1_m = (v2u64) __msa_splati_d((v2i64) res0_m, 1);  \
    res0_m += res1_m;                                    \
    sum_m = __msa_copy_u_w((v4i32) res0_m, 0);           \
    sum_m;                                               \
} )
/* Description : Horizontal addition of signed byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each signed odd byte element from 'in0' is added to the
                 even signed byte element from 'in0' (pairwise) and the
                 halfword result is written to 'out0'
*/
#define HADD_SB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hadd_s_h((v16i8) in0, (v16i8) in0);  \
    out1 = (RTYPE) __msa_hadd_s_h((v16i8) in1, (v16i8) in1);  \
}
#define HADD_SB2_SH(...) HADD_SB2(v8i16, __VA_ARGS__)

/* 4-vector variant: applies HADD_SB2 twice */
#define HADD_SB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HADD_SB2(RTYPE, in0, in1, out0, out1);                           \
    HADD_SB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HADD_SB4_UH(...) HADD_SB4(v8u16, __VA_ARGS__)
#define HADD_SB4_SH(...) HADD_SB4(v8i16, __VA_ARGS__)
/* Description : Horizontal addition of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is added to the
                 even unsigned byte element from 'in0' (pairwise) and the
                 halfword result is written to 'out0'
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hadd_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hadd_u_h((v16u8) in1, (v16u8) in1);  \
}
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)

/* 3-vector variant: HADD_UB2 plus one extra horizontal add */
#define HADD_UB3(RTYPE, in0, in1, in2, out0, out1, out2)      \
{                                                             \
    HADD_UB2(RTYPE, in0, in1, out0, out1);                    \
    out2 = (RTYPE) __msa_hadd_u_h((v16u8) in2, (v16u8) in2);  \
}
#define HADD_UB3_UH(...) HADD_UB3(v8u16, __VA_ARGS__)

/* 4-vector variant: applies HADD_UB2 twice */
#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HADD_UB2(RTYPE, in0, in1, out0, out1);                           \
    HADD_UB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HADD_UB4_UB(...) HADD_UB4(v16u8, __VA_ARGS__)
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
#define HADD_UB4_SH(...) HADD_UB4(v8i16, __VA_ARGS__)
/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is subtracted from
                 the even unsigned byte element from 'in0' (pairwise) and the
                 halfword result is written to 'out0'
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1);  \
}
#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)

/* 4-vector variant: applies HSUB_UB2 twice */
#define HSUB_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HSUB_UB2(RTYPE, in0, in1, out0, out1);                           \
    HSUB_UB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HSUB_UB4_UH(...) HSUB_UB4(v8u16, __VA_ARGS__)
#define HSUB_UB4_SH(...) HSUB_UB4(v8i16, __VA_ARGS__)
/* Description : SAD (Sum of Absolute Difference)
   Arguments   : Inputs  - in0, in1, ref0, ref1 (unsigned byte src & ref)
                 Outputs - sad_m                (halfword vector with sad)
                 Return Type - unsigned halfword
   Details     : Absolute difference of all the byte elements from 'in0' with
                 'ref0' is calculated and preserved in 'diff0'. From the 16
                 unsigned absolute diff values, even-odd pairs are added
                 together to generate 8 halfword results.
*/
#define SAD_UB2_UH(in0, in1, ref0, ref1)                        \
( {                                                             \
    v16u8 diff0_m, diff1_m;                                     \
    v8u16 sad_m = { 0 };                                        \
                                                                \
    diff0_m = __msa_asub_u_b((v16u8) in0, (v16u8) ref0);        \
    diff1_m = __msa_asub_u_b((v16u8) in1, (v16u8) ref1);        \
                                                                \
    sad_m += __msa_hadd_u_h((v16u8) diff0_m, (v16u8) diff0_m);  \
    sad_m += __msa_hadd_u_h((v16u8) diff1_m, (v16u8) diff1_m);  \
                                                                \
    sad_m;                                                      \
} )
/* Description : Insert specified word elements from input vectors to 1
                 destination vector
   Arguments   : Inputs  - in0, in1, in2, in3 (4 input vectors)
                 Outputs - out                (output vector)
                 Return Type - as per RTYPE
   Details     : GP word values in0..inN are inserted into successive word
                 lanes of 'out'; untouched lanes keep their previous contents
*/
#define INSERT_W2(RTYPE, in0, in1, out)                 \
{                                                       \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1);  \
}
#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)

#define INSERT_W4(RTYPE, in0, in1, in2, in3, out)       \
{                                                       \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 2, in2);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 3, in3);  \
}
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
#define INSERT_W4_SH(...) INSERT_W4(v8i16, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)
/* Description : Insert specified double word elements from input vectors to 1
                 destination vector
   Arguments   : Inputs  - in0, in1 (2 input vectors)
                 Outputs - out      (output vector)
                 Return Type - as per RTYPE
   Details     : GP doubleword values in0/in1 are inserted into the low/high
                 doubleword lanes of 'out' respectively
*/
#define INSERT_D2(RTYPE, in0, in1, out)                 \
{                                                       \
    out = (RTYPE) __msa_insert_d((v2i64) out, 0, in0);  \
    out = (RTYPE) __msa_insert_d((v2i64) out, 1, in1);  \
}
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__)
#define INSERT_D2_SD(...) INSERT_D2(v2i64, __VA_ARGS__)
/* Description : Interleave even byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' and even byte
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even byte elements of 'in2' and even byte
                 elements of 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_b((v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_ilvev_b((v16i8) in3, (v16i8) in2);  \
}
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)

/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' and even halfword
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even halfword elements of 'in2' and even halfword
                 elements of 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_h((v8i16) in1, (v8i16) in0);  \
    out1 = (RTYPE) __msa_ilvev_h((v8i16) in3, (v8i16) in2);  \
}
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)

/* Description : Interleave even word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even word elements of 'in0' and even word
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even word elements of 'in2' and even word
                 elements of 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0);  \
    out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2);  \
}
#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
#define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)

/* Description : Interleave even double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' and even double word
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even double word elements of 'in2' and even double word
                 elements of 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_d((v2i64) in1, (v2i64) in0);  \
    out1 = (RTYPE) __msa_ilvev_d((v2i64) in3, (v2i64) in2);  \
}
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)
/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of byte elements of in0 and left half of byte
                 elements of in1 are interleaved and copied to out0.
                 Left half of byte elements of in2 and left half of byte
                 elements of in3 are interleaved and copied to out1.
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in2, (v16i8) in3);  \
}
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)

/* 4-pair variant: applies ILVL_B2 twice */
#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVL_B4_UB(...) ILVL_B4(v16u8, __VA_ARGS__)
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)

/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of in0 and left half of halfword
                 elements of in1 are interleaved and copied to out0.
                 Left half of halfword elements of in2 and left half of halfword
                 elements of in3 are interleaved and copied to out1.
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in2, (v8i16) in3);  \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)

/* 4-pair variant: applies ILVL_H2 twice */
#define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVL_H4_SH(...) ILVL_H4(v8i16, __VA_ARGS__)
#define ILVL_H4_SW(...) ILVL_H4(v4i32, __VA_ARGS__)

/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of in0 and left half of word
                 elements of in1 are interleaved and copied to out0.
                 Left half of word elements of in2 and left half of word
                 elements of in3 are interleaved and copied to out1.
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in2, (v4i32) in3);  \
}
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
#define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__)
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of in0 and right half of byte
                 elements of in1 are interleaved and copied to out0.
                 Right half of byte elements of in2 and right half of byte
                 elements of in3 are interleaved and copied to out1.
                 Similar for other pairs
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3);  \
}
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)

/* 3-pair variant: ILVR_B2 plus one extra interleave */
#define ILVR_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_b((v16i8) in4, (v16i8) in5);              \
}
#define ILVR_B3_UB(...) ILVR_B3(v16u8, __VA_ARGS__)
#define ILVR_B3_SB(...) ILVR_B3(v16i8, __VA_ARGS__)
#define ILVR_B3_UH(...) ILVR_B3(v8u16, __VA_ARGS__)
#define ILVR_B3_SH(...) ILVR_B3(v8i16, __VA_ARGS__)

/* 4-pair variant: applies ILVR_B2 twice */
#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)

/* 8-pair variant: applies ILVR_B4 twice */
#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,    \
                in8, in9, in10, in11, in12, in13, in14, in15,     \
                out0, out1, out2, out3, out4, out5, out6, out7)   \
{                                                                 \
    ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
            out0, out1, out2, out3);                              \
    ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15,  \
            out4, out5, out6, out7);                              \
}
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
#define ILVR_B8_SW(...) ILVR_B8(v4i32, __VA_ARGS__)

/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements of in0 and right half of
                 halfword elements of in1 are interleaved and copied to out0.
                 Right half of halfword elements of in2 and right half of
                 halfword elements of in3 are interleaved and copied to out1.
                 Similar for other pairs
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvr_h((v8i16) in2, (v8i16) in3);  \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)

/* 3-pair variant: ILVR_H2 plus one extra interleave */
#define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_h((v8i16) in4, (v8i16) in5);              \
}
#define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__)

/* 4-pair variant: applies ILVR_H2 twice */
#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)

/* Interleave right half of word elements (same pattern as ILVR_H2) */
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvr_w((v4i32) in2, (v4i32) in3);  \
}
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__)
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)

/* 4-pair variant: applies ILVR_W2 twice */
#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__)
#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of double word elements of in0 and right half of
                 double word elements of in1 are interleaved and copied to out0.
                 Right half of double word elements of in2 and right half of
                 double word elements of in3 are interleaved and copied to out1.
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_ilvr_d((v2i64) in2, (v2i64) in3);  \
}
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

/* 3-pair variant: ILVR_D2 plus one extra interleave */
#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_d((v2i64) in4, (v2i64) in5);              \
}
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)

/* 4-pair variant: applies ILVR_D2 twice */
#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
/* Description : Interleave left half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of double word elements of in0 and left half of
                 double word elements of in1 are interleaved and copied to out0.
                 Left half of double word elements of in2 and left half of
                 double word elements of in3 are interleaved and copied to out1.
*/
#define ILVL_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) in2, (v2i64) in3);  \
}
#define ILVL_D2_UB(...) ILVL_D2(v16u8, __VA_ARGS__)
#define ILVL_D2_SB(...) ILVL_D2(v16i8, __VA_ARGS__)
#define ILVL_D2_SH(...) ILVL_D2(v8i16, __VA_ARGS__)
/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and stored to 'out0'
                 Left half of byte elements from 'in0' and 'in1' are
                 interleaved and stored to 'out1'
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1);  \
}
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
#define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)

/* Halfword variant of ILVRL_B2 */
#define ILVRL_H2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1);  \
}
#define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__)
#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)

/* Word variant of ILVRL_B2 */
#define ILVRL_W2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1);  \
}
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
/* Description : Maximum values between signed elements of vector and
                 5-bit signed immediate value are copied to the output vector
   Arguments   : Inputs  - in0, in1, in2, in3, max_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Maximum of signed halfword element values from 'in0' and
                 'max_val' are written to output vector 'in0'
*/
#define MAXI_SH2(RTYPE, in0, in1, max_val)               \
{                                                        \
    in0 = (RTYPE) __msa_maxi_s_h((v8i16) in0, max_val);  \
    in1 = (RTYPE) __msa_maxi_s_h((v8i16) in1, max_val);  \
}
#define MAXI_SH2_UH(...) MAXI_SH2(v8u16, __VA_ARGS__)
#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)

/* 4- and 8-vector variants of MAXI_SH2 */
#define MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val)  \
{                                                     \
    MAXI_SH2(RTYPE, in0, in1, max_val);               \
    MAXI_SH2(RTYPE, in2, in3, max_val);               \
}
#define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__)
#define MAXI_SH4_SH(...) MAXI_SH4(v8i16, __VA_ARGS__)

#define MAXI_SH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, max_val)  \
{                                                                         \
    MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val);                         \
    MAXI_SH4(RTYPE, in4, in5, in6, in7, max_val);                         \
}
#define MAXI_SH8_UH(...) MAXI_SH8(v8u16, __VA_ARGS__)
#define MAXI_SH8_SH(...) MAXI_SH8(v8i16, __VA_ARGS__)
/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val+1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' is saturated to the
                 value generated with (sat_val+1) bit range
                 Results are placed in the original vectors
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_u_h((v8u16) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_u_h((v8u16) in1, sat_val);  \
}
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
#define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)

/* 4- and 8-vector variants of SAT_UH2 */
#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_UH2(RTYPE, in0, in1, sat_val);               \
    SAT_UH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
#define SAT_UH4_SH(...) SAT_UH4(v8i16, __VA_ARGS__)

#define SAT_UH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, sat_val)  \
{                                                                        \
    SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val);                         \
    SAT_UH4(RTYPE, in4, in5, in6, in7, sat_val);                         \
}
#define SAT_UH8_UH(...) SAT_UH8(v8u16, __VA_ARGS__)
#define SAT_UH8_SH(...) SAT_UH8(v8i16, __VA_ARGS__)
/* Description : Saturate the halfword element values to the max
                 signed value of (sat_val+1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is saturated to the
                 value generated with (sat_val+1) bit range
                 Results are placed in the original vectors
                 (comment corrected: uses __msa_sat_s_h, i.e. signed saturation)
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_s_h((v8i16) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_s_h((v8i16) in1, sat_val);  \
}
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)

/* 3-vector variant: SAT_SH2 plus one extra saturation */
#define SAT_SH3(RTYPE, in0, in1, in2, sat_val)          \
{                                                       \
    SAT_SH2(RTYPE, in0, in1, sat_val);                  \
    in2 = (RTYPE) __msa_sat_s_h((v8i16) in2, sat_val);  \
}
#define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)

/* 4-vector variant: applies SAT_SH2 twice */
#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_SH2(RTYPE, in0, in1, sat_val);               \
    SAT_SH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
/* Description : Saturate the word element values to the max
                 signed value of (sat_val+1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each signed word element from 'in0' is saturated to the
                 value generated with (sat_val+1) bit range
                 Results are placed in the original vectors
                 (comment corrected: uses __msa_sat_s_w, i.e. signed saturation)
*/
#define SAT_SW2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_s_w((v4i32) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_s_w((v4i32) in1, sat_val);  \
}
#define SAT_SW2_SW(...) SAT_SW2(v4i32, __VA_ARGS__)

/* 4-vector variant: applies SAT_SW2 twice */
#define SAT_SW4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_SW2(RTYPE, in0, in1, sat_val);               \
    SAT_SW2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_SW4_SW(...) SAT_SW4(v4i32, __VA_ARGS__)
/* Description : Indexed halfword element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, idx0, idx1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'idx0' element value from 'in' vector is replicated to all
                 elements in 'out0' vector
                 Valid index range for halfword operation is 0-7
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1)  \
{                                                     \
    out0 = (RTYPE) __msa_splati_h((v8i16) in, idx0);  \
    out1 = (RTYPE) __msa_splati_h((v8i16) in, idx1);  \
}
#define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)

/* 3-output variant: SPLATI_H2 plus one extra splat */
#define SPLATI_H3(RTYPE, in, idx0, idx1, idx2,        \
                  out0, out1, out2)                   \
{                                                     \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);     \
    out2 = (RTYPE) __msa_splati_h((v8i16) in, idx2);  \
}
#define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__)
#define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__)

/* 4-output variant: applies SPLATI_H2 twice */
#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3,  \
                  out0, out1, out2, out3)             \
{                                                     \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);     \
    SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);     \
}
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
/* Description : Indexed word element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, stidx
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'stidx' element value from 'in' vector is replicated to all
                 elements in 'out0' vector
                 'stidx + 1' element value from 'in' vector is replicated to all
                 elements in 'out1' vector
                 Valid index range for word operation is 0-3
                 (comment corrected: these are word splats, not halfword)
*/
#define SPLATI_W2(RTYPE, in, stidx, out0, out1)            \
{                                                          \
    out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx);      \
    out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1));  \
}
#define SPLATI_W2_SH(...) SPLATI_W2(v8i16, __VA_ARGS__)
#define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)

/* Replicates all four word lanes of 'in' to out0..out3 */
#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3)  \
{                                                     \
    SPLATI_W2(RTYPE, in, 0, out0, out1);              \
    SPLATI_W2(RTYPE, in, 2, out2, out3);              \
}
#define SPLATI_W4_SH(...) SPLATI_W4(v8i16, __VA_ARGS__)
#define SPLATI_W4_SW(...) SPLATI_W4(v4i32, __VA_ARGS__)
/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of in0 are copied to the left half of
                 out0 & even byte elements of in1 are copied to the right
                 half of out0.
                 Even byte elements of in2 are copied to the left half of
                 out1 & even byte elements of in3 are copied to the right
                 half of out1.
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_pckev_b((v16i8) in2, (v16i8) in3);  \
}
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)

#define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                        \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_pckev_b((v16i8) in4, (v16i8) in5);              \
}
#define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
#define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)

#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)

/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of in0 are copied to the left half of
                 out0 & even halfword elements of in1 are copied to the right
                 half of out0.
                 Even halfword elements of in2 are copied to the left half of
                 out1 & even halfword elements of in3 are copied to the right
                 half of out1.
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_pckev_h((v8i16) in2, (v8i16) in3);  \
}
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
#define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)

/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double elements of in0 are copied to the left half of
                 out0 & even double elements of in1 are copied to the right
                 half of out0.
                 Even double elements of in2 are copied to the left half of
                 out1 & even double elements of in3 are copied to the right
                 half of out1.
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3);  \
}
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SB(...) PCKEV_D2(v16i8, __VA_ARGS__)
#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)

#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
/* Description : Pack odd double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Odd double word element of 'in0' and odd double word element
                 of 'in1' are copied to the left and right half of 'out0'.
                 Odd double word element of 'in2' and odd double word element
                 of 'in3' are copied to the left and right half of 'out1'.
*/
#define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckod_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_pckod_d((v2i64) in2, (v2i64) in3);  \
}
#define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__)
#define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__)
#define PCKOD_D2_SD(...) PCKOD_D2(v2i64, __VA_ARGS__)
/* Description : Each byte element is logically xor'ed with immediate 128
   Arguments   : Inputs  - in0, in1
                 Outputs - in0, in1 (in-place)
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from input vector 'in0' is
                 logically xor'ed with 128 and result is in-place stored in
                 'in0' vector
                 Each unsigned byte element from input vector 'in1' is
                 logically xor'ed with 128 and result is in-place stored in
                 'in1' vector
                 Similar for other pairs
*/
#define XORI_B2_128(RTYPE, in0, in1)               \
{                                                  \
    in0 = (RTYPE) __msa_xori_b((v16u8) in0, 128);  \
    in1 = (RTYPE) __msa_xori_b((v16u8) in1, 128);  \
}
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
#define XORI_B2_128_SH(...) XORI_B2_128(v8i16, __VA_ARGS__)

#define XORI_B3_128(RTYPE, in0, in1, in2)          \
{                                                  \
    XORI_B2_128(RTYPE, in0, in1);                  \
    in2 = (RTYPE) __msa_xori_b((v16u8) in2, 128);  \
}
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

#define XORI_B4_128(RTYPE, in0, in1, in2, in3)  \
{                                               \
    XORI_B2_128(RTYPE, in0, in1);               \
    XORI_B2_128(RTYPE, in2, in3);               \
}
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
#define XORI_B4_128_SH(...) XORI_B4_128(v8i16, __VA_ARGS__)

#define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4)  \
{                                                    \
    XORI_B3_128(RTYPE, in0, in1, in2);               \
    XORI_B2_128(RTYPE, in3, in4);                    \
}
#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)

#define XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5)  \
{                                                         \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);               \
    XORI_B2_128(RTYPE, in4, in5);                         \
}
#define XORI_B6_128_SB(...) XORI_B6_128(v16i8, __VA_ARGS__)

#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6)  \
{                                                              \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                    \
    XORI_B3_128(RTYPE, in4, in5, in6);                         \
}
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)

#define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7)  \
{                                                                   \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                         \
    XORI_B4_128(RTYPE, in4, in5, in6, in7);                         \
}
#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
#define XORI_B8_128_UB(...) XORI_B8_128(v16u8, __VA_ARGS__)
/* Description : Addition of signed halfword elements and signed saturation
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'. The result is then signed saturated
                 between -32768 to +32767 (as per halfword data type)
                 Similar for other pairs
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1)       \
{                                                             \
    out0 = (RTYPE) __msa_adds_s_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_adds_s_h((v8i16) in2, (v8i16) in3);  \
}
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is left shifted by 'shift' and
                 result is in place written to 'in0'
                 Similar for other pairs
*/
#define SLLI_2V(in0, in1, shift)  \
{                                 \
    in0 = in0 << shift;           \
    in1 = in1 << shift;           \
}
#define SLLI_4V(in0, in1, in2, in3, shift)  \
{                                           \
    in0 = in0 << shift;                     \
    in1 = in1 << shift;                     \
    in2 = in2 << shift;                     \
    in3 = in3 << shift;                     \
}

/* Description : Arithmetic shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is right shifted by 'shift' and
                 result is in place written to 'in0'
                 Here, 'shift' is GP variable passed in
                 Similar for other pairs
*/
#define SRA_4V(in0, in1, in2, in3, shift)  \
{                                          \
    in0 = in0 >> shift;                    \
    in1 = in1 >> shift;                    \
    in2 = in2 >> shift;                    \
    in3 = in3 >> shift;                    \
}

/* Description : Shift right logical all halfword elements of vector
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right logical by
                 number of bits respective element holds in vector 'shift' and
                 result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
*/
#define SRL_H4(RTYPE, in0, in1, in2, in3, shift)            \
{                                                           \
    in0 = (RTYPE) __msa_srl_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srl_h((v8i16) in1, (v8i16) shift);  \
    in2 = (RTYPE) __msa_srl_h((v8i16) in2, (v8i16) shift);  \
    in3 = (RTYPE) __msa_srl_h((v8i16) in3, (v8i16) shift);  \
}
#define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)

/* Shift right logical rounded: the last discarded bit is added to the
   shifted value for rounding. */
#define SRLR_H4(RTYPE, in0, in1, in2, in3, shift)            \
{                                                            \
    in0 = (RTYPE) __msa_srlr_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srlr_h((v8i16) in1, (v8i16) shift);  \
    in2 = (RTYPE) __msa_srlr_h((v8i16) in2, (v8i16) shift);  \
    in3 = (RTYPE) __msa_srlr_h((v8i16) in3, (v8i16) shift);  \
}
#define SRLR_H4_UH(...) SRLR_H4(v8u16, __VA_ARGS__)
#define SRLR_H4_SH(...) SRLR_H4(v8i16, __VA_ARGS__)

#define SRLR_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, shift)  \
{                                                                      \
    SRLR_H4(RTYPE, in0, in1, in2, in3, shift);                         \
    SRLR_H4(RTYPE, in4, in5, in6, in7, shift);                         \
}
#define SRLR_H8_UH(...) SRLR_H8(v8u16, __VA_ARGS__)
#define SRLR_H8_SH(...) SRLR_H8(v8i16, __VA_ARGS__)
/* Description : Shift right arithmetic rounded halfwords
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1, (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 number of bits respective element holds in vector 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
*/
#define SRAR_H2(RTYPE, in0, in1, shift)                      \
{                                                            \
    in0 = (RTYPE) __msa_srar_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srar_h((v8i16) in1, (v8i16) shift);  \
}
#define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
#define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)

#define SRAR_H3(RTYPE, in0, in1, in2, shift)                 \
{                                                            \
    SRAR_H2(RTYPE, in0, in1, shift)                          \
    in2 = (RTYPE) __msa_srar_h((v8i16) in2, (v8i16) shift);  \
}
#define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)

#define SRAR_H4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_H2(RTYPE, in0, in1, shift)                \
    SRAR_H2(RTYPE, in2, in3, shift)                \
}
#define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
#define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)

/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1, (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 number of bits respective element holds in vector 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
*/
#define SRAR_W2(RTYPE, in0, in1, shift)                      \
{                                                            \
    in0 = (RTYPE) __msa_srar_w((v4i32) in0, (v4i32) shift);  \
    in1 = (RTYPE) __msa_srar_w((v4i32) in1, (v4i32) shift);  \
}
#define SRAR_W2_SW(...) SRAR_W2(v4i32, __VA_ARGS__)

#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_W2(RTYPE, in0, in1, shift)                \
    SRAR_W2(RTYPE, in2, in3, shift)                \
}
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
/* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 value in 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Similar for other pairs
*/
#define SRARI_H2(RTYPE, in0, in1, shift)            \
{                                                   \
    in0 = (RTYPE) __msa_srari_h((v8i16) in0, shift);  \
    in1 = (RTYPE) __msa_srari_h((v8i16) in1, shift);  \
}
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)

#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                   \
    SRARI_H2(RTYPE, in0, in1, shift);               \
    SRARI_H2(RTYPE, in2, in3, shift);               \
}
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)

/* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 value in 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Similar for other pairs
*/
#define SRARI_W2(RTYPE, in0, in1, shift)              \
{                                                     \
    in0 = (RTYPE) __msa_srari_w((v4i32) in0, shift);  \
    in1 = (RTYPE) __msa_srari_w((v4i32) in1, shift);  \
}
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                   \
    SRARI_W2(RTYPE, in0, in1, shift);               \
    SRARI_W2(RTYPE, in2, in3, shift);               \
}
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 'in0' is multiplied with elements from 'in1'
                 and result is written to 'out0'
                 Similar for other pairs
*/
#define MUL2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 * in1;                         \
    out1 = in2 * in3;                         \
}
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    MUL2(in0, in1, in2, in3, out0, out1);                                     \
    MUL2(in4, in5, in6, in7, out2, out3);                                     \
}

/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 2 pairs vectors is added and 2 results are
                 produced
*/
#define ADD2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 + in1;                         \
    out1 = in2 + in3;                         \
}
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    ADD2(in0, in1, in2, in3, out0, out1);                                     \
    ADD2(in4, in5, in6, in7, out2, out3);                                     \
}

/* Description : Subtraction of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 2 pairs vectors is subtracted and 2 results
                 are produced
*/
#define SUB2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 - in1;                         \
    out1 = in2 - in3;                         \
}
/* Delegates to SUB2 for consistency with ADD4/MUL4. */
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    SUB2(in0, in1, in2, in3, out0, out1);                                     \
    SUB2(in4, in5, in6, in7, out2, out3);                                     \
}
/* Description : Sign extend byte elements from right half of the vector
   Arguments   : Input  - in    (byte vector)
                 Output - out   (sign extended halfword vector)
                 Return Type - signed halfword
   Details     : Sign bit of byte elements from input vector 'in' is
                 extracted and interleaved with same vector 'in' to generate
                 8 halfword elements keeping sign intact
*/
#define UNPCK_R_SB_SH(in, out)                        \
{                                                     \
    v16i8 sign_m;                                     \
                                                      \
    sign_m = __msa_clti_s_b((v16i8) in, 0);           \
    out = (v8i16) __msa_ilvr_b(sign_m, (v16i8) in);   \
}

/* Description : Sign extend halfword elements from right half of the vector
   Arguments   : Inputs  - in    (input halfword vector)
                 Outputs - out   (sign extended word vectors)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved with same vector 'in0' to generate
                 4 word elements keeping sign intact
*/
#define UNPCK_R_SH_SW(in, out)                       \
{                                                    \
    v8i16 sign_m;                                    \
                                                     \
    sign_m = __msa_clti_s_h((v8i16) in, 0);          \
    out = (v4i32) __msa_ilvr_h(sign_m, (v8i16) in);  \
}

/* Description : Sign extend byte elements from input vector and return
                 halfword results in pair of vectors
   Arguments   : Inputs  - in           (1 input byte vector)
                 Outputs - out0, out1   (sign extended 2 halfword vectors)
                 Return Type - signed halfword
   Details     : Sign bit of byte elements from input vector 'in' is
                 extracted and interleaved right with same vector 'in0' to
                 generate 8 signed halfword elements in 'out0'
                 Then interleaved left with same vector 'in0' to
                 generate 8 signed halfword elements in 'out1'
*/
#define UNPCK_SB_SH(in, out0, out1)          \
{                                            \
    v16i8 tmp_m;                             \
                                             \
    tmp_m = __msa_clti_s_b((v16i8) in, 0);   \
    ILVRL_B2_SH(tmp_m, in, out0, out1);      \
}

/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Inputs  - in           (1 input unsigned byte vector)
                 Outputs - out0, out1   (unsigned 2 halfword vectors)
                 Return Type - signed halfword
   Details     : Zero extended right half of vector is returned in 'out0'
                 Zero extended left half of vector is returned in 'out1'
*/
#define UNPCK_UB_SH(in, out0, out1)       \
{                                         \
    v16i8 zero_m = { 0 };                 \
                                          \
    ILVRL_B2_SH(zero_m, in, out0, out1);  \
}

/* Description : Sign extend halfword elements from input vector and return
                 result in pair of vectors
   Arguments   : Inputs  - in           (1 input halfword vector)
                 Outputs - out0, out1   (sign extended 2 word vectors)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved right with same vector 'in0' to
                 generate 4 signed word elements in 'out0'
                 Then interleaved left with same vector 'in0' to
                 generate 4 signed word elements in 'out1'
*/
#define UNPCK_SH_SW(in, out0, out1)         \
{                                           \
    v8i16 tmp_m;                            \
                                            \
    tmp_m = __msa_clti_s_h((v8i16) in, 0);  \
    ILVRL_H2_SW(tmp_m, in, out0, out1);     \
}
/* Description : Swap two variables
   Arguments   : Inputs  - in0, in1
                 Outputs - in0, in1 (in-place)
   Details     : Swapping of two input variables using xor
                 NOTE: if 'in0' and 'in1' designate the same object, the XOR
                 trick zeroes both; callers must pass distinct variables.
*/
#define SWAP(in0, in1)  \
{                       \
    in0 = in0 ^ in1;    \
    in1 = in0 ^ in1;    \
    in0 = in0 ^ in1;    \
}
/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : Butterfly operation (sum of mirrored pairs in the first half
                 of the outputs, difference in the second half)
*/
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                \
    out0 = in0 + in3;                                            \
    out1 = in1 + in2;                                            \
                                                                 \
    out2 = in1 - in2;                                            \
    out3 = in0 - in3;                                            \
}

/* Description : Butterfly of 8 input vectors
   Arguments   : Inputs  - in0 ...  in7
                 Outputs - out0 .. out7
   Details     : Butterfly operation
*/
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,           \
                    out0, out1, out2, out3, out4, out5, out6, out7)   \
{                                                                     \
    out0 = in0 + in7;                                                 \
    out1 = in1 + in6;                                                 \
    out2 = in2 + in5;                                                 \
    out3 = in3 + in4;                                                 \
                                                                      \
    out4 = in3 - in4;                                                 \
    out5 = in2 - in5;                                                 \
    out6 = in1 - in6;                                                 \
    out7 = in0 - in7;                                                 \
}

/* Description : Butterfly of 16 input vectors
   Arguments   : Inputs  - in0 ...  in15
                 Outputs - out0 .. out15
   Details     : Butterfly operation
*/
#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,                \
                     in8, in9,  in10, in11, in12, in13, in14, in15,         \
                     out0, out1, out2, out3, out4, out5, out6, out7,        \
                     out8, out9, out10, out11, out12, out13, out14, out15)  \
{                                                                           \
    out0 = in0 + in15;                                                      \
    out1 = in1 + in14;                                                      \
    out2 = in2 + in13;                                                      \
    out3 = in3 + in12;                                                      \
    out4 = in4 + in11;                                                      \
    out5 = in5 + in10;                                                      \
    out6 = in6 + in9;                                                       \
    out7 = in7 + in8;                                                       \
                                                                            \
    out8 = in7 - in8;                                                       \
    out9 = in6 - in9;                                                       \
    out10 = in5 - in10;                                                     \
    out11 = in4 - in11;                                                     \
    out12 = in3 - in12;                                                     \
    out13 = in2 - in13;                                                     \
    out14 = in1 - in14;                                                     \
    out15 = in0 - in15;                                                     \
}
/* Description : Transposes input 4x4 byte block
   Arguments   : Inputs  - in0, in1, in2, in3      (input 4x4 byte block)
                 Outputs - out0, out1, out2, out3  (output 4x4 byte block)
                 Return Type - unsigned byte
   Details     :
*/
#define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v16i8 zero_m = { 0 };                                               \
    v16i8 s0_m, s1_m, s2_m, s3_m;                                       \
                                                                        \
    ILVR_D2_SB(in1, in0, in3, in2, s0_m, s1_m);                         \
    ILVRL_B2_SB(s1_m, s0_m, s2_m, s3_m);                                \
                                                                        \
    out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m);                            \
    out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4);               \
    out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4);               \
    out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4);               \
}

/* Description : Transposes input 8x4 byte block into 4x8
   Arguments   : Inputs  - in0, in1, in2, in3      (input 8x4 byte block)
                 Outputs - out0, out1, out2, out3  (output 4x8 byte block)
                 Return Type - as per RTYPE
   Details     :
*/
#define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                        out0, out1, out2, out3)                         \
{                                                                       \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
                                                                        \
    ILVEV_W2_SB(in0, in4, in1, in5, tmp0_m, tmp1_m);                    \
    tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVEV_W2_SB(in2, in6, in3, in7, tmp0_m, tmp1_m);                    \
                                                                        \
    tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVRL_H2_SB(tmp3_m, tmp2_m, tmp0_m, tmp1_m);                        \
                                                                        \
    ILVRL_W2(RTYPE, tmp1_m, tmp0_m, out0, out2);                        \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0);            \
    out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}
#define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x4_UB_UH(...) TRANSPOSE8x4_UB(v8u16, __VA_ARGS__)

/* Description : Transposes input 8x8 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                           (input 8x8 byte block)
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                           (output 8x8 byte block)
                 Return Type - as per RTYPE
   Details     :
*/
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                        out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                        \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                \
    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                \
    v16i8 zeros = { 0 };                                                 \
                                                                         \
    ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5,                   \
               tmp0_m, tmp1_m, tmp2_m, tmp3_m);                          \
    ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                         \
    ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                         \
    ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                         \
    ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                         \
    SLDI_B4(RTYPE, zeros, out0, zeros, out2, zeros, out4, zeros, out6,   \
            8, out1, out3, out5, out7);                                  \
}
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__)

/* Description : Transposes 16x4 block into 4x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3
                 Return Type - unsigned byte
   Details     :
*/
#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,        \
                            in8, in9, in10, in11, in12, in13, in14, in15,  \
                            out0, out1, out2, out3)                        \
{                                                                          \
    v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
                                                                           \
    ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m);                      \
    out1 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m);                          \
                                                                           \
    ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m);                      \
    out3 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m);                          \
                                                                           \
    ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m);                     \
                                                                           \
    tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
    ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m);                     \
                                                                           \
    tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
    ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m);               \
    out0 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    out2 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
                                                                           \
    tmp0_m = (v2i64) __msa_ilvod_b((v16i8) out3, (v16i8) out1);            \
    tmp1_m = (v2i64) __msa_ilvod_b((v16i8) tmp3_m, (v16i8) tmp2_m);        \
    out1 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    out3 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
}

/* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
   Details     :
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,          \
                            in8, in9, in10, in11, in12, in13, in14, in15,    \
                            out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                            \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                    \
                                                                             \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                             \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                           \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                           \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                           \
                                                                             \
    tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7);              \
    tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7);              \
    tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5);              \
    tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5);              \
    out5 = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3);                \
    tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3);              \
    out7 = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1);                \
    tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1);              \
                                                                             \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                 \
    out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5);              \
    out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);             \
    out1 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m);          \
    out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
}
/* Description : Transposes 4x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     :
*/
#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v8i16 s0_m, s1_m;                                                   \
                                                                        \
    ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                         \
    ILVRL_W2_SH(s1_m, s0_m, out0, out2);                                \
    out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0);            \
    out3 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}

/* Description : Transposes 8x8 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     :
*/
#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                       out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                       \
    v8i16 s0_m, s1_m;                                                   \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
    v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                               \
                                                                        \
    ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m);                            \
    ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m);                            \
    ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m);                            \
    ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m);                            \
    PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m,     \
             tmp3_m, tmp7_m, out0, out2, out4, out6);                   \
    out1 = (RTYPE) __msa_pckod_d((v2i64) tmp0_m, (v2i64) tmp4_m);       \
    out3 = (RTYPE) __msa_pckod_d((v2i64) tmp1_m, (v2i64) tmp5_m);       \
    out5 = (RTYPE) __msa_pckod_d((v2i64) tmp2_m, (v2i64) tmp6_m);       \
    out7 = (RTYPE) __msa_pckod_d((v2i64) tmp3_m, (v2i64) tmp7_m);       \
}
#define TRANSPOSE8x8_UH_UH(...) TRANSPOSE8x8_H(v8u16, __VA_ARGS__)
#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)

/* Description : Transposes 4x4 block with word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed word
   Details     :
*/
#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v4i32 s0_m, s1_m, s2_m, s3_m;                                       \
                                                                        \
    ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                  \
    ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                  \
                                                                        \
    out0 = (v4i32) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m);            \
    out1 = (v4i32) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m);            \
    out2 = (v4i32) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m);            \
    out3 = (v4i32) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m);            \
}
/* Description : Average byte elements from pairs of vectors and store an
                 8x4 byte block in destination memory
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Byte elements of each input pair (in0/in1, in2/in3,
                 in4/in5, in6/in7) are averaged, truncating: (a + b) / 2.
                 The low 8 bytes of each of the 4 averaged vectors are
                 stored to destination memory as an 8x4 byte block.
*/
#define AVE_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    uint64_t out0_m, out1_m, out2_m, out3_m;                                \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
                                                                            \
    tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1);                       \
    tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3);                       \
    tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5);                       \
    tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7);                       \
                                                                            \
    out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0);                             \
    out1_m = __msa_copy_u_d((v2i64) tmp1_m, 0);                             \
    out2_m = __msa_copy_u_d((v2i64) tmp2_m, 0);                             \
    out3_m = __msa_copy_u_d((v2i64) tmp3_m, 0);                             \
    SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride);                      \
}
/* Description : Average byte elements from pairs of vectors and store a
                 16x4 byte block in destination memory
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Byte elements of each input pair (in0/in1, in2/in3,
                 in4/in5, in6/in7) are averaged, truncating: (a + b) / 2.
                 The 4 full averaged vectors are stored to destination
                 memory as a 16x4 byte block.
*/
#define AVE_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                            \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
                                                                             \
    tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1);                        \
    tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3);                        \
    tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5);                        \
    tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7);                        \
                                                                             \
    ST_UB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst, stride);                    \
}
/* Description : Average rounded byte elements from pairs of vectors and
                 store an 8x4 byte block in destination memory
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Byte elements of each input pair (in0/in1, in2/in3,
                 in4/in5, in6/in7) are averaged with rounding:
                 (a + b + 1) / 2. The low 8 bytes of each of the 4
                 averaged vectors are stored as an 8x4 byte block.
*/
#define AVER_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                            \
    uint64_t out0_m, out1_m, out2_m, out3_m;                                 \
    v16u8 tp0_m, tp1_m, tp2_m, tp3_m;                                        \
                                                                             \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,                      \
                tp0_m, tp1_m, tp2_m, tp3_m);                                 \
                                                                             \
    out0_m = __msa_copy_u_d((v2i64) tp0_m, 0);                               \
    out1_m = __msa_copy_u_d((v2i64) tp1_m, 0);                               \
    out2_m = __msa_copy_u_d((v2i64) tp2_m, 0);                               \
    out3_m = __msa_copy_u_d((v2i64) tp3_m, 0);                               \
    SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride);                       \
}
/* Description : Average rounded byte elements from pairs of vectors and
                 store a 16x4 byte block in destination memory
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Byte elements of each input pair (in0/in1, in2/in3,
                 in4/in5, in6/in7) are averaged with rounding:
                 (a + b + 1) / 2. The 4 full averaged vectors are stored
                 to destination memory as a 16x4 byte block.
*/
#define AVER_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                             \
    v16u8 t0_m, t1_m, t2_m, t3_m;                                             \
                                                                              \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,                       \
                t0_m, t1_m, t2_m, t3_m);                                      \
    ST_UB4(t0_m, t1_m, t2_m, t3_m, pdst, stride);                             \
}
/* Description : Average rounded byte elements from pairs of vectors,
                 average rounded with destination, and store an 8x4 byte
                 block in destination memory
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Each input pair is rounded-averaged ((a + b + 1) / 2)
                 into tmp0..tmp3; those are then rounded-averaged with the
                 4 rows loaded from pdst, and the low 8 bytes of each
                 result are stored back as an 8x4 byte block.
*/
#define AVER_DST_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                          pdst, stride)                            \
{                                                                  \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                          \
    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                          \
                                                                   \
    LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);          \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,            \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                   \
    AVER_ST8x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
                  dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
}
/* Description : Average rounded byte elements from pairs of vectors,
                 average rounded with destination, and store a 16x4 byte
                 block in destination memory
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Each input pair is rounded-averaged ((a + b + 1) / 2)
                 into tmp0..tmp3; those are then rounded-averaged with the
                 4 rows loaded from pdst, and the 4 full result vectors
                 are stored back as a 16x4 byte block.
*/
#define AVER_DST_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                           pdst, stride)                            \
{                                                                   \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                           \
    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                           \
                                                                    \
    LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);           \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,             \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                    \
    AVER_ST16x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
                   dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
}
/* Description : Add block 4x4
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : The least significant 4 halfwords of each input vector
                 are added to the zero-extended destination bytes, clipped
                 to the 0..255 range, packed back to bytes and stored as a
                 4x4 block.
*/
#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride)         \
{                                                                 \
    uint32_t src0_m, src1_m, src2_m, src3_m;                      \
    uint32_t out0_m, out1_m, out2_m, out3_m;                      \
    v8i16 inp0_m, inp1_m, res0_m, res1_m;                         \
    v16i8 dst0_m = { 0 };                                         \
    v16i8 dst1_m = { 0 };                                         \
    v16i8 zero_m = { 0 };                                         \
                                                                  \
    ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m);               \
    LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m);            \
    INSERT_W2_SB(src0_m, src1_m, dst0_m);                         \
    INSERT_W2_SB(src2_m, src3_m, dst1_m);                         \
    /* zero-extend destination bytes to halfwords for the add */  \
    ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);   \
    ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);         \
    CLIP_SH2_0_255(res0_m, res1_m);                               \
    PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);  \
                                                                  \
    out0_m = __msa_copy_u_w((v4i32) dst0_m, 0);                   \
    out1_m = __msa_copy_u_w((v4i32) dst0_m, 1);                   \
    out2_m = __msa_copy_u_w((v4i32) dst1_m, 0);                   \
    out3_m = __msa_copy_u_w((v4i32) dst1_m, 1);                   \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);            \
}
/* Description : Dot product and addition of 3 signed halfword input vectors
   Arguments   : Inputs - in0, in1, in2, coeff0, coeff1, coeff2
                 Output - out0_m
                 Return Type - signed halfword
   Details     : Accumulates three signed byte dot products:
                 out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)
*/
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)          \
( {                                                                  \
    v8i16 out0_m;                                                    \
                                                                     \
    out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0);            \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1);   \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in2, (v16i8) coeff2);   \
                                                                     \
    out0_m;                                                          \
} )
/* Description : Pack even elements of two input vectors & xor with 128
   Arguments   : Inputs - in0, in1
                 Output - out_m
                 Return Type - unsigned byte
   Details     : Even signed-byte elements of 'in0' and 'in1' are packed
                 into one vector; xoring with 128 flips the sign bit,
                 shifting the range from signed to unsigned byte.
*/
#define PCKEV_XORI128_UB(in0, in1)                            \
( {                                                           \
    v16u8 out_m;                                              \
                                                              \
    out_m = (v16u8) __msa_pckev_b((v16i8) in1, (v16i8) in0);  \
    out_m = (v16u8) __msa_xori_b((v16u8) out_m, 128);         \
    out_m;                                                    \
} )
/* Description : Converts inputs to unsigned bytes, interleaves, averages
                 with dst0/dst1 and stores the result as an 8x4 unsigned
                 byte block
   Arguments   : Inputs - in0, in1, in2, in3, dst0, dst1, pdst, stride
*/
#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3,            \
                                dst0, dst1, pdst, stride)      \
{                                                              \
    v16u8 tmp0_m, tmp1_m;                                      \
    uint8_t *pdst_m = (uint8_t *) (pdst);                      \
                                                               \
    tmp0_m = PCKEV_XORI128_UB(in0, in1);                       \
    tmp1_m = PCKEV_XORI128_UB(in2, in3);                       \
    AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m);   \
    ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, pdst_m, stride);         \
}
/* Description : Pack even byte elements, extract words at index 0 & 2
                 from the pair of results and store 4 words to destination
                 memory as per stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
*/
#define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride)  \
{                                                         \
    uint32_t out0_m, out1_m, out2_m, out3_m;              \
    v16i8 tmp0_m, tmp1_m;                                 \
                                                          \
    PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m);      \
                                                          \
    out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0);           \
    out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2);           \
    out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0);           \
    out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2);           \
                                                          \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);    \
}
/* Description : Pack even byte elements and store the byte vector to
                 destination memory
   Arguments   : Inputs - in0, in1, pdst
*/
#define PCKEV_ST_SB(in0, in1, pdst)                      \
{                                                        \
    v16i8 tmp_m;                                         \
                                                         \
    tmp_m = __msa_pckev_b((v16i8) in1, (v16i8) in0);     \
    ST_SB(tmp_m, (pdst));                                \
}
/* Description : Horizontal 2-tap filter kernel code
   Arguments   : Inputs - in0, in1, mask, coeff, shift
   Details     : Gathers filter input bytes via 'mask', takes an unsigned
                 byte dot product with 'coeff', then rounds (srari) and
                 saturates the halfword results by 'shift' bits.
*/
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)             \
( {                                                                  \
    v16i8 tmp0_m;                                                    \
    v8u16 tmp1_m;                                                    \
                                                                     \
    tmp0_m = __msa_vshf_b((v16i8) mask, (v16i8) in1, (v16i8) in0);   \
    tmp1_m = __msa_dotp_u_h((v16u8) tmp0_m, (v16u8) coeff);          \
    tmp1_m = (v8u16) __msa_srari_h((v8i16) tmp1_m, shift);           \
    tmp1_m = __msa_sat_u_h(tmp1_m, shift);                           \
                                                                     \
    tmp1_m;                                                          \
} )
#endif  /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */