/* auto-generated on 2025-04-14 21:04:55 -0400. Do not edit! */
/* begin file src/simdutf.cpp */
#include "simdutf.h"

#if SIMDUTF_FEATURE_BASE64
// We include base64_tables once.
/* begin file src/tables/base64_tables.h */
#ifndef SIMDUTF_BASE64_TABLES_H
#define SIMDUTF_BASE64_TABLES_H
#include <array>
#include <cstdint>

namespace simdutf {
namespace {
namespace tables {
namespace base64 {
namespace base64_default {

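// The e0/e1/e2 tables support a table-driven scalar encoder. From their
// layout, e0[i] holds alphabet[i >> 2] (each character repeated four times)
// while e1 and e2 both hold alphabet[i & 0x3f] (the 64-character alphabet
// repeated four times). An illustrative sketch, assuming the usual
// three-table encoding scheme (not part of the generated source): a byte
// triple (b0, b1, b2) encodes as
//   out[0] = e0[b0];
//   out[1] = e1[((b0 & 0x03) << 4) | (b1 >> 4)];
//   out[2] = e1[((b1 & 0x0f) << 2) | (b2 >> 6)];
//   out[3] = e2[b2];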
const char e0[256] = {
    'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'D', 'D',
    'D', 'E', 'E', 'E', 'E', 'F', 'F', 'F', 'F', 'G', 'G', 'G', 'G', 'H', 'H',
    'H', 'H', 'I', 'I', 'I', 'I', 'J', 'J', 'J', 'J', 'K', 'K', 'K', 'K', 'L',
    'L', 'L', 'L', 'M', 'M', 'M', 'M', 'N', 'N', 'N', 'N', 'O', 'O', 'O', 'O',
    'P', 'P', 'P', 'P', 'Q', 'Q', 'Q', 'Q', 'R', 'R', 'R', 'R', 'S', 'S', 'S',
    'S', 'T', 'T', 'T', 'T', 'U', 'U', 'U', 'U', 'V', 'V', 'V', 'V', 'W', 'W',
    'W', 'W', 'X', 'X', 'X', 'X', 'Y', 'Y', 'Y', 'Y', 'Z', 'Z', 'Z', 'Z', 'a',
    'a', 'a', 'a', 'b', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'd', 'd', 'd', 'd',
    'e', 'e', 'e', 'e', 'f', 'f', 'f', 'f', 'g', 'g', 'g', 'g', 'h', 'h', 'h',
    'h', 'i', 'i', 'i', 'i', 'j', 'j', 'j', 'j', 'k', 'k', 'k', 'k', 'l', 'l',
    'l', 'l', 'm', 'm', 'm', 'm', 'n', 'n', 'n', 'n', 'o', 'o', 'o', 'o', 'p',
    'p', 'p', 'p', 'q', 'q', 'q', 'q', 'r', 'r', 'r', 'r', 's', 's', 's', 's',
    't', 't', 't', 't', 'u', 'u', 'u', 'u', 'v', 'v', 'v', 'v', 'w', 'w', 'w',
    'w', 'x', 'x', 'x', 'x', 'y', 'y', 'y', 'y', 'z', 'z', 'z', 'z', '0', '0',
    '0', '0', '1', '1', '1', '1', '2', '2', '2', '2', '3', '3', '3', '3', '4',
    '4', '4', '4', '5', '5', '5', '5', '6', '6', '6', '6', '7', '7', '7', '7',
    '8', '8', '8', '8', '9', '9', '9', '9', '+', '+', '+', '+', '/', '/', '/',
    '/'};

const char e1[256] = {
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
    'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd',
    'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
    't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7',
    '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
    'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
    'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
    'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
    'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C',
    'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
    'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+',
    '/'};

const char e2[256] = {
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
    'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd',
    'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
    't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7',
    '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
    'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
    'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
    'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
    'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C',
    'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
    'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+',
    '/'};

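// d0..d3 follow the classic four-table base64 decoder: dK[c] is the 6-bit
// value of character c, pre-shifted into its position within a little-endian
// 24-bit triple, and 0x01ffffff marks an invalid character (bit 24 serves as
// the error flag). An illustrative sketch, assuming that scheme (not part of
// the generated source):
//   uint32_t triple = d0[src[0]] | d1[src[1]] | d2[src[2]] | d3[src[3]];
//   if (triple >= 0x01000000) { /* at least one invalid character */ }
//   // on a little-endian target, the low three bytes are the decoded bytes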
const uint32_t d0[256] = {
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x000000f8, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x000000fc,
    0x000000d0, 0x000000d4, 0x000000d8, 0x000000dc, 0x000000e0, 0x000000e4,
    0x000000e8, 0x000000ec, 0x000000f0, 0x000000f4, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
    0x00000004, 0x00000008, 0x0000000c, 0x00000010, 0x00000014, 0x00000018,
    0x0000001c, 0x00000020, 0x00000024, 0x00000028, 0x0000002c, 0x00000030,
    0x00000034, 0x00000038, 0x0000003c, 0x00000040, 0x00000044, 0x00000048,
    0x0000004c, 0x00000050, 0x00000054, 0x00000058, 0x0000005c, 0x00000060,
    0x00000064, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x00000068, 0x0000006c, 0x00000070, 0x00000074, 0x00000078,
    0x0000007c, 0x00000080, 0x00000084, 0x00000088, 0x0000008c, 0x00000090,
    0x00000094, 0x00000098, 0x0000009c, 0x000000a0, 0x000000a4, 0x000000a8,
    0x000000ac, 0x000000b0, 0x000000b4, 0x000000b8, 0x000000bc, 0x000000c0,
    0x000000c4, 0x000000c8, 0x000000cc, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};

const uint32_t d1[256] = {
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x0000e003, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x0000f003,
    0x00004003, 0x00005003, 0x00006003, 0x00007003, 0x00008003, 0x00009003,
    0x0000a003, 0x0000b003, 0x0000c003, 0x0000d003, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
    0x00001000, 0x00002000, 0x00003000, 0x00004000, 0x00005000, 0x00006000,
    0x00007000, 0x00008000, 0x00009000, 0x0000a000, 0x0000b000, 0x0000c000,
    0x0000d000, 0x0000e000, 0x0000f000, 0x00000001, 0x00001001, 0x00002001,
    0x00003001, 0x00004001, 0x00005001, 0x00006001, 0x00007001, 0x00008001,
    0x00009001, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x0000a001, 0x0000b001, 0x0000c001, 0x0000d001, 0x0000e001,
    0x0000f001, 0x00000002, 0x00001002, 0x00002002, 0x00003002, 0x00004002,
    0x00005002, 0x00006002, 0x00007002, 0x00008002, 0x00009002, 0x0000a002,
    0x0000b002, 0x0000c002, 0x0000d002, 0x0000e002, 0x0000f002, 0x00000003,
    0x00001003, 0x00002003, 0x00003003, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};

const uint32_t d2[256] = {
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x00800f00, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00c00f00,
    0x00000d00, 0x00400d00, 0x00800d00, 0x00c00d00, 0x00000e00, 0x00400e00,
    0x00800e00, 0x00c00e00, 0x00000f00, 0x00400f00, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
    0x00400000, 0x00800000, 0x00c00000, 0x00000100, 0x00400100, 0x00800100,
    0x00c00100, 0x00000200, 0x00400200, 0x00800200, 0x00c00200, 0x00000300,
    0x00400300, 0x00800300, 0x00c00300, 0x00000400, 0x00400400, 0x00800400,
    0x00c00400, 0x00000500, 0x00400500, 0x00800500, 0x00c00500, 0x00000600,
    0x00400600, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x00800600, 0x00c00600, 0x00000700, 0x00400700, 0x00800700,
    0x00c00700, 0x00000800, 0x00400800, 0x00800800, 0x00c00800, 0x00000900,
    0x00400900, 0x00800900, 0x00c00900, 0x00000a00, 0x00400a00, 0x00800a00,
    0x00c00a00, 0x00000b00, 0x00400b00, 0x00800b00, 0x00c00b00, 0x00000c00,
    0x00400c00, 0x00800c00, 0x00c00c00, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};

const uint32_t d3[256] = {
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x003e0000, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x003f0000,
    0x00340000, 0x00350000, 0x00360000, 0x00370000, 0x00380000, 0x00390000,
    0x003a0000, 0x003b0000, 0x003c0000, 0x003d0000, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
    0x00010000, 0x00020000, 0x00030000, 0x00040000, 0x00050000, 0x00060000,
    0x00070000, 0x00080000, 0x00090000, 0x000a0000, 0x000b0000, 0x000c0000,
    0x000d0000, 0x000e0000, 0x000f0000, 0x00100000, 0x00110000, 0x00120000,
    0x00130000, 0x00140000, 0x00150000, 0x00160000, 0x00170000, 0x00180000,
    0x00190000, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x001a0000, 0x001b0000, 0x001c0000, 0x001d0000, 0x001e0000,
    0x001f0000, 0x00200000, 0x00210000, 0x00220000, 0x00230000, 0x00240000,
    0x00250000, 0x00260000, 0x00270000, 0x00280000, 0x00290000, 0x002a0000,
    0x002b0000, 0x002c0000, 0x002d0000, 0x002e0000, 0x002f0000, 0x00300000,
    0x00310000, 0x00320000, 0x00330000, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};

} // namespace base64_default

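// base64_url differs from base64_default only in the URL-safe alphabet:
// '-' (value 62) and '_' (value 63) take the place of '+' and '/'.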
namespace base64_url {

const char e0[256] = {
    'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'D', 'D',
    'D', 'E', 'E', 'E', 'E', 'F', 'F', 'F', 'F', 'G', 'G', 'G', 'G', 'H', 'H',
    'H', 'H', 'I', 'I', 'I', 'I', 'J', 'J', 'J', 'J', 'K', 'K', 'K', 'K', 'L',
    'L', 'L', 'L', 'M', 'M', 'M', 'M', 'N', 'N', 'N', 'N', 'O', 'O', 'O', 'O',
    'P', 'P', 'P', 'P', 'Q', 'Q', 'Q', 'Q', 'R', 'R', 'R', 'R', 'S', 'S', 'S',
    'S', 'T', 'T', 'T', 'T', 'U', 'U', 'U', 'U', 'V', 'V', 'V', 'V', 'W', 'W',
    'W', 'W', 'X', 'X', 'X', 'X', 'Y', 'Y', 'Y', 'Y', 'Z', 'Z', 'Z', 'Z', 'a',
    'a', 'a', 'a', 'b', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'd', 'd', 'd', 'd',
    'e', 'e', 'e', 'e', 'f', 'f', 'f', 'f', 'g', 'g', 'g', 'g', 'h', 'h', 'h',
    'h', 'i', 'i', 'i', 'i', 'j', 'j', 'j', 'j', 'k', 'k', 'k', 'k', 'l', 'l',
    'l', 'l', 'm', 'm', 'm', 'm', 'n', 'n', 'n', 'n', 'o', 'o', 'o', 'o', 'p',
    'p', 'p', 'p', 'q', 'q', 'q', 'q', 'r', 'r', 'r', 'r', 's', 's', 's', 's',
    't', 't', 't', 't', 'u', 'u', 'u', 'u', 'v', 'v', 'v', 'v', 'w', 'w', 'w',
    'w', 'x', 'x', 'x', 'x', 'y', 'y', 'y', 'y', 'z', 'z', 'z', 'z', '0', '0',
    '0', '0', '1', '1', '1', '1', '2', '2', '2', '2', '3', '3', '3', '3', '4',
    '4', '4', '4', '5', '5', '5', '5', '6', '6', '6', '6', '7', '7', '7', '7',
    '8', '8', '8', '8', '9', '9', '9', '9', '-', '-', '-', '-', '_', '_', '_',
    '_'};

const char e1[256] = {
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
    'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd',
    'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
    't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7',
    '8', '9', '-', '_', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
    'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '-', '_', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
    'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
    'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
    'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_', 'A', 'B', 'C',
    'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
    'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-',
    '_'};

const char e2[256] = {
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
    'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd',
    'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
    't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7',
    '8', '9', '-', '_', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
    'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '-', '_', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
    'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
    'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
    'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_', 'A', 'B', 'C',
    'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
    'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-',
    '_'};

const uint32_t d0[256] = {
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x000000f8, 0x01ffffff, 0x01ffffff,
    0x000000d0, 0x000000d4, 0x000000d8, 0x000000dc, 0x000000e0, 0x000000e4,
    0x000000e8, 0x000000ec, 0x000000f0, 0x000000f4, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
    0x00000004, 0x00000008, 0x0000000c, 0x00000010, 0x00000014, 0x00000018,
    0x0000001c, 0x00000020, 0x00000024, 0x00000028, 0x0000002c, 0x00000030,
    0x00000034, 0x00000038, 0x0000003c, 0x00000040, 0x00000044, 0x00000048,
    0x0000004c, 0x00000050, 0x00000054, 0x00000058, 0x0000005c, 0x00000060,
    0x00000064, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x000000fc,
    0x01ffffff, 0x00000068, 0x0000006c, 0x00000070, 0x00000074, 0x00000078,
    0x0000007c, 0x00000080, 0x00000084, 0x00000088, 0x0000008c, 0x00000090,
    0x00000094, 0x00000098, 0x0000009c, 0x000000a0, 0x000000a4, 0x000000a8,
    0x000000ac, 0x000000b0, 0x000000b4, 0x000000b8, 0x000000bc, 0x000000c0,
    0x000000c4, 0x000000c8, 0x000000cc, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};

const uint32_t d1[256] = {
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x0000e003, 0x01ffffff, 0x01ffffff,
    0x00004003, 0x00005003, 0x00006003, 0x00007003, 0x00008003, 0x00009003,
    0x0000a003, 0x0000b003, 0x0000c003, 0x0000d003, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
    0x00001000, 0x00002000, 0x00003000, 0x00004000, 0x00005000, 0x00006000,
    0x00007000, 0x00008000, 0x00009000, 0x0000a000, 0x0000b000, 0x0000c000,
    0x0000d000, 0x0000e000, 0x0000f000, 0x00000001, 0x00001001, 0x00002001,
    0x00003001, 0x00004001, 0x00005001, 0x00006001, 0x00007001, 0x00008001,
    0x00009001, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x0000f003,
    0x01ffffff, 0x0000a001, 0x0000b001, 0x0000c001, 0x0000d001, 0x0000e001,
    0x0000f001, 0x00000002, 0x00001002, 0x00002002, 0x00003002, 0x00004002,
    0x00005002, 0x00006002, 0x00007002, 0x00008002, 0x00009002, 0x0000a002,
    0x0000b002, 0x0000c002, 0x0000d002, 0x0000e002, 0x0000f002, 0x00000003,
    0x00001003, 0x00002003, 0x00003003, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};

const uint32_t d2[256] = {
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00800f00, 0x01ffffff, 0x01ffffff,
    0x00000d00, 0x00400d00, 0x00800d00, 0x00c00d00, 0x00000e00, 0x00400e00,
    0x00800e00, 0x00c00e00, 0x00000f00, 0x00400f00, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
    0x00400000, 0x00800000, 0x00c00000, 0x00000100, 0x00400100, 0x00800100,
    0x00c00100, 0x00000200, 0x00400200, 0x00800200, 0x00c00200, 0x00000300,
    0x00400300, 0x00800300, 0x00c00300, 0x00000400, 0x00400400, 0x00800400,
    0x00c00400, 0x00000500, 0x00400500, 0x00800500, 0x00c00500, 0x00000600,
    0x00400600, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00c00f00,
    0x01ffffff, 0x00800600, 0x00c00600, 0x00000700, 0x00400700, 0x00800700,
    0x00c00700, 0x00000800, 0x00400800, 0x00800800, 0x00c00800, 0x00000900,
    0x00400900, 0x00800900, 0x00c00900, 0x00000a00, 0x00400a00, 0x00800a00,
    0x00c00a00, 0x00000b00, 0x00400b00, 0x00800b00, 0x00c00b00, 0x00000c00,
    0x00400c00, 0x00800c00, 0x00c00c00, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};

const uint32_t d3[256] = {
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x003e0000, 0x01ffffff, 0x01ffffff,
    0x00340000, 0x00350000, 0x00360000, 0x00370000, 0x00380000, 0x00390000,
    0x003a0000, 0x003b0000, 0x003c0000, 0x003d0000, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
    0x00010000, 0x00020000, 0x00030000, 0x00040000, 0x00050000, 0x00060000,
    0x00070000, 0x00080000, 0x00090000, 0x000a0000, 0x000b0000, 0x000c0000,
    0x000d0000, 0x000e0000, 0x000f0000, 0x00100000, 0x00110000, 0x00120000,
    0x00130000, 0x00140000, 0x00150000, 0x00160000, 0x00170000, 0x00180000,
    0x00190000, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x003f0000,
    0x01ffffff, 0x001a0000, 0x001b0000, 0x001c0000, 0x001d0000, 0x001e0000,
    0x001f0000, 0x00200000, 0x00210000, 0x00220000, 0x00230000, 0x00240000,
    0x00250000, 0x00260000, 0x00270000, 0x00280000, 0x00290000, 0x002a0000,
    0x002b0000, 0x002c0000, 0x002d0000, 0x002e0000, 0x002f0000, 0x00300000,
    0x00310000, 0x00320000, 0x00330000, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};

} // namespace base64_url

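// thintable_epi8 maps an 8-bit mask to a byte-shuffle pattern that packs the
// bytes whose mask bit is 0 to the front of an 8-byte lane (entry 0 keeps all
// eight bytes). This is the standard lookup table for SIMD byte compaction,
// e.g. with SSSE3 pshufb, here used to squeeze out ignorable characters.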
const uint64_t thintable_epi8[256] = {
    0x0706050403020100, 0x0007060504030201, 0x0007060504030200,
    0x0000070605040302, 0x0007060504030100, 0x0000070605040301,
    0x0000070605040300, 0x0000000706050403, 0x0007060504020100,
    0x0000070605040201, 0x0000070605040200, 0x0000000706050402,
    0x0000070605040100, 0x0000000706050401, 0x0000000706050400,
    0x0000000007060504, 0x0007060503020100, 0x0000070605030201,
    0x0000070605030200, 0x0000000706050302, 0x0000070605030100,
    0x0000000706050301, 0x0000000706050300, 0x0000000007060503,
    0x0000070605020100, 0x0000000706050201, 0x0000000706050200,
    0x0000000007060502, 0x0000000706050100, 0x0000000007060501,
    0x0000000007060500, 0x0000000000070605, 0x0007060403020100,
    0x0000070604030201, 0x0000070604030200, 0x0000000706040302,
    0x0000070604030100, 0x0000000706040301, 0x0000000706040300,
    0x0000000007060403, 0x0000070604020100, 0x0000000706040201,
    0x0000000706040200, 0x0000000007060402, 0x0000000706040100,
    0x0000000007060401, 0x0000000007060400, 0x0000000000070604,
    0x0000070603020100, 0x0000000706030201, 0x0000000706030200,
    0x0000000007060302, 0x0000000706030100, 0x0000000007060301,
    0x0000000007060300, 0x0000000000070603, 0x0000000706020100,
    0x0000000007060201, 0x0000000007060200, 0x0000000000070602,
    0x0000000007060100, 0x0000000000070601, 0x0000000000070600,
    0x0000000000000706, 0x0007050403020100, 0x0000070504030201,
    0x0000070504030200, 0x0000000705040302, 0x0000070504030100,
    0x0000000705040301, 0x0000000705040300, 0x0000000007050403,
    0x0000070504020100, 0x0000000705040201, 0x0000000705040200,
    0x0000000007050402, 0x0000000705040100, 0x0000000007050401,
    0x0000000007050400, 0x0000000000070504, 0x0000070503020100,
    0x0000000705030201, 0x0000000705030200, 0x0000000007050302,
    0x0000000705030100, 0x0000000007050301, 0x0000000007050300,
    0x0000000000070503, 0x0000000705020100, 0x0000000007050201,
    0x0000000007050200, 0x0000000000070502, 0x0000000007050100,
    0x0000000000070501, 0x0000000000070500, 0x0000000000000705,
    0x0000070403020100, 0x0000000704030201, 0x0000000704030200,
    0x0000000007040302, 0x0000000704030100, 0x0000000007040301,
    0x0000000007040300, 0x0000000000070403, 0x0000000704020100,
    0x0000000007040201, 0x0000000007040200, 0x0000000000070402,
    0x0000000007040100, 0x0000000000070401, 0x0000000000070400,
    0x0000000000000704, 0x0000000703020100, 0x0000000007030201,
    0x0000000007030200, 0x0000000000070302, 0x0000000007030100,
    0x0000000000070301, 0x0000000000070300, 0x0000000000000703,
    0x0000000007020100, 0x0000000000070201, 0x0000000000070200,
    0x0000000000000702, 0x0000000000070100, 0x0000000000000701,
    0x0000000000000700, 0x0000000000000007, 0x0006050403020100,
    0x0000060504030201, 0x0000060504030200, 0x0000000605040302,
    0x0000060504030100, 0x0000000605040301, 0x0000000605040300,
    0x0000000006050403, 0x0000060504020100, 0x0000000605040201,
    0x0000000605040200, 0x0000000006050402, 0x0000000605040100,
    0x0000000006050401, 0x0000000006050400, 0x0000000000060504,
    0x0000060503020100, 0x0000000605030201, 0x0000000605030200,
    0x0000000006050302, 0x0000000605030100, 0x0000000006050301,
    0x0000000006050300, 0x0000000000060503, 0x0000000605020100,
    0x0000000006050201, 0x0000000006050200, 0x0000000000060502,
    0x0000000006050100, 0x0000000000060501, 0x0000000000060500,
    0x0000000000000605, 0x0000060403020100, 0x0000000604030201,
    0x0000000604030200, 0x0000000006040302, 0x0000000604030100,
    0x0000000006040301, 0x0000000006040300, 0x0000000000060403,
    0x0000000604020100, 0x0000000006040201, 0x0000000006040200,
    0x0000000000060402, 0x0000000006040100, 0x0000000000060401,
    0x0000000000060400, 0x0000000000000604, 0x0000000603020100,
    0x0000000006030201, 0x0000000006030200, 0x0000000000060302,
    0x0000000006030100, 0x0000000000060301, 0x0000000000060300,
    0x0000000000000603, 0x0000000006020100, 0x0000000000060201,
    0x0000000000060200, 0x0000000000000602, 0x0000000000060100,
    0x0000000000000601, 0x0000000000000600, 0x0000000000000006,
    0x0000050403020100, 0x0000000504030201, 0x0000000504030200,
    0x0000000005040302, 0x0000000504030100, 0x0000000005040301,
    0x0000000005040300, 0x0000000000050403, 0x0000000504020100,
    0x0000000005040201, 0x0000000005040200, 0x0000000000050402,
    0x0000000005040100, 0x0000000000050401, 0x0000000000050400,
    0x0000000000000504, 0x0000000503020100, 0x0000000005030201,
    0x0000000005030200, 0x0000000000050302, 0x0000000005030100,
    0x0000000000050301, 0x0000000000050300, 0x0000000000000503,
    0x0000000005020100, 0x0000000000050201, 0x0000000000050200,
    0x0000000000000502, 0x0000000000050100, 0x0000000000000501,
    0x0000000000000500, 0x0000000000000005, 0x0000000403020100,
    0x0000000004030201, 0x0000000004030200, 0x0000000000040302,
    0x0000000004030100, 0x0000000000040301, 0x0000000000040300,
    0x0000000000000403, 0x0000000004020100, 0x0000000000040201,
    0x0000000000040200, 0x0000000000000402, 0x0000000000040100,
    0x0000000000000401, 0x0000000000000400, 0x0000000000000004,
    0x0000000003020100, 0x0000000000030201, 0x0000000000030200,
    0x0000000000000302, 0x0000000000030100, 0x0000000000000301,
    0x0000000000000300, 0x0000000000000003, 0x0000000000020100,
    0x0000000000000201, 0x0000000000000200, 0x0000000000000002,
    0x0000000000000100, 0x0000000000000001, 0x0000000000000000,
    0x0000000000000000,
};

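// pshufb_combine_table holds overlapping 16-byte shuffle masks, presumably
// used to splice two independently compacted 8-byte halves into one
// contiguous result (one mask per count of bytes kept in the low half).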
const uint8_t pshufb_combine_table[272] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,
    0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x08,
    0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0x00, 0x01, 0x02, 0x03,
    0x04, 0x05, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff,
    0x00, 0x01, 0x02, 0x03, 0x04, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
    0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x08,
    0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff,
    0x00, 0x01, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0x00, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x08, 0x09, 0x0a, 0x0b,
    0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
};

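// BitsSetTable256mul2[i] is twice the number of set bits in i, i.e.
// 2 * popcount(i).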
const unsigned char BitsSetTable256mul2[256] = {
    0, 2, 2, 4, 2, 4, 4, 6, 2, 4, 4, 6, 4, 6, 6, 8, 2, 4, 4,
    6, 4, 6, 6, 8, 4, 6, 6, 8, 6, 8, 8, 10, 2, 4, 4, 6, 4, 6,
    6, 8, 4, 6, 6, 8, 6, 8, 8, 10, 4, 6, 6, 8, 6, 8, 8, 10, 6,
    8, 8, 10, 8, 10, 10, 12, 2, 4, 4, 6, 4, 6, 6, 8, 4, 6, 6, 8,
    6, 8, 8, 10, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, 10, 8, 10, 10,
    12, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, 10, 8, 10, 10, 12, 6, 8,
    8, 10, 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, 12, 14, 2, 4, 4, 6, 4,
    6, 6, 8, 4, 6, 6, 8, 6, 8, 8, 10, 4, 6, 6, 8, 6, 8, 8, 10,
    6, 8, 8, 10, 8, 10, 10, 12, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8,
    10, 8, 10, 10, 12, 6, 8, 8, 10, 8, 10, 10, 12, 8, 10, 10, 12, 10, 12,
    12, 14, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, 10, 8, 10, 10, 12, 6,
    8, 8, 10, 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, 12, 14, 6, 8, 8, 10,
    8, 10, 10, 12, 8, 10, 10, 12, 10, 12, 12, 14, 8, 10, 10, 12, 10, 12, 12,
    14, 10, 12, 12, 14, 12, 14, 14, 16};

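// to_base64_value and to_base64_url_value map an ASCII byte to its 6-bit
// value: 255 marks an invalid character and 64 marks ignorable whitespace
// (' ', '\t', '\n', '\r', '\f'), as the static_asserts below spell out.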
constexpr uint8_t to_base64_value[] = {
    255, 255, 255, 255, 255, 255, 255, 255, 255, 64, 64, 255, 64, 64, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 64, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255,
    255, 255, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255,
    255, 255, 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
    10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
    25, 255, 255, 255, 255, 255, 255, 26, 27, 28, 29, 30, 31, 32, 33,
    34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
    49, 50, 51, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255};

constexpr uint8_t to_base64_url_value[] = {
    255, 255, 255, 255, 255, 255, 255, 255, 255, 64, 64, 255, 64, 64, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 64, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    62, 255, 255, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255,
    255, 255, 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
    10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
    25, 255, 255, 255, 255, 63, 255, 26, 27, 28, 29, 30, 31, 32, 33,
    34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
    49, 50, 51, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255};

static_assert(sizeof(to_base64_value) == 256,
              "to_base64_value must have 256 elements");
static_assert(sizeof(to_base64_url_value) == 256,
              "to_base64_url_value must have 256 elements");
static_assert(to_base64_value[uint8_t(' ')] == 64,
              "space must be == 64 in to_base64_value");
static_assert(to_base64_url_value[uint8_t(' ')] == 64,
              "space must be == 64 in to_base64_url_value");
static_assert(to_base64_value[uint8_t('\t')] == 64,
              "tab must be == 64 in to_base64_value");
static_assert(to_base64_url_value[uint8_t('\t')] == 64,
              "tab must be == 64 in to_base64_url_value");
static_assert(to_base64_value[uint8_t('\r')] == 64,
              "cr must be == 64 in to_base64_value");
static_assert(to_base64_url_value[uint8_t('\r')] == 64,
              "cr must be == 64 in to_base64_url_value");
static_assert(to_base64_value[uint8_t('\n')] == 64,
              "lf must be == 64 in to_base64_value");
static_assert(to_base64_url_value[uint8_t('\n')] == 64,
              "lf must be == 64 in to_base64_url_value");
static_assert(to_base64_value[uint8_t('\f')] == 64,
              "ff must be == 64 in to_base64_value");
static_assert(to_base64_url_value[uint8_t('\f')] == 64,
              "ff must be == 64 in to_base64_url_value");
static_assert(to_base64_value[uint8_t('+')] == 62,
              "+ must be == 62 in to_base64_value");
static_assert(to_base64_url_value[uint8_t('-')] == 62,
              "- must be == 62 in to_base64_url_value");
static_assert(to_base64_value[uint8_t('/')] == 63,
              "/ must be == 63 in to_base64_value");
static_assert(to_base64_url_value[uint8_t('_')] == 63,
              "_ must be == 63 in to_base64_url_value");

} // namespace base64
} // namespace tables
} // unnamed namespace
} // namespace simdutf

#endif // SIMDUTF_BASE64_TABLES_H
/* end file src/tables/base64_tables.h */
#endif // SIMDUTF_FEATURE_BASE64

/* begin file src/encoding_types.cpp */

namespace simdutf {
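// Returns true when the requested endianness matches the host's byte order.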
bool match_system(endianness e) {
#if SIMDUTF_IS_BIG_ENDIAN
  return e == endianness::BIG;
#else
  return e == endianness::LITTLE;
#endif
}

std::string to_string(encoding_type bom) {
  switch (bom) {
  case UTF16_LE:
    return "UTF16 little-endian";
  case UTF16_BE:
    return "UTF16 big-endian";
  case UTF32_LE:
    return "UTF32 little-endian";
  case UTF32_BE:
    return "UTF32 big-endian";
  case UTF8:
    return "UTF8";
  case unspecified:
    return "unknown";
  default:
    return "error";
  }
}

namespace BOM {
// Note that a BOM for UTF8 is discouraged.
encoding_type check_bom(const uint8_t *byte, size_t length) {
  if (length >= 2 && byte[0] == 0xff and byte[1] == 0xfe) {
    if (length >= 4 && byte[2] == 0x00 and byte[3] == 0x00) {
      return encoding_type::UTF32_LE;
    } else {
      return encoding_type::UTF16_LE;
    }
  } else if (length >= 2 && byte[0] == 0xfe and byte[1] == 0xff) {
    return encoding_type::UTF16_BE;
  } else if (length >= 4 && byte[0] == 0x00 and byte[1] == 0x00 and
             byte[2] == 0xfe and byte[3] == 0xff) {
    return encoding_type::UTF32_BE;
  } else if (length >= 3 && byte[0] == 0xef and byte[1] == 0xbb and
             byte[2] == 0xbf) {
    // the UTF-8 BOM is only three bytes long, so three bytes suffice
    return encoding_type::UTF8;
  }
  return encoding_type::unspecified;
}

encoding_type check_bom(const char *byte, size_t length) {
  return check_bom(reinterpret_cast<const uint8_t *>(byte), length);
}

size_t bom_byte_size(encoding_type bom) {
  switch (bom) {
  case UTF16_LE:
    return 2;
  case UTF16_BE:
    return 2;
  case UTF32_LE:
    return 4;
  case UTF32_BE:
    return 4;
  case UTF8:
    return 3;
  case unspecified:
    return 0;
  default:
    return 0;
  }
}

} // namespace BOM
} // namespace simdutf
/* end file src/encoding_types.cpp */

/* begin file src/error.cpp */
namespace simdutf {
// deliberately empty
}
/* end file src/error.cpp */

// The large tables should be included once and they
// should not depend on a kernel.
/* begin file src/tables/utf8_to_utf16_tables.h */
#ifndef SIMDUTF_UTF8_TO_UTF16_TABLES_H
#define SIMDUTF_UTF8_TO_UTF16_TABLES_H
#include <cstdint>

namespace simdutf {
namespace {
namespace tables {
namespace utf8_to_utf16 {
/**
 * utf8bigindex uses about 8 kB
 * shufutf8 uses about 3344 B
 *
 * So we use a bit over 11 kB. It would be
 * easy to save about 4 kB by only
 * storing the index in utf8bigindex, and
 * deriving the consumed bytes otherwise.
 * However, this may come at a significant (10% to 20%)
 * performance penalty.
 */

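// An inference from the data, not part of the generated source: each 16-byte
// row of shufutf8 resembles a pshufb mask that scatters the bytes of
// consecutive UTF-8 sequences into little-endian 16-bit or 32-bit code-unit
// lanes, with 255 selecting a zero byte.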
const uint8_t shufutf8[209][16] = {
    {0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 0, 0, 0, 0},
    {0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 6, 5, 0, 0, 0, 0},
    {0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 6, 255, 0, 0, 0, 0},
    {0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 7, 6, 0, 0, 0, 0},
    {0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 6, 255, 0, 0, 0, 0},
    {0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 7, 6, 0, 0, 0, 0},
    {0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 7, 255, 0, 0, 0, 0},
    {0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 8, 7, 0, 0, 0, 0},
    {0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},
    {0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},
    {0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},
    {0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},
    {0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},
    {0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},
    {0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},
    {0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},
    {0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},
    {0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},
    {0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},
    {0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},
    {0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},
    {0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},
    {0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},
    {0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},
    {0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},
    {0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},
    {0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},
    {0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},
    {0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},
    {0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},
    {0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},
    {0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},
    {1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},
    {1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},
    {1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},
    {1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},
    {1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},
    {1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},
    {1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},
    {1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},
    {1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},
    {1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},
    {1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},
    {1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},
    {1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},
    {1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},
    {1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},
    {1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},
    {1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},
    {1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},
    {1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},
    {1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},
    {1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},
    {1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},
    {1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},
    {1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},
    {1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 8, 255, 0, 0, 0, 0},
    {1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 9, 8, 0, 0, 0, 0},
    {1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 9, 255, 0, 0, 0, 0},
    {1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 10, 9, 0, 0, 0, 0},
    {1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 9, 255, 0, 0, 0, 0},
    {1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 10, 9, 0, 0, 0, 0},
    {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 255, 0, 0, 0, 0},
    {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 0, 0, 0, 0},
    {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255},
    {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255},
    {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255},
    {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255},
    {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255},
    {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255},
    {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255},
    {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255},
    {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255},
    {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255},
    {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255},
    {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255},
    {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255},
    {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255},
    {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255},
    {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255},
    {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255},
    {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255},
    {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 5, 255, 255, 255},
    {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 6, 5, 255, 255},
    {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 7, 6, 5, 255},
    {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 6, 255, 255, 255},
    {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 7, 6, 255, 255},
    {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 8, 7, 6, 255},
    {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 7, 255, 255, 255},
    {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 8, 7, 255, 255},
    {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 9, 8, 7, 255},
    {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255},
    {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255},
    {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255},
    {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255},
    {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255},
    {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255},
    {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255},
    {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255},
    {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255},
    {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255},
    {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255},
    {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255},
    {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255},
    {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255},
    {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255},
    {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255},
    {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255},
    {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255},
    {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 6, 255, 255, 255},
    {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 7, 6, 255, 255},
    {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 8, 7, 6, 255},
    {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 7, 255, 255, 255},
    {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 8, 7, 255, 255},
    {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 9, 8, 7, 255},
    {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 8, 255, 255, 255},
    {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 9, 8, 255, 255},
    {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 10, 9, 8, 255},
    {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255},
    {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255},
    {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255},
    {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255},
    {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255},
    {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255},
    {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255},
    {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255},
    {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255},
    {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255},
    {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 7, 6, 255, 255},
    {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 8, 7, 6, 255},
    {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 7, 255, 255, 255},
    {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 8, 7, 255, 255},
    {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 9, 8, 7, 255},
    {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 8, 255, 255, 255},
    {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 9, 8, 255, 255},
    {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 10, 9, 8, 255},
    {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 7, 255, 255, 255},
    {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 8, 7, 255, 255},
    {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 9, 8, 7, 255},
    {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 8, 255, 255, 255},
    {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 9, 8, 255, 255},
    {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 10, 9, 8, 255},
    {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 9, 255, 255, 255},
    {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 10, 9, 255, 255},
    {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255},
    {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 0, 0, 0, 0},
    {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 0, 0, 0, 0},
    {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 0, 0, 0, 0},
    {0, 255, 255, 255, 1, 255, 255, 255, 5, 4, 3, 2, 0, 0, 0, 0},
    {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0},
    {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0},
    {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0},
    {0, 255, 255, 255, 2, 1, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0},
    {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 0, 0, 0, 0},
    {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 0, 0, 0, 0},
    {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 0, 0, 0, 0},
    {0, 255, 255, 255, 3, 2, 1, 255, 7, 6, 5, 4, 0, 0, 0, 0},
|
|
{0, 255, 255, 255, 4, 3, 2, 1, 5, 255, 255, 255, 0, 0, 0, 0},
|
|
{0, 255, 255, 255, 4, 3, 2, 1, 6, 5, 255, 255, 0, 0, 0, 0},
|
|
{0, 255, 255, 255, 4, 3, 2, 1, 7, 6, 5, 255, 0, 0, 0, 0},
|
|
{0, 255, 255, 255, 4, 3, 2, 1, 8, 7, 6, 5, 0, 0, 0, 0},
|
|
{1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0},
|
|
{1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0},
|
|
{1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0},
|
|
{1, 0, 255, 255, 2, 255, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0},
|
|
{1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0},
|
|
{1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0},
|
|
{1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0},
|
|
{1, 0, 255, 255, 3, 2, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0},
|
|
{1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 0, 0, 0, 0},
|
|
{1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 0, 0, 0, 0},
|
|
{1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 0, 0, 0, 0},
|
|
{1, 0, 255, 255, 4, 3, 2, 255, 8, 7, 6, 5, 0, 0, 0, 0},
|
|
{1, 0, 255, 255, 5, 4, 3, 2, 6, 255, 255, 255, 0, 0, 0, 0},
|
|
{1, 0, 255, 255, 5, 4, 3, 2, 7, 6, 255, 255, 0, 0, 0, 0},
|
|
{1, 0, 255, 255, 5, 4, 3, 2, 8, 7, 6, 255, 0, 0, 0, 0},
|
|
{1, 0, 255, 255, 5, 4, 3, 2, 9, 8, 7, 6, 0, 0, 0, 0},
|
|
{2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0},
|
|
{2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0},
|
|
{2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0},
|
|
{2, 1, 0, 255, 3, 255, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0},
|
|
{2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0},
|
|
{2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0},
|
|
{2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0},
|
|
{2, 1, 0, 255, 4, 3, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0},
|
|
{2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 0, 0, 0, 0},
|
|
{2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 0, 0, 0, 0},
|
|
{2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 0, 0, 0, 0},
|
|
{2, 1, 0, 255, 5, 4, 3, 255, 9, 8, 7, 6, 0, 0, 0, 0},
|
|
{2, 1, 0, 255, 6, 5, 4, 3, 7, 255, 255, 255, 0, 0, 0, 0},
|
|
{2, 1, 0, 255, 6, 5, 4, 3, 8, 7, 255, 255, 0, 0, 0, 0},
|
|
{2, 1, 0, 255, 6, 5, 4, 3, 9, 8, 7, 255, 0, 0, 0, 0},
|
|
{2, 1, 0, 255, 6, 5, 4, 3, 10, 9, 8, 7, 0, 0, 0, 0},
|
|
{3, 2, 1, 0, 4, 255, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0},
|
|
{3, 2, 1, 0, 4, 255, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0},
|
|
{3, 2, 1, 0, 4, 255, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0},
|
|
{3, 2, 1, 0, 4, 255, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0},
|
|
{3, 2, 1, 0, 5, 4, 255, 255, 6, 255, 255, 255, 0, 0, 0, 0},
|
|
{3, 2, 1, 0, 5, 4, 255, 255, 7, 6, 255, 255, 0, 0, 0, 0},
|
|
{3, 2, 1, 0, 5, 4, 255, 255, 8, 7, 6, 255, 0, 0, 0, 0},
|
|
{3, 2, 1, 0, 5, 4, 255, 255, 9, 8, 7, 6, 0, 0, 0, 0},
|
|
{3, 2, 1, 0, 6, 5, 4, 255, 7, 255, 255, 255, 0, 0, 0, 0},
|
|
{3, 2, 1, 0, 6, 5, 4, 255, 8, 7, 255, 255, 0, 0, 0, 0},
|
|
{3, 2, 1, 0, 6, 5, 4, 255, 9, 8, 7, 255, 0, 0, 0, 0},
|
|
{3, 2, 1, 0, 6, 5, 4, 255, 10, 9, 8, 7, 0, 0, 0, 0},
|
|
{3, 2, 1, 0, 7, 6, 5, 4, 8, 255, 255, 255, 0, 0, 0, 0},
|
|
{3, 2, 1, 0, 7, 6, 5, 4, 9, 8, 255, 255, 0, 0, 0, 0},
|
|
{3, 2, 1, 0, 7, 6, 5, 4, 10, 9, 8, 255, 0, 0, 0, 0},
|
|
{3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 0, 0, 0, 0}};
/* number of two bytes : 64 */
/* number of two + three bytes : 145 */
/* number of two + three + four bytes : 209 */
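// A hedged sketch of how the thresholds above and the utf8bigindex table
// below are typically consumed by the SIMD UTF-8 decode loops elsewhere in
// this file; the variable names here are illustrative, not quoted from
// those loops:
//
//   // `mask` marks, over 12 input bytes, where each code point ends.
//   uint8_t idx = utf8bigindex[mask][0];      // selects a shuffle pattern
//   uint8_t consumed = utf8bigindex[mask][1]; // input bytes covered
//   if (idx < 64) {
//     // only one- and two-byte sequences in this chunk
//   } else if (idx < 145) {
//     // at most three-byte sequences
//   } else if (idx < 209) {
//     // four-byte sequences present
//   } // larger idx values are left to slower handling (an assumption)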
const uint8_t utf8bigindex[4096][2] = {
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4},
    {161, 4}, {64, 4}, {209, 12}, {209, 12}, {209, 12}, {147, 5}, {209, 12},
    {150, 5}, {162, 5}, {65, 5}, {209, 12}, {153, 5}, {165, 5}, {67, 5},
    {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {209, 12},
    {148, 6}, {209, 12}, {151, 6}, {163, 6}, {66, 6}, {209, 12}, {154, 6},
    {166, 6}, {68, 6}, {178, 6}, {74, 6}, {92, 6}, {64, 4}, {209, 12},
    {157, 6}, {169, 6}, {70, 6}, {181, 6}, {76, 6}, {94, 6}, {65, 5},
    {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5},
    {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {152, 7},
    {164, 7}, {145, 3}, {209, 12}, {155, 7}, {167, 7}, {69, 7}, {179, 7},
    {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {170, 7}, {71, 7},
    {182, 7}, {77, 7}, {95, 7}, {65, 5}, {194, 7}, {83, 7}, {101, 7},
    {67, 5}, {119, 7}, {73, 5}, {91, 5}, {1, 7}, {209, 12}, {209, 12},
    {173, 7}, {148, 6}, {185, 7}, {79, 7}, {97, 7}, {66, 6}, {197, 7},
    {85, 7}, {103, 7}, {68, 6}, {121, 7}, {74, 6}, {92, 6}, {2, 7},
    {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {76, 6}, {94, 6},
    {4, 7}, {193, 6}, {82, 6}, {100, 6}, {8, 7}, {118, 6}, {16, 7},
    {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {145, 3}, {209, 12}, {156, 8}, {168, 8}, {146, 4},
    {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, {171, 8},
    {72, 8}, {183, 8}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8},
    {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12},
    {209, 12}, {174, 8}, {148, 6}, {186, 8}, {80, 8}, {98, 8}, {66, 6},
    {198, 8}, {86, 8}, {104, 8}, {68, 6}, {122, 8}, {74, 6}, {92, 6},
    {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8}, {76, 6},
    {94, 6}, {5, 8}, {193, 6}, {82, 6}, {100, 6}, {9, 8}, {118, 6},
    {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8},
    {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7},
    {112, 8}, {71, 7}, {130, 8}, {77, 7}, {95, 7}, {6, 8}, {194, 7},
    {83, 7}, {101, 7}, {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7},
    {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7},
    {66, 6}, {197, 7}, {85, 7}, {103, 7}, {12, 8}, {121, 7}, {20, 8},
    {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7},
    {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7},
    {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12},
    {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12},
    {160, 9}, {172, 9}, {147, 5}, {184, 9}, {150, 5}, {162, 5}, {65, 5},
    {196, 9}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5},
    {64, 4}, {209, 12}, {209, 12}, {175, 9}, {148, 6}, {187, 9}, {81, 9},
    {99, 9}, {66, 6}, {199, 9}, {87, 9}, {105, 9}, {68, 6}, {123, 9},
    {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, {111, 9}, {70, 6},
    {129, 9}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6},
    {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {190, 9}, {152, 7}, {164, 7}, {145, 3}, {202, 9},
    {89, 9}, {107, 9}, {69, 7}, {125, 9}, {75, 7}, {93, 7}, {64, 4},
    {209, 12}, {158, 7}, {113, 9}, {71, 7}, {131, 9}, {77, 7}, {95, 7},
    {7, 9}, {194, 7}, {83, 7}, {101, 7}, {11, 9}, {119, 7}, {19, 9},
    {35, 9}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {137, 9},
    {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {103, 7}, {13, 9},
    {121, 7}, {21, 9}, {37, 9}, {2, 7}, {209, 12}, {157, 6}, {109, 7},
    {70, 6}, {127, 7}, {25, 9}, {41, 9}, {4, 7}, {193, 6}, {82, 6},
    {49, 9}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},
    {205, 9}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4},
    {64, 4}, {209, 12}, {159, 8}, {115, 9}, {72, 8}, {133, 9}, {78, 8},
    {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8},
    {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6},
    {139, 9}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, {104, 8},
    {14, 9}, {122, 8}, {22, 9}, {38, 9}, {3, 8}, {209, 12}, {157, 6},
    {110, 8}, {70, 6}, {128, 8}, {26, 9}, {42, 9}, {5, 8}, {193, 6},
    {82, 6}, {50, 9}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7},
    {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7},
    {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8},
    {28, 9}, {44, 9}, {6, 8}, {194, 7}, {83, 7}, {52, 9}, {10, 8},
    {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7},
    {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7},
    {56, 9}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12},
    {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7},
    {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7},
    {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12},
    {149, 4}, {161, 4}, {64, 4}, {209, 12}, {209, 12}, {209, 12}, {147, 5},
    {209, 12}, {150, 5}, {162, 5}, {65, 5}, {209, 12}, {153, 5}, {165, 5},
    {67, 5}, {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12},
    {176, 10}, {148, 6}, {188, 10}, {151, 6}, {163, 6}, {66, 6}, {200, 10},
    {154, 6}, {166, 6}, {68, 6}, {178, 6}, {74, 6}, {92, 6}, {64, 4},
    {209, 12}, {157, 6}, {169, 6}, {70, 6}, {181, 6}, {76, 6}, {94, 6},
    {65, 5}, {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5},
    {91, 5}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {191, 10},
    {152, 7}, {164, 7}, {145, 3}, {203, 10}, {90, 10}, {108, 10}, {69, 7},
    {126, 10}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {114, 10},
    {71, 7}, {132, 10}, {77, 7}, {95, 7}, {65, 5}, {194, 7}, {83, 7},
    {101, 7}, {67, 5}, {119, 7}, {73, 5}, {91, 5}, {1, 7}, {209, 12},
    {209, 12}, {173, 7}, {148, 6}, {138, 10}, {79, 7}, {97, 7}, {66, 6},
    {197, 7}, {85, 7}, {103, 7}, {68, 6}, {121, 7}, {74, 6}, {92, 6},
    {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {76, 6},
    {94, 6}, {4, 7}, {193, 6}, {82, 6}, {100, 6}, {8, 7}, {118, 6},
    {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {145, 3}, {206, 10}, {156, 8}, {168, 8},
    {146, 4}, {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8},
    {116, 10}, {72, 8}, {134, 10}, {78, 8}, {96, 8}, {65, 5}, {195, 8},
    {84, 8}, {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4},
    {209, 12}, {209, 12}, {174, 8}, {148, 6}, {140, 10}, {80, 8}, {98, 8},
    {66, 6}, {198, 8}, {86, 8}, {104, 8}, {15, 10}, {122, 8}, {23, 10},
    {39, 10}, {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8},
    {27, 10}, {43, 10}, {5, 8}, {193, 6}, {82, 6}, {51, 10}, {9, 8},
    {118, 6}, {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8},
    {106, 8}, {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12},
    {158, 7}, {112, 8}, {71, 7}, {130, 8}, {29, 10}, {45, 10}, {6, 8},
    {194, 7}, {83, 7}, {53, 10}, {10, 8}, {119, 7}, {18, 8}, {34, 8},
    {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7},
    {97, 7}, {66, 6}, {197, 7}, {85, 7}, {57, 10}, {12, 8}, {121, 7},
    {20, 8}, {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6},
    {127, 7}, {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8},
    {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12},
    {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4},
    {209, 12}, {160, 9}, {172, 9}, {147, 5}, {184, 9}, {150, 5}, {162, 5},
    {65, 5}, {196, 9}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5},
    {91, 5}, {64, 4}, {209, 12}, {209, 12}, {175, 9}, {148, 6}, {142, 10},
    {81, 9}, {99, 9}, {66, 6}, {199, 9}, {87, 9}, {105, 9}, {68, 6},
    {123, 9}, {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, {111, 9},
    {70, 6}, {129, 9}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6},
    {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {190, 9}, {152, 7}, {164, 7}, {145, 3},
    {202, 9}, {89, 9}, {107, 9}, {69, 7}, {125, 9}, {75, 7}, {93, 7},
    {64, 4}, {209, 12}, {158, 7}, {113, 9}, {71, 7}, {131, 9}, {30, 10},
    {46, 10}, {7, 9}, {194, 7}, {83, 7}, {54, 10}, {11, 9}, {119, 7},
    {19, 9}, {35, 9}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6},
    {137, 9}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {58, 10},
    {13, 9}, {121, 7}, {21, 9}, {37, 9}, {2, 7}, {209, 12}, {157, 6},
    {109, 7}, {70, 6}, {127, 7}, {25, 9}, {41, 9}, {4, 7}, {193, 6},
    {82, 6}, {49, 9}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {145, 3}, {205, 9}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4},
    {161, 4}, {64, 4}, {209, 12}, {159, 8}, {115, 9}, {72, 8}, {133, 9},
    {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5},
    {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8},
    {148, 6}, {139, 9}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8},
    {60, 10}, {14, 9}, {122, 8}, {22, 9}, {38, 9}, {3, 8}, {209, 12},
    {157, 6}, {110, 8}, {70, 6}, {128, 8}, {26, 9}, {42, 9}, {5, 8},
    {193, 6}, {82, 6}, {50, 9}, {9, 8}, {118, 6}, {17, 8}, {33, 8},
    {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7},
    {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8},
    {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7},
    {130, 8}, {28, 9}, {44, 9}, {6, 8}, {194, 7}, {83, 7}, {52, 9},
    {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12},
    {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7},
    {85, 7}, {56, 9}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7},
    {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8},
    {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7},
    {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4},
    {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {209, 12}, {209, 12},
    {147, 5}, {209, 12}, {150, 5}, {162, 5}, {65, 5}, {209, 12}, {153, 5},
    {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12},
    {209, 12}, {209, 12}, {148, 6}, {209, 12}, {151, 6}, {163, 6}, {66, 6},
    {209, 12}, {154, 6}, {166, 6}, {68, 6}, {178, 6}, {74, 6}, {92, 6},
    {64, 4}, {209, 12}, {157, 6}, {169, 6}, {70, 6}, {181, 6}, {76, 6},
    {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6},
    {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {192, 11}, {152, 7}, {164, 7}, {145, 3}, {204, 11}, {155, 7}, {167, 7},
    {69, 7}, {179, 7}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7},
    {170, 7}, {71, 7}, {182, 7}, {77, 7}, {95, 7}, {65, 5}, {194, 7},
    {83, 7}, {101, 7}, {67, 5}, {119, 7}, {73, 5}, {91, 5}, {1, 7},
    {209, 12}, {209, 12}, {173, 7}, {148, 6}, {185, 7}, {79, 7}, {97, 7},
    {66, 6}, {197, 7}, {85, 7}, {103, 7}, {68, 6}, {121, 7}, {74, 6},
    {92, 6}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7},
    {76, 6}, {94, 6}, {4, 7}, {193, 6}, {82, 6}, {100, 6}, {8, 7},
    {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {207, 11}, {156, 8},
    {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12},
    {159, 8}, {117, 11}, {72, 8}, {135, 11}, {78, 8}, {96, 8}, {65, 5},
    {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5},
    {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6}, {141, 11}, {80, 8},
    {98, 8}, {66, 6}, {198, 8}, {86, 8}, {104, 8}, {68, 6}, {122, 8},
    {74, 6}, {92, 6}, {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6},
    {128, 8}, {76, 6}, {94, 6}, {5, 8}, {193, 6}, {82, 6}, {100, 6},
    {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8},
    {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4},
    {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8}, {77, 7}, {95, 7},
    {6, 8}, {194, 7}, {83, 7}, {101, 7}, {10, 8}, {119, 7}, {18, 8},
    {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8},
    {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {103, 7}, {12, 8},
    {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7},
    {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6},
    {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},
    {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4},
    {64, 4}, {209, 12}, {160, 9}, {172, 9}, {147, 5}, {184, 9}, {150, 5},
    {162, 5}, {65, 5}, {196, 9}, {153, 5}, {165, 5}, {67, 5}, {177, 5},
    {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {175, 9}, {148, 6},
    {143, 11}, {81, 9}, {99, 9}, {66, 6}, {199, 9}, {87, 9}, {105, 9},
    {68, 6}, {123, 9}, {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6},
    {111, 9}, {70, 6}, {129, 9}, {76, 6}, {94, 6}, {65, 5}, {193, 6},
    {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {190, 9}, {152, 7}, {164, 7},
    {145, 3}, {202, 9}, {89, 9}, {107, 9}, {69, 7}, {125, 9}, {75, 7},
    {93, 7}, {64, 4}, {209, 12}, {158, 7}, {113, 9}, {71, 7}, {131, 9},
    {31, 11}, {47, 11}, {7, 9}, {194, 7}, {83, 7}, {55, 11}, {11, 9},
    {119, 7}, {19, 9}, {35, 9}, {1, 7}, {209, 12}, {209, 12}, {173, 7},
    {148, 6}, {137, 9}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7},
    {59, 11}, {13, 9}, {121, 7}, {21, 9}, {37, 9}, {2, 7}, {209, 12},
    {157, 6}, {109, 7}, {70, 6}, {127, 7}, {25, 9}, {41, 9}, {4, 7},
    {193, 6}, {82, 6}, {49, 9}, {8, 7}, {118, 6}, {16, 7}, {32, 7},
    {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {145, 3}, {205, 9}, {156, 8}, {168, 8}, {146, 4}, {180, 8},
    {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, {115, 9}, {72, 8},
    {133, 9}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8},
    {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12},
    {174, 8}, {148, 6}, {139, 9}, {80, 8}, {98, 8}, {66, 6}, {198, 8},
    {86, 8}, {61, 11}, {14, 9}, {122, 8}, {22, 9}, {38, 9}, {3, 8},
    {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8}, {26, 9}, {42, 9},
    {5, 8}, {193, 6}, {82, 6}, {50, 9}, {9, 8}, {118, 6}, {17, 8},
    {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8},
    {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7},
    {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8},
    {71, 7}, {130, 8}, {28, 9}, {44, 9}, {6, 8}, {194, 7}, {83, 7},
    {52, 9}, {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12},
    {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6},
    {197, 7}, {85, 7}, {56, 9}, {12, 8}, {121, 7}, {20, 8}, {36, 8},
    {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8},
    {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6},
    {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12}, {209, 12},
    {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {209, 12},
    {209, 12}, {147, 5}, {209, 12}, {150, 5}, {162, 5}, {65, 5}, {209, 12},
    {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5}, {64, 4},
    {209, 12}, {209, 12}, {176, 10}, {148, 6}, {188, 10}, {151, 6}, {163, 6},
    {66, 6}, {200, 10}, {154, 6}, {166, 6}, {68, 6}, {178, 6}, {74, 6},
    {92, 6}, {64, 4}, {209, 12}, {157, 6}, {169, 6}, {70, 6}, {181, 6},
    {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6}, {67, 5},
    {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {191, 10}, {152, 7}, {164, 7}, {145, 3}, {203, 10}, {90, 10},
    {108, 10}, {69, 7}, {126, 10}, {75, 7}, {93, 7}, {64, 4}, {209, 12},
    {158, 7}, {114, 10}, {71, 7}, {132, 10}, {77, 7}, {95, 7}, {65, 5},
    {194, 7}, {83, 7}, {101, 7}, {67, 5}, {119, 7}, {73, 5}, {91, 5},
    {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {138, 10}, {79, 7},
    {97, 7}, {66, 6}, {197, 7}, {85, 7}, {103, 7}, {68, 6}, {121, 7},
    {74, 6}, {92, 6}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6},
    {127, 7}, {76, 6}, {94, 6}, {4, 7}, {193, 6}, {82, 6}, {100, 6},
    {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {206, 10},
    {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4}, {64, 4},
    {209, 12}, {159, 8}, {116, 10}, {72, 8}, {134, 10}, {78, 8}, {96, 8},
    {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8}, {73, 5},
    {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6}, {140, 10},
    {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, {62, 11}, {15, 10},
    {122, 8}, {23, 10}, {39, 10}, {3, 8}, {209, 12}, {157, 6}, {110, 8},
    {70, 6}, {128, 8}, {27, 10}, {43, 10}, {5, 8}, {193, 6}, {82, 6},
    {51, 10}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7}, {145, 3},
    {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7}, {93, 7},
    {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8}, {29, 10},
    {45, 10}, {6, 8}, {194, 7}, {83, 7}, {53, 10}, {10, 8}, {119, 7},
    {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6},
    {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {57, 10},
    {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12}, {157, 6},
    {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7}, {193, 6},
    {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4},
    {161, 4}, {64, 4}, {209, 12}, {160, 9}, {172, 9}, {147, 5}, {184, 9},
    {150, 5}, {162, 5}, {65, 5}, {196, 9}, {153, 5}, {165, 5}, {67, 5},
    {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {175, 9},
    {148, 6}, {142, 10}, {81, 9}, {99, 9}, {66, 6}, {199, 9}, {87, 9},
    {105, 9}, {68, 6}, {123, 9}, {74, 6}, {92, 6}, {64, 4}, {209, 12},
    {157, 6}, {111, 9}, {70, 6}, {129, 9}, {76, 6}, {94, 6}, {65, 5},
    {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5},
    {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {190, 9}, {152, 7},
    {164, 7}, {145, 3}, {202, 9}, {89, 9}, {107, 9}, {69, 7}, {125, 9},
    {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {113, 9}, {71, 7},
    {131, 9}, {30, 10}, {46, 10}, {7, 9}, {194, 7}, {83, 7}, {54, 10},
    {11, 9}, {119, 7}, {19, 9}, {35, 9}, {1, 7}, {209, 12}, {209, 12},
    {173, 7}, {148, 6}, {137, 9}, {79, 7}, {97, 7}, {66, 6}, {197, 7},
    {85, 7}, {58, 10}, {13, 9}, {121, 7}, {21, 9}, {37, 9}, {2, 7},
    {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {25, 9}, {41, 9},
    {4, 7}, {193, 6}, {82, 6}, {49, 9}, {8, 7}, {118, 6}, {16, 7},
    {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {145, 3}, {205, 9}, {156, 8}, {168, 8}, {146, 4},
    {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, {115, 9},
    {72, 8}, {133, 9}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8},
    {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12},
    {209, 12}, {174, 8}, {148, 6}, {139, 9}, {80, 8}, {98, 8}, {66, 6},
    {198, 8}, {86, 8}, {60, 10}, {14, 9}, {122, 8}, {22, 9}, {38, 9},
    {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8}, {26, 9},
    {42, 9}, {5, 8}, {193, 6}, {82, 6}, {50, 9}, {9, 8}, {118, 6},
    {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8},
    {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7},
    {112, 8}, {71, 7}, {130, 8}, {28, 9}, {44, 9}, {6, 8}, {194, 7},
    {83, 7}, {52, 9}, {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7},
    {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7},
    {66, 6}, {197, 7}, {85, 7}, {56, 9}, {12, 8}, {121, 7}, {20, 8},
    {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7},
    {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7},
    {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12},
    {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12},
    {209, 12}, {209, 12}, {147, 5}, {209, 12}, {150, 5}, {162, 5}, {65, 5},
    {209, 12}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5},
    {64, 4}, {209, 12}, {209, 12}, {209, 12}, {148, 6}, {209, 12}, {151, 6},
    {163, 6}, {66, 6}, {209, 12}, {154, 6}, {166, 6}, {68, 6}, {178, 6},
    {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, {169, 6}, {70, 6},
    {181, 6}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6},
    {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {152, 7}, {164, 7}, {145, 3}, {209, 12},
    {155, 7}, {167, 7}, {69, 7}, {179, 7}, {75, 7}, {93, 7}, {64, 4},
    {209, 12}, {158, 7}, {170, 7}, {71, 7}, {182, 7}, {77, 7}, {95, 7},
    {65, 5}, {194, 7}, {83, 7}, {101, 7}, {67, 5}, {119, 7}, {73, 5},
    {91, 5}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {185, 7},
    {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {103, 7}, {68, 6},
    {121, 7}, {74, 6}, {92, 6}, {2, 7}, {209, 12}, {157, 6}, {109, 7},
    {70, 6}, {127, 7}, {76, 6}, {94, 6}, {4, 7}, {193, 6}, {82, 6},
    {100, 6}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},
    {208, 12}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4},
    {64, 4}, {209, 12}, {159, 8}, {171, 8}, {72, 8}, {183, 8}, {78, 8},
    {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8},
    {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6},
    {186, 8}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, {104, 8},
    {68, 6}, {122, 8}, {74, 6}, {92, 6}, {3, 8}, {209, 12}, {157, 6},
    {110, 8}, {70, 6}, {128, 8}, {76, 6}, {94, 6}, {5, 8}, {193, 6},
    {82, 6}, {100, 6}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7},
    {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7},
    {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8},
    {77, 7}, {95, 7}, {6, 8}, {194, 7}, {83, 7}, {101, 7}, {10, 8},
    {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7},
    {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7},
    {103, 7}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12},
    {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7},
    {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7},
    {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12},
    {149, 4}, {161, 4}, {64, 4}, {209, 12}, {160, 9}, {172, 9}, {147, 5},
    {184, 9}, {150, 5}, {162, 5}, {65, 5}, {196, 9}, {153, 5}, {165, 5},
    {67, 5}, {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12},
    {175, 9}, {148, 6}, {144, 12}, {81, 9}, {99, 9}, {66, 6}, {199, 9},
    {87, 9}, {105, 9}, {68, 6}, {123, 9}, {74, 6}, {92, 6}, {64, 4},
    {209, 12}, {157, 6}, {111, 9}, {70, 6}, {129, 9}, {76, 6}, {94, 6},
    {65, 5}, {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5},
    {91, 5}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {190, 9},
    {152, 7}, {164, 7}, {145, 3}, {202, 9}, {89, 9}, {107, 9}, {69, 7},
    {125, 9}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {113, 9},
    {71, 7}, {131, 9}, {77, 7}, {95, 7}, {7, 9}, {194, 7}, {83, 7},
    {101, 7}, {11, 9}, {119, 7}, {19, 9}, {35, 9}, {1, 7}, {209, 12},
    {209, 12}, {173, 7}, {148, 6}, {137, 9}, {79, 7}, {97, 7}, {66, 6},
    {197, 7}, {85, 7}, {103, 7}, {13, 9}, {121, 7}, {21, 9}, {37, 9},
    {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {25, 9},
    {41, 9}, {4, 7}, {193, 6}, {82, 6}, {49, 9}, {8, 7}, {118, 6},
    {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {145, 3}, {205, 9}, {156, 8}, {168, 8},
    {146, 4}, {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8},
    {115, 9}, {72, 8}, {133, 9}, {78, 8}, {96, 8}, {65, 5}, {195, 8},
    {84, 8}, {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4},
    {209, 12}, {209, 12}, {174, 8}, {148, 6}, {139, 9}, {80, 8}, {98, 8},
    {66, 6}, {198, 8}, {86, 8}, {104, 8}, {14, 9}, {122, 8}, {22, 9},
    {38, 9}, {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8},
    {26, 9}, {42, 9}, {5, 8}, {193, 6}, {82, 6}, {50, 9}, {9, 8},
    {118, 6}, {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8},
    {106, 8}, {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12},
    {158, 7}, {112, 8}, {71, 7}, {130, 8}, {28, 9}, {44, 9}, {6, 8},
    {194, 7}, {83, 7}, {52, 9}, {10, 8}, {119, 7}, {18, 8}, {34, 8},
    {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7},
    {97, 7}, {66, 6}, {197, 7}, {85, 7}, {56, 9}, {12, 8}, {121, 7},
    {20, 8}, {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6},
    {127, 7}, {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8},
    {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12},
    {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4},
    {209, 12}, {209, 12}, {209, 12}, {147, 5}, {209, 12}, {150, 5}, {162, 5},
    {65, 5}, {209, 12}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5},
    {91, 5}, {64, 4}, {209, 12}, {209, 12}, {176, 10}, {148, 6}, {188, 10},
    {151, 6}, {163, 6}, {66, 6}, {200, 10}, {154, 6}, {166, 6}, {68, 6},
    {178, 6}, {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, {169, 6},
    {70, 6}, {181, 6}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6},
    {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {191, 10}, {152, 7}, {164, 7}, {145, 3},
    {203, 10}, {90, 10}, {108, 10}, {69, 7}, {126, 10}, {75, 7}, {93, 7},
    {64, 4}, {209, 12}, {158, 7}, {114, 10}, {71, 7}, {132, 10}, {77, 7},
    {95, 7}, {65, 5}, {194, 7}, {83, 7}, {101, 7}, {67, 5}, {119, 7},
    {73, 5}, {91, 5}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6},
    {138, 10}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {103, 7},
    {68, 6}, {121, 7}, {74, 6}, {92, 6}, {2, 7}, {209, 12}, {157, 6},
    {109, 7}, {70, 6}, {127, 7}, {76, 6}, {94, 6}, {4, 7}, {193, 6},
    {82, 6}, {100, 6}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {145, 3}, {206, 10}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4},
    {161, 4}, {64, 4}, {209, 12}, {159, 8}, {116, 10}, {72, 8}, {134, 10},
    {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5},
    {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8},
    {148, 6}, {140, 10}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8},
    {63, 12}, {15, 10}, {122, 8}, {23, 10}, {39, 10}, {3, 8}, {209, 12},
    {157, 6}, {110, 8}, {70, 6}, {128, 8}, {27, 10}, {43, 10}, {5, 8},
    {193, 6}, {82, 6}, {51, 10}, {9, 8}, {118, 6}, {17, 8}, {33, 8},
    {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7},
    {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8},
    {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7},
    {130, 8}, {29, 10}, {45, 10}, {6, 8}, {194, 7}, {83, 7}, {53, 10},
    {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12},
    {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7},
    {85, 7}, {57, 10}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7},
    {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8},
    {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7},
    {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4},
    {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {160, 9}, {172, 9},
    {147, 5}, {184, 9}, {150, 5}, {162, 5}, {65, 5}, {196, 9}, {153, 5},
    {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12},
    {209, 12}, {175, 9}, {148, 6}, {142, 10}, {81, 9}, {99, 9}, {66, 6},
    {199, 9}, {87, 9}, {105, 9}, {68, 6}, {123, 9}, {74, 6}, {92, 6},
    {64, 4}, {209, 12}, {157, 6}, {111, 9}, {70, 6}, {129, 9}, {76, 6},
    {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6},
    {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {190, 9}, {152, 7}, {164, 7}, {145, 3}, {202, 9}, {89, 9}, {107, 9},
    {69, 7}, {125, 9}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7},
    {113, 9}, {71, 7}, {131, 9}, {30, 10}, {46, 10}, {7, 9}, {194, 7},
    {83, 7}, {54, 10}, {11, 9}, {119, 7}, {19, 9}, {35, 9}, {1, 7},
    {209, 12}, {209, 12}, {173, 7}, {148, 6}, {137, 9}, {79, 7}, {97, 7},
    {66, 6}, {197, 7}, {85, 7}, {58, 10}, {13, 9}, {121, 7}, {21, 9},
    {37, 9}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7},
    {25, 9}, {41, 9}, {4, 7}, {193, 6}, {82, 6}, {49, 9}, {8, 7},
    {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {205, 9}, {156, 8},
    {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12},
    {159, 8}, {115, 9}, {72, 8}, {133, 9}, {78, 8}, {96, 8}, {65, 5},
    {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5},
    {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6}, {139, 9}, {80, 8},
    {98, 8}, {66, 6}, {198, 8}, {86, 8}, {60, 10}, {14, 9}, {122, 8},
    {22, 9}, {38, 9}, {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6},
    {128, 8}, {26, 9}, {42, 9}, {5, 8}, {193, 6}, {82, 6}, {50, 9},
    {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8},
    {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4},
    {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8}, {28, 9}, {44, 9},
    {6, 8}, {194, 7}, {83, 7}, {52, 9}, {10, 8}, {119, 7}, {18, 8},
    {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8},
    {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {56, 9}, {12, 8},
    {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7},
    {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6},
    {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},
    {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4},
    {64, 4}, {209, 12}, {209, 12}, {209, 12}, {147, 5}, {209, 12}, {150, 5},
    {162, 5}, {65, 5}, {209, 12}, {153, 5}, {165, 5}, {67, 5}, {177, 5},
    {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {209, 12}, {148, 6},
    {209, 12}, {151, 6}, {163, 6}, {66, 6}, {209, 12}, {154, 6}, {166, 6},
    {68, 6}, {178, 6}, {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6},
    {169, 6}, {70, 6}, {181, 6}, {76, 6}, {94, 6}, {65, 5}, {193, 6},
    {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {192, 11}, {152, 7}, {164, 7},
    {145, 3}, {204, 11}, {155, 7}, {167, 7}, {69, 7}, {179, 7}, {75, 7},
    {93, 7}, {64, 4}, {209, 12}, {158, 7}, {170, 7}, {71, 7}, {182, 7},
    {77, 7}, {95, 7}, {65, 5}, {194, 7}, {83, 7}, {101, 7}, {67, 5},
    {119, 7}, {73, 5}, {91, 5}, {1, 7}, {209, 12}, {209, 12}, {173, 7},
    {148, 6}, {185, 7}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7},
    {103, 7}, {68, 6}, {121, 7}, {74, 6}, {92, 6}, {2, 7}, {209, 12},
    {157, 6}, {109, 7}, {70, 6}, {127, 7}, {76, 6}, {94, 6}, {4, 7},
    {193, 6}, {82, 6}, {100, 6}, {8, 7}, {118, 6}, {16, 7}, {32, 7},
    {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {145, 3}, {207, 11}, {156, 8}, {168, 8}, {146, 4}, {180, 8},
    {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, {117, 11}, {72, 8},
    {135, 11}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8},
    {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12},
    {174, 8}, {148, 6}, {141, 11}, {80, 8}, {98, 8}, {66, 6}, {198, 8},
    {86, 8}, {104, 8}, {68, 6}, {122, 8}, {74, 6}, {92, 6}, {3, 8},
    {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8}, {76, 6}, {94, 6},
    {5, 8}, {193, 6}, {82, 6}, {100, 6}, {9, 8}, {118, 6}, {17, 8},
    {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8},
    {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7},
    {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8},
    {71, 7}, {130, 8}, {77, 7}, {95, 7}, {6, 8}, {194, 7}, {83, 7},
    {101, 7}, {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12},
    {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6},
    {197, 7}, {85, 7}, {103, 7}, {12, 8}, {121, 7}, {20, 8}, {36, 8},
    {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8},
    {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6},
    {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12}, {209, 12},
    {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {160, 9},
    {172, 9}, {147, 5}, {184, 9}, {150, 5}, {162, 5}, {65, 5}, {196, 9},
    {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5}, {64, 4},
    {209, 12}, {209, 12}, {175, 9}, {148, 6}, {143, 11}, {81, 9}, {99, 9},
    {66, 6}, {199, 9}, {87, 9}, {105, 9}, {68, 6}, {123, 9}, {74, 6},
    {92, 6}, {64, 4}, {209, 12}, {157, 6}, {111, 9}, {70, 6}, {129, 9},
    {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6}, {67, 5},
    {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {190, 9}, {152, 7}, {164, 7}, {145, 3}, {202, 9}, {89, 9},
    {107, 9}, {69, 7}, {125, 9}, {75, 7}, {93, 7}, {64, 4}, {209, 12},
    {158, 7}, {113, 9}, {71, 7}, {131, 9}, {31, 11}, {47, 11}, {7, 9},
    {194, 7}, {83, 7}, {55, 11}, {11, 9}, {119, 7}, {19, 9}, {35, 9},
    {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {137, 9}, {79, 7},
    {97, 7}, {66, 6}, {197, 7}, {85, 7}, {59, 11}, {13, 9}, {121, 7},
    {21, 9}, {37, 9}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6},
    {127, 7}, {25, 9}, {41, 9}, {4, 7}, {193, 6}, {82, 6}, {49, 9},
    {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {205, 9},
    {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4}, {64, 4},
    {209, 12}, {159, 8}, {115, 9}, {72, 8}, {133, 9}, {78, 8}, {96, 8},
    {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8}, {73, 5},
    {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6}, {139, 9},
    {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, {61, 11}, {14, 9},
    {122, 8}, {22, 9}, {38, 9}, {3, 8}, {209, 12}, {157, 6}, {110, 8},
    {70, 6}, {128, 8}, {26, 9}, {42, 9}, {5, 8}, {193, 6}, {82, 6},
    {50, 9}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7}, {145, 3},
    {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7}, {93, 7},
    {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8}, {28, 9},
    {44, 9}, {6, 8}, {194, 7}, {83, 7}, {52, 9}, {10, 8}, {119, 7},
    {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6},
    {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {56, 9},
    {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12}, {157, 6},
    {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7}, {193, 6},
    {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4},
    {161, 4}, {64, 4}, {209, 12}, {209, 12}, {209, 12}, {147, 5}, {209, 12},
    {150, 5}, {162, 5}, {65, 5}, {209, 12}, {153, 5}, {165, 5}, {67, 5},
    {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {176, 10},
    {148, 6}, {188, 10}, {151, 6}, {163, 6}, {66, 6}, {200, 10}, {154, 6},
    {166, 6}, {68, 6}, {178, 6}, {74, 6}, {92, 6}, {64, 4}, {209, 12},
    {157, 6}, {169, 6}, {70, 6}, {181, 6}, {76, 6}, {94, 6}, {65, 5},
    {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5},
    {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {191, 10}, {152, 7},
    {164, 7}, {145, 3}, {203, 10}, {90, 10}, {108, 10}, {69, 7}, {126, 10},
    {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {114, 10}, {71, 7},
    {132, 10}, {77, 7}, {95, 7}, {65, 5}, {194, 7}, {83, 7}, {101, 7},
    {67, 5}, {119, 7}, {73, 5}, {91, 5}, {1, 7}, {209, 12}, {209, 12},
    {173, 7}, {148, 6}, {138, 10}, {79, 7}, {97, 7}, {66, 6}, {197, 7},
    {85, 7}, {103, 7}, {68, 6}, {121, 7}, {74, 6}, {92, 6}, {2, 7},
    {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {76, 6}, {94, 6},
    {4, 7}, {193, 6}, {82, 6}, {100, 6}, {8, 7}, {118, 6}, {16, 7},
    {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {145, 3}, {206, 10}, {156, 8}, {168, 8}, {146, 4},
    {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, {116, 10},
    {72, 8}, {134, 10}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8},
    {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12},
    {209, 12}, {174, 8}, {148, 6}, {140, 10}, {80, 8}, {98, 8}, {66, 6},
    {198, 8}, {86, 8}, {62, 11}, {15, 10}, {122, 8}, {23, 10}, {39, 10},
    {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8}, {27, 10},
    {43, 10}, {5, 8}, {193, 6}, {82, 6}, {51, 10}, {9, 8}, {118, 6},
    {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8},
    {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7},
    {112, 8}, {71, 7}, {130, 8}, {29, 10}, {45, 10}, {6, 8}, {194, 7},
    {83, 7}, {53, 10}, {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7},
    {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7},
    {66, 6}, {197, 7}, {85, 7}, {57, 10}, {12, 8}, {121, 7}, {20, 8},
    {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7},
    {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7},
    {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12},
    {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12},
    {160, 9}, {172, 9}, {147, 5}, {184, 9}, {150, 5}, {162, 5}, {65, 5},
    {196, 9}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5},
    {64, 4}, {209, 12}, {209, 12}, {175, 9}, {148, 6}, {142, 10}, {81, 9},
    {99, 9}, {66, 6}, {199, 9}, {87, 9}, {105, 9}, {68, 6}, {123, 9},
    {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, {111, 9}, {70, 6},
    {129, 9}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6},
    {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {190, 9}, {152, 7}, {164, 7}, {145, 3}, {202, 9},
    {89, 9}, {107, 9}, {69, 7}, {125, 9}, {75, 7}, {93, 7}, {64, 4},
    {209, 12}, {158, 7}, {113, 9}, {71, 7}, {131, 9}, {30, 10}, {46, 10},
    {7, 9}, {194, 7}, {83, 7}, {54, 10}, {11, 9}, {119, 7}, {19, 9},
    {35, 9}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {137, 9},
    {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {58, 10}, {13, 9},
    {121, 7}, {21, 9}, {37, 9}, {2, 7}, {209, 12}, {157, 6}, {109, 7},
    {70, 6}, {127, 7}, {25, 9}, {41, 9}, {4, 7}, {193, 6}, {82, 6},
    {49, 9}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},
    {205, 9}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4},
    {64, 4}, {209, 12}, {159, 8}, {115, 9}, {72, 8}, {133, 9}, {78, 8},
    {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8},
    {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6},
    {139, 9}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, {60, 10},
    {14, 9}, {122, 8}, {22, 9}, {38, 9}, {3, 8}, {209, 12}, {157, 6},
    {110, 8}, {70, 6}, {128, 8}, {26, 9}, {42, 9}, {5, 8}, {193, 6},
    {82, 6}, {50, 9}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7},
    {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7},
    {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8},
    {28, 9}, {44, 9}, {6, 8}, {194, 7}, {83, 7}, {52, 9}, {10, 8},
    {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7},
    {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7},
    {56, 9}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12},
    {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7},
    {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7},
    {0, 6}};
} // namespace utf8_to_utf16
} // namespace tables
} // unnamed namespace
} // namespace simdutf

#endif // SIMDUTF_UTF8_TO_UTF16_TABLES_H
/* end file src/tables/utf8_to_utf16_tables.h */
/* begin file src/tables/utf16_to_utf8_tables.h */
// file generated by scripts/sse_convert_utf16_to_utf8.py
#ifndef SIMDUTF_UTF16_TO_UTF8_TABLES_H
#define SIMDUTF_UTF16_TO_UTF8_TABLES_H

namespace simdutf {
namespace {
namespace tables {
namespace utf16_to_utf8 {

// 1 byte for length, 16 bytes for mask
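// Layout sketch (an assumption inferred from the generator script named
// above and from the SSE conversion routines later in this file): byte 0 of
// each row is the count of useful output bytes, and bytes 1..16 form a
// shuffle mask for a byte shuffle such as _mm_shuffle_epi8, where 0x80
// zeroes a lane so it is ignored past the first count bytes. Illustrative
// use, with `row`, `mask`, `utf8_packed` and `utf8_output` hypothetical:
//
//   const uint8_t *row = pack_1_2_utf8_bytes[mask];
//   __m128i shuffled = _mm_shuffle_epi8(
//       utf8_packed, _mm_loadu_si128((const __m128i *)(row + 1)));
//   _mm_storeu_si128((__m128i *)utf8_output, shuffled);
//   utf8_output += row[0]; // advance by the number of useful bytes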
const uint8_t pack_1_2_utf8_bytes[256][17] = {
    {16, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14},
    {15, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80},
    {15, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80},
    {14, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
    {15, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80},
    {14, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
    {14, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
    {13, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {15, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80},
    {14, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80},
    {14, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80},
    {13, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {14, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80},
    {13, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {13, 1, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {15, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80},
    {14, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
    {14, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
    {13, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {14, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
    {13, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {13, 1, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {14, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80},
    {13, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {13, 1, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {15, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80},
    {14, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80},
    {14, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80},
    {13, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80},
    {14, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80},
    {13, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80},
    {13, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80},
    {13, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {13, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 5, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 5, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {14, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80},
    {13, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {13, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 3, 2, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 3, 2, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {11, 1, 0, 2, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 0, 2, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {15, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80},
    {14, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
    {14, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
    {13, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {14, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
    {13, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {13, 1, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {14, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80},
    {13, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {13, 1, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 5, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 5, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {14, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
    {13, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {13, 1, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 3, 2, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 3, 2, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {11, 1, 0, 2, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 0, 2, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {14, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80},
    {13, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {13, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 5, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 5, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 3, 2, 5, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 3, 2, 5, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 5, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 5, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {11, 1, 0, 2, 5, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 0, 2, 5, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 3, 2, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 3, 2, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {11, 1, 0, 2, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 0, 2, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 3, 2, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 3, 2, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {11, 1, 0, 3, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 0, 3, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {11, 1, 0, 2, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 0, 2, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 1, 0, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {9, 0, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {15, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80},
    {14, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80},
    {14, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80},
    {13, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
    {14, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80},
    {13, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
    {13, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
    {12, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80},
    {13, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {13, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80},
    {12, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {14, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80},
    {13, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{13, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
|
|
{12, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{12, 1, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{13, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80},
|
|
{12, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{12, 1, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{12, 1, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 1, 0, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{10, 0, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80},
|
|
{13, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80},
|
|
{13, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80},
|
|
{12, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{13, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80},
|
|
{12, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{12, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{13, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80},
|
|
{12, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{12, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{12, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 1, 0, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{10, 0, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{13, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80},
|
|
{12, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{12, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{12, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 1, 0, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{10, 0, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{12, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 1, 0, 3, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{10, 0, 3, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 1, 0, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{10, 0, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{10, 1, 0, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{9, 0, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{14, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80},
|
|
{13, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
|
|
{13, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
|
|
{12, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{13, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
|
|
{12, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{12, 1, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{13, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80},
|
|
{12, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{12, 1, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{12, 1, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 1, 0, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{10, 0, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{13, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
|
|
{12, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{12, 1, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{12, 1, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 1, 0, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{10, 0, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{12, 1, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 1, 0, 3, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{10, 0, 3, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 1, 0, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{10, 0, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{10, 1, 0, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{9, 0, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{13, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80},
|
|
{12, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{12, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{12, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 1, 0, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{10, 0, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{12, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 1, 0, 3, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{10, 0, 3, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 1, 0, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{10, 0, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{10, 1, 0, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{9, 0, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{12, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 1, 0, 3, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{10, 0, 3, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 1, 0, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{10, 0, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{10, 1, 0, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{9, 0, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{11, 1, 0, 3, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{10, 0, 3, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{10, 1, 0, 3, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{9, 0, 3, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{10, 1, 0, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{9, 0, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{9, 1, 0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{8, 0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80}};

// 1 byte for length, 16 bytes for mask
const uint8_t pack_1_2_3_utf8_bytes[256][17] = {
{12, 2, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80},
{9, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{11, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80},
{10, 0, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
{9, 2, 3, 1, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{6, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{8, 3, 1, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{7, 0, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{11, 2, 3, 1, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80},
{8, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{10, 3, 1, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
{9, 0, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{10, 2, 3, 1, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
{7, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{9, 3, 1, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{8, 0, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{9, 2, 3, 1, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
{6, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{8, 3, 1, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{7, 0, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{6, 2, 3, 1, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{3, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{5, 3, 1, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{4, 0, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{8, 2, 3, 1, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{5, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{7, 3, 1, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{6, 0, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{7, 2, 3, 1, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{4, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{6, 3, 1, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{5, 0, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{11, 2, 3, 1, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80},
{8, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{10, 3, 1, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
{9, 0, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{8, 2, 3, 1, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{7, 3, 1, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{6, 0, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{10, 2, 3, 1, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
{7, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{9, 3, 1, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{8, 0, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{9, 2, 3, 1, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{6, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{8, 3, 1, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{7, 0, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{10, 2, 3, 1, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
{7, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{9, 3, 1, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
{8, 0, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{7, 2, 3, 1, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{6, 3, 1, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{5, 0, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{9, 2, 3, 1, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
{6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{8, 3, 1, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{7, 0, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{8, 2, 3, 1, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{5, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{7, 3, 1, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{6, 0, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{9, 2, 3, 1, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
{6, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{8, 3, 1, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{7, 0, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{6, 2, 3, 1, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{3, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{5, 3, 1, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{4, 0, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{8, 2, 3, 1, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{5, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{7, 3, 1, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{6, 0, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{7, 2, 3, 1, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{4, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{6, 3, 1, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{5, 0, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{6, 2, 3, 1, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{3, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{5, 3, 1, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{4, 0, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{3, 2, 3, 1, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80, 0x80},
{2, 3, 1, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80, 0x80},
{5, 2, 3, 1, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{2, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{4, 3, 1, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{3, 0, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{4, 2, 3, 1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80, 0x80},
{3, 3, 1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{2, 0, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{8, 2, 3, 1, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{5, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{7, 3, 1, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{6, 0, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{5, 2, 3, 1, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{2, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{4, 3, 1, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{3, 0, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{7, 2, 3, 1, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{4, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{6, 3, 1, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{5, 0, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{6, 2, 3, 1, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{3, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{5, 3, 1, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{4, 0, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{7, 2, 3, 1, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{4, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{6, 3, 1, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{5, 0, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{4, 2, 3, 1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80, 0x80},
{3, 3, 1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{2, 0, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{6, 2, 3, 1, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{3, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{5, 3, 1, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{4, 0, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{5, 2, 3, 1, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{2, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{4, 3, 1, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{3, 0, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{11, 2, 3, 1, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80},
{8, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{10, 3, 1, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
{9, 0, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{8, 2, 3, 1, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{7, 3, 1, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{6, 0, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{10, 2, 3, 1, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
{7, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{9, 3, 1, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{8, 0, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{9, 2, 3, 1, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{6, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{8, 3, 1, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{7, 0, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{8, 2, 3, 1, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{5, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{7, 3, 1, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{6, 0, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{5, 2, 3, 1, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{2, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80, 0x80},
{4, 3, 1, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{3, 0, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{7, 2, 3, 1, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{4, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{6, 3, 1, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{5, 0, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{6, 2, 3, 1, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{3, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{5, 3, 1, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{4, 0, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{10, 2, 3, 1, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
{7, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{9, 3, 1, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
{8, 0, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{7, 2, 3, 1, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{6, 3, 1, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{5, 0, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{9, 2, 3, 1, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
{6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{8, 3, 1, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{7, 0, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{8, 2, 3, 1, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{5, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{7, 3, 1, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{6, 0, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{9, 2, 3, 1, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
{6, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{8, 3, 1, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{7, 0, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{6, 2, 3, 1, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{3, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{5, 3, 1, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{4, 0, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{8, 2, 3, 1, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{5, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{7, 3, 1, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{6, 0, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{7, 2, 3, 1, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{4, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{6, 3, 1, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{5, 0, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{10, 2, 3, 1, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
{7, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{9, 3, 1, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
{8, 0, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{7, 2, 3, 1, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{6, 3, 1, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{5, 0, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{9, 2, 3, 1, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
{6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{8, 3, 1, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{7, 0, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{8, 2, 3, 1, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{5, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{7, 3, 1, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{6, 0, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{7, 2, 3, 1, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{4, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{6, 3, 1, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{5, 0, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{4, 2, 3, 1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80, 0x80},
{3, 3, 1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{2, 0, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{6, 2, 3, 1, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{3, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{5, 3, 1, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{4, 0, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{5, 2, 3, 1, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{2, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{4, 3, 1, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{3, 0, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{9, 2, 3, 1, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
{6, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{8, 3, 1, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{7, 0, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{6, 2, 3, 1, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{3, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{5, 3, 1, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{4, 0, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{8, 2, 3, 1, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{5, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{7, 3, 1, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{6, 0, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{7, 2, 3, 1, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{4, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{6, 3, 1, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{5, 0, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{8, 2, 3, 1, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{5, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{7, 3, 1, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{6, 0, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{5, 2, 3, 1, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{2, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{4, 3, 1, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{3, 0, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{7, 2, 3, 1, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{4, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{6, 3, 1, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{5, 0, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{6, 2, 3, 1, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80},
{3, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80, 0x80},
{5, 3, 1, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80},
{4, 0, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 0x80, 0x80}};
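
// How these rows are consumed (a minimal scalar sketch, not the SIMD kernel
// the library runs): per the comment above the table, entry [0] is the
// number of useful output bytes and entries [1..16] form a shuffle control
// with pshufb-style semantics, where a 0x80 entry means "emit a zero byte".
// The helper name below is hypothetical.
#if 0
inline size_t apply_pack_row(const uint8_t row[17], const uint8_t in[16],
                             uint8_t out[16]) {
  for (size_t i = 0; i < 16; i++) {
    const uint8_t idx = row[1 + i];
    out[i] = (idx & 0x80) ? 0 : in[idx & 0x0F]; // high bit set => zero lane
  }
  return row[0]; // caller advances its output pointer by this many bytes
}
#endif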

} // namespace utf16_to_utf8
} // namespace tables
} // unnamed namespace
} // namespace simdutf

#endif // SIMDUTF_UTF16_TO_UTF8_TABLES_H
/* end file src/tables/utf16_to_utf8_tables.h */
/* begin file src/tables/utf32_to_utf16_tables.h */
// file generated by scripts/sse_convert_utf32_to_utf16.py
#ifndef SIMDUTF_UTF32_TO_UTF16_TABLES_H
#define SIMDUTF_UTF32_TO_UTF16_TABLES_H

namespace simdutf {
namespace {
namespace tables {
namespace utf32_to_utf16 {

const uint8_t pack_utf32_to_utf16le[16][16] = {
{0, 1, 4, 5, 8, 9, 12, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
{0, 1, 2, 3, 4, 5, 8, 9, 12, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
{0, 1, 4, 5, 6, 7, 8, 9, 12, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 0x80, 0x80, 0x80, 0x80},
{0, 1, 4, 5, 8, 9, 10, 11, 12, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
{0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 0x80, 0x80, 0x80, 0x80},
{0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 0x80, 0x80, 0x80},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 0x80},
{0, 1, 4, 5, 8, 9, 12, 13, 14, 15, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
{0, 1, 2, 3, 4, 5, 8, 9, 12, 13, 14, 15, 0x80, 0x80, 0x80, 0x80},
{0, 1, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, 0x80, 0x80, 0x80, 0x80},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, 0x80, 0x80},
{0, 1, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 0x80, 0x80, 0x80, 0x80},
{0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 0x80, 0x80},
{0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0x80, 0x80},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
};

const uint8_t pack_utf32_to_utf16be[16][16] = {
{1, 0, 5, 4, 9, 8, 13, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
{1, 0, 3, 2, 5, 4, 9, 8, 13, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
{1, 0, 5, 4, 7, 6, 9, 8, 13, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
{1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 13, 12, 0x80, 0x80, 0x80, 0x80},
{1, 0, 5, 4, 9, 8, 11, 10, 13, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
{1, 0, 3, 2, 5, 4, 9, 8, 11, 10, 13, 12, 0x80, 0x80, 0x80, 0x80},
{1, 0, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 0x80, 0x80, 0x80, 0x80},
{1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 0x80, 0x80},
{1, 0, 5, 4, 9, 8, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
{1, 0, 3, 2, 5, 4, 9, 8, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
{1, 0, 5, 4, 7, 6, 9, 8, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
{1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 13, 12, 15, 14, 0x80, 0x80},
{1, 0, 5, 4, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
{1, 0, 3, 2, 5, 4, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
{1, 0, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
{1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14},
};

} // namespace utf32_to_utf16
} // namespace tables
} // unnamed namespace
} // namespace simdutf

#endif // SIMDUTF_UTF32_TO_UTF16_TABLES_H
/* end file src/tables/utf32_to_utf16_tables.h */
// End of tables.

// Implementations: they need to be set up before including
// scalar/* code, as the scalar code is sometimes enabled
// only for peculiar build targets.

// The best choice should always come first!
/* begin file src/simdutf/arm64.h */
#ifndef SIMDUTF_ARM64_H
#define SIMDUTF_ARM64_H

#ifdef SIMDUTF_FALLBACK_H
  #error "arm64.h must be included before fallback.h"
#endif


#ifndef SIMDUTF_IMPLEMENTATION_ARM64
  #define SIMDUTF_IMPLEMENTATION_ARM64 (SIMDUTF_IS_ARM64)
#endif
#if SIMDUTF_IMPLEMENTATION_ARM64 && SIMDUTF_IS_ARM64
  #define SIMDUTF_CAN_ALWAYS_RUN_ARM64 1
#else
  #define SIMDUTF_CAN_ALWAYS_RUN_ARM64 0
#endif


#if SIMDUTF_IMPLEMENTATION_ARM64

namespace simdutf {
/**
 * Implementation for NEON (ARMv8).
 */
namespace arm64 {} // namespace arm64
} // namespace simdutf

/* begin file src/simdutf/arm64/implementation.h */
#ifndef SIMDUTF_ARM64_IMPLEMENTATION_H
#define SIMDUTF_ARM64_IMPLEMENTATION_H


namespace simdutf {
namespace arm64 {

namespace {
using namespace simdutf;
}

class implementation final : public simdutf::implementation {
public:
  simdutf_really_inline implementation()
      : simdutf::implementation("arm64", "ARM NEON",
                                internal::instruction_set::NEON) {}
#if SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused int detect_encodings(const char *input,
      size_t length) const noexcept final;
#endif // SIMDUTF_FEATURE_DETECT_ENCODING
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused bool validate_utf8(const char *buf,
      size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
#if SIMDUTF_FEATURE_UTF8
  simdutf_warn_unused result
  validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8
#if SIMDUTF_FEATURE_ASCII
  simdutf_warn_unused bool validate_ascii(const char *buf,
      size_t len) const noexcept final;
  simdutf_warn_unused result
  validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_ASCII

#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
      size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
#if SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
      size_t len) const noexcept final;
  simdutf_warn_unused result validate_utf16le_with_errors(
      const char16_t *buf, size_t len) const noexcept final;
  simdutf_warn_unused result validate_utf16be_with_errors(
      const char16_t *buf, size_t len) const noexcept final;
  void to_well_formed_utf16be(const char16_t *input, size_t len,
      char16_t *output) const noexcept final;
  void to_well_formed_utf16le(const char16_t *input, size_t len,
      char16_t *output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused bool validate_utf32(const char32_t *buf,
      size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
#if SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused result validate_utf32_with_errors(
      const char32_t *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_latin1_to_utf8(
      const char *buf, size_t len, char *utf8_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_latin1_to_utf16le(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_latin1_to_utf16be(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_latin1_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
      const char *buf, size_t len, char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t convert_utf8_to_utf16le(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused size_t convert_utf8_to_utf16be(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t convert_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  convert_utf16le_to_latin1(const char16_t *buf, size_t len,
      char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf16be_to_latin1(const char16_t *buf, size_t len,
      char *latin1_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
      const char16_t *buf, size_t len,
      char *latin1_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
      const char16_t *buf, size_t len,
      char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len,
      char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len,
      char *latin1_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t convert_utf16le_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_utf16be_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  convert_utf32_to_latin1(const char32_t *buf, size_t len,
      char *latin1_output) const noexcept final;
  simdutf_warn_unused result
  convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
      char *latin1_output) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_latin1(const char32_t *buf, size_t len,
      char *latin1_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t convert_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  convert_utf32_to_utf16le(const char32_t *buf, size_t len,
      char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf32_to_utf16be(const char32_t *buf, size_t len,
      char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
      const char32_t *buf, size_t len,
      char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
      const char32_t *buf, size_t len,
      char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len,
      char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len,
      char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf16le_to_utf32(const char16_t *buf, size_t len,
      char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf16be_to_utf32(const char16_t *buf, size_t len,
      char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
      const char16_t *buf, size_t len,
      char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
      const char16_t *buf, size_t len,
      char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len,
      char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len,
      char32_t *utf32_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF16
  void change_endianness_utf16(const char16_t *buf, size_t length,
      char16_t *output) const noexcept final;
  simdutf_warn_unused size_t count_utf16le(const char16_t *buf,
      size_t length) const noexcept;
  simdutf_warn_unused size_t count_utf16be(const char16_t *buf,
      size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF8
  simdutf_warn_unused size_t count_utf8(const char *buf,
      size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t
  utf8_length_from_utf16le(const char16_t *input, size_t length) const noexcept;
  simdutf_warn_unused size_t
  utf8_length_from_utf16be(const char16_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t utf32_length_from_utf16le(
      const char16_t *input, size_t length) const noexcept;
  simdutf_warn_unused size_t utf32_length_from_utf16be(
      const char16_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t
  utf16_length_from_utf8(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  utf8_length_from_utf32(const char32_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  utf16_length_from_utf32(const char32_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  utf32_length_from_utf8(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  latin1_length_from_utf8(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  utf8_length_from_latin1(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_BASE64
  simdutf_warn_unused result base64_to_binary(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept;
  simdutf_warn_unused result
  base64_to_binary(const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept;
  size_t binary_to_base64(const char *input, size_t length, char *output,
      base64_options options) const noexcept;
#endif // SIMDUTF_FEATURE_BASE64
};

} // namespace arm64
} // namespace simdutf

#endif // SIMDUTF_ARM64_IMPLEMENTATION_H
/* end file src/simdutf/arm64/implementation.h */

/* begin file src/simdutf/arm64/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "arm64"
// #define SIMDUTF_IMPLEMENTATION arm64
#define SIMDUTF_SIMD_HAS_BYTEMASK 1
/* end file src/simdutf/arm64/begin.h */

// Declarations
/* begin file src/simdutf/arm64/intrinsics.h */
#ifndef SIMDUTF_ARM64_INTRINSICS_H
#define SIMDUTF_ARM64_INTRINSICS_H


// This should be the correct header whether
// you use visual studio or other compilers.
#include <arm_neon.h>

#endif // SIMDUTF_ARM64_INTRINSICS_H
/* end file src/simdutf/arm64/intrinsics.h */
/* begin file src/simdutf/arm64/bitmanipulation.h */
#ifndef SIMDUTF_ARM64_BITMANIPULATION_H
#define SIMDUTF_ARM64_BITMANIPULATION_H

namespace simdutf {
namespace arm64 {
namespace {

/* result might be undefined when input_num is zero */
simdutf_really_inline int count_ones(uint64_t input_num) {
  return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
}

#if SIMDUTF_NEED_TRAILING_ZEROES
simdutf_really_inline int trailing_zeroes(uint64_t input_num) {
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
  unsigned long ret;
  // Search the mask data from least significant bit (LSB)
  // to the most significant bit (MSB) for a set bit (1).
  _BitScanForward64(&ret, input_num);
  return (int)ret;
#else // SIMDUTF_REGULAR_VISUAL_STUDIO
  return __builtin_ctzll(input_num);
#endif // SIMDUTF_REGULAR_VISUAL_STUDIO
}
#endif
template <typename T> T clear_least_significant_bit(T x) {
  return (x & (x - 1));
}

} // unnamed namespace
} // namespace arm64
} // namespace simdutf

#endif // SIMDUTF_ARM64_BITMANIPULATION_H
|
|
/* end file src/simdutf/arm64/bitmanipulation.h */
|
|
/* begin file src/simdutf/arm64/simd.h */
|
|
#ifndef SIMDUTF_ARM64_SIMD_H
|
|
#define SIMDUTF_ARM64_SIMD_H
|
|
|
|
#include <type_traits>
|
|
|
|
namespace simdutf {
|
|
namespace arm64 {
|
|
namespace {
|
|
namespace simd {
|
|
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
namespace {
|
|
// Start of private section with Visual Studio workaround
|
|
|
|
#ifndef simdutf_make_uint8x16_t
|
|
#define simdutf_make_uint8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, \
|
|
x11, x12, x13, x14, x15, x16) \
|
|
([=]() { \
|
|
uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8, \
|
|
x9, x10, x11, x12, x13, x14, x15, x16}; \
|
|
return vld1q_u8(array); \
|
|
}())
|
|
#endif
|
|
#ifndef simdutf_make_int8x16_t
|
|
#define simdutf_make_int8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, \
|
|
x11, x12, x13, x14, x15, x16) \
|
|
([=]() { \
|
|
int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8, \
|
|
x9, x10, x11, x12, x13, x14, x15, x16}; \
|
|
return vld1q_s8(array); \
|
|
}())
|
|
#endif
|
|
|
|
#ifndef simdutf_make_uint8x8_t
|
|
#define simdutf_make_uint8x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \
|
|
([=]() { \
|
|
uint8_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8}; \
|
|
return vld1_u8(array); \
|
|
}())
|
|
#endif
|
|
#ifndef simdutf_make_int8x8_t
|
|
#define simdutf_make_int8x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \
|
|
([=]() { \
|
|
int8_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8}; \
|
|
return vld1_s8(array); \
|
|
}())
|
|
#endif
|
|
#ifndef simdutf_make_uint16x8_t
|
|
#define simdutf_make_uint16x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \
|
|
([=]() { \
|
|
uint16_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8}; \
|
|
return vld1q_u16(array); \
|
|
}())
|
|
#endif
|
|
#ifndef simdutf_make_int16x8_t
|
|
#define simdutf_make_int16x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \
|
|
([=]() { \
|
|
int16_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8}; \
|
|
return vld1q_s16(array); \
|
|
}())
|
|
#endif
|
|
|
|
// End of private section with Visual Studio workaround
|
|
} // namespace
|
|
#endif // SIMDUTF_REGULAR_VISUAL_STUDIO

template <typename T> struct simd8;

//
// Base class of simd8<uint8_t> and simd8<bool>, both of which use uint8x16_t
// internally.
//
template <typename T, typename Mask = simd8<bool>> struct base_u8 {
  uint8x16_t value;
  static const int SIZE = sizeof(value);
  void dump() const {
    uint8_t temp[16];
    vst1q_u8(temp, *this);
    printf("[%02x, %02x, %02x, %02x, %02x, %02x, %02x, %02x, %02x, %02x, "
           "%02x, %02x, %02x, %02x, %02x, %02x]\n",
           temp[0], temp[1], temp[2], temp[3], temp[4], temp[5], temp[6],
           temp[7], temp[8], temp[9], temp[10], temp[11], temp[12], temp[13],
           temp[14], temp[15]);
  }
  // Conversion from/to SIMD register
  simdutf_really_inline base_u8(const uint8x16_t _value) : value(_value) {}
  simdutf_really_inline operator const uint8x16_t &() const {
    return this->value;
  }

  // Bit operations
  simdutf_really_inline simd8<T> operator|(const simd8<T> other) const {
    return vorrq_u8(*this, other);
  }
  simdutf_really_inline simd8<T> operator&(const simd8<T> other) const {
    return vandq_u8(*this, other);
  }
  simdutf_really_inline simd8<T> operator^(const simd8<T> other) const {
    return veorq_u8(*this, other);
  }
  simdutf_really_inline simd8<T> &operator|=(const simd8<T> other) {
    auto this_cast = static_cast<simd8<T> *>(this);
    *this_cast = *this_cast | other;
    return *this_cast;
  }

  friend simdutf_really_inline Mask operator==(const simd8<T> lhs,
                                               const simd8<T> rhs) {
    return vceqq_u8(lhs, rhs);
  }

  template <int N = 1>
  simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
    return vextq_u8(prev_chunk, *this, 16 - N);
  }
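  // Note on prev() above: vextq_u8 concatenates prev_chunk and *this and
  // extracts 16 consecutive bytes starting at offset 16 - N, i.e. the last
  // N bytes of prev_chunk followed by the first 16 - N bytes of *this.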
};

// SIMD byte mask type (returned by things like eq and gt)
template <> struct simd8<bool> : base_u8<bool> {
  static simdutf_really_inline simd8<bool> splat(bool _value) {
    return vmovq_n_u8(uint8_t(-(!!_value)));
  }

  simdutf_really_inline simd8(const uint8x16_t _value)
      : base_u8<bool>(_value) {}
  // False constructor
  simdutf_really_inline simd8() : simd8(vdupq_n_u8(0)) {}
  // Splat constructor
  simdutf_really_inline simd8(bool _value) : simd8(splat(_value)) {}
  simdutf_really_inline void store(uint8_t dst[16]) const {
    return vst1q_u8(dst, *this);
  }

  // We return uint32_t instead of uint16_t because that seems to be more
  // efficient for most purposes (cutting it down to uint16_t costs performance
  // in some compilers).
  simdutf_really_inline uint32_t to_bitmask() const {
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
    const uint8x16_t bit_mask =
        simdutf_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
#else
    const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
#endif
    auto minput = *this & bit_mask;
    uint8x16_t tmp = vpaddq_u8(minput, minput);
    tmp = vpaddq_u8(tmp, tmp);
    tmp = vpaddq_u8(tmp, tmp);
    return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
  }
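  // Illustrative walk-through of to_bitmask: for a comparison mask with
  // lanes {FF, 00, FF, ...}, the AND with bit_mask keeps {01, 00, 04, ...};
  // each vpaddq_u8 sums adjacent byte pairs, so three rounds fold 16 bytes
  // down to 2, leaving one bit per original byte in the low 16 bits.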

  // Returns 4 bits from each byte, alternating between the high and the low
  // nibble; the result is a 64-bit word. This method is expected to be faster
  // than none() and is equivalent to it when the vector register is the
  // result of a comparison, with byte values 0xff and 0x00.
  simdutf_really_inline uint64_t to_bitmask64() const {
    return vget_lane_u64(
        vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(*this), 4)), 0);
  }
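  // Note on to_bitmask64 above: vshrn_n_u16(..., 4) shifts each 16-bit lane
  // right by 4 and narrows it to 8 bits, so every input byte contributes one
  // nibble; a 0xff/0x00 comparison mask therefore packs into a single 64-bit
  // scalar holding 4 bits per byte.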
};

// Unsigned bytes
template <> struct simd8<uint8_t> : base_u8<uint8_t> {
  static simdutf_really_inline simd8<uint8_t> splat(uint8_t _value) {
    return vmovq_n_u8(_value);
  }
  static simdutf_really_inline simd8<uint8_t> zero() { return vdupq_n_u8(0); }
  static simdutf_really_inline simd8<uint8_t> load(const uint8_t *values) {
    return vld1q_u8(values);
  }
  simdutf_really_inline simd8(const uint8x16_t _value)
      : base_u8<uint8_t>(_value) {}
  // Zero constructor
  simdutf_really_inline simd8() : simd8(zero()) {}
  // Array constructor
  simdutf_really_inline simd8(const uint8_t values[16]) : simd8(load(values)) {}
  // Splat constructor
  simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
  // Member-by-member initialization
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
  simdutf_really_inline
  simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
        uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
        uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
      : simd8(simdutf_make_uint8x16_t(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
                                      v10, v11, v12, v13, v14, v15)) {}
#else
  simdutf_really_inline
  simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
        uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
        uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
      : simd8(uint8x16_t{v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
                         v13, v14, v15}) {}
#endif

  // Repeat 16 values as many times as necessary (usually for lookup tables)
  simdutf_really_inline static simd8<uint8_t>
  repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
            uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9,
            uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14,
            uint8_t v15) {
    return simd8<uint8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
                          v12, v13, v14, v15);
  }

  // Store to array
  simdutf_really_inline void store(uint8_t dst[16]) const {
    return vst1q_u8(dst, *this);
  }

  // Addition/subtraction are the same for signed and unsigned
  simdutf_really_inline simd8<uint8_t>
  operator-(const simd8<uint8_t> other) const {
    return vsubq_u8(*this, other);
  }
  simdutf_really_inline simd8<uint8_t> &operator-=(const simd8<uint8_t> other) {
    *this = *this - other;
    return *this;
  }

  // Order-specific operations
  simdutf_really_inline uint8_t max_val() const { return vmaxvq_u8(*this); }
  simdutf_really_inline simd8<bool>
  operator>=(const simd8<uint8_t> other) const {
    return vcgeq_u8(*this, other);
  }
  simdutf_really_inline simd8<bool>
  operator>(const simd8<uint8_t> other) const {
    return vcgtq_u8(*this, other);
  }
  // Same as >, but guarantees only that false == 0 and true == nonzero,
  // rather than true == all 1's. On ARM, this does return all 1's.
  simdutf_really_inline simd8<uint8_t>
  gt_bits(const simd8<uint8_t> other) const {
    return simd8<uint8_t>(*this > other);
  }

  // Bit-specific operations
  simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const {
    return vtstq_u8(*this, bits);
  }

  simdutf_really_inline bool is_ascii() const {
    return this->max_val() < 0b10000000u;
  }

  simdutf_really_inline bool any_bits_set_anywhere() const {
    return this->max_val() != 0;
  }
  template <int N> simdutf_really_inline simd8<uint8_t> shr() const {
    return vshrq_n_u8(*this, N);
  }
  // Use a widening sum (vaddlvq_u8) so that the total, up to 16 * 255,
  // fits the uint16_t return type without wrapping.
  simdutf_really_inline uint16_t sum_bytes() const { return vaddlvq_u8(*this); }

  // Perform a lookup assuming the value is between 0 and 16 (undefined
  // behavior for out of range values)
  template <typename L>
  simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
    return lookup_table.apply_lookup_16_to(*this);
  }

  template <typename L>
  simdutf_really_inline simd8<L>
  lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
            L replace5, L replace6, L replace7, L replace8, L replace9,
            L replace10, L replace11, L replace12, L replace13, L replace14,
            L replace15) const {
    return lookup_16(simd8<L>::repeat_16(
        replace0, replace1, replace2, replace3, replace4, replace5, replace6,
        replace7, replace8, replace9, replace10, replace11, replace12,
        replace13, replace14, replace15));
  }

  template <typename T>
  simdutf_really_inline simd8<uint8_t>
  apply_lookup_16_to(const simd8<T> original) const {
    return vqtbl1q_u8(*this, simd8<uint8_t>(original));
  }
};

// Signed bytes
template <> struct simd8<int8_t> {
  int8x16_t value;
  static const int SIZE = sizeof(value);

  static simdutf_really_inline simd8<int8_t> splat(int8_t _value) {
    return vmovq_n_s8(_value);
  }
  static simdutf_really_inline simd8<int8_t> zero() { return vdupq_n_s8(0); }
  static simdutf_really_inline simd8<int8_t> load(const int8_t values[16]) {
    return vld1q_s8(values);
  }

  // Use ST2 instead of UXTL+UXTL2 to interleave zeroes. UXTL is an alias of
  // USHLL #0, and shifting in NEON is quite slow.
  //
  // While this needs the registers to be in a specific order, bigger cores can
  // interleave these with no overhead, and it still performs decently on
  // little cores.
  //   movi v1.3d, #0
  //   mov v0.16b, value[0]
  //   st2 {v0.16b, v1.16b}, [ptr], #32
  //   mov v0.16b, value[1]
  //   st2 {v0.16b, v1.16b}, [ptr], #32
  //   ...
  template <endianness big_endian>
  simdutf_really_inline void store_ascii_as_utf16(char16_t *p) const {
    int8x16x2_t pair = match_system(big_endian)
                           ? int8x16x2_t{{this->value, vmovq_n_s8(0)}}
                           : int8x16x2_t{{vmovq_n_s8(0), this->value}};
    vst2q_s8(reinterpret_cast<int8_t *>(p), pair);
  }
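  // Note on store_ascii_as_utf16 above: match_system(big_endian) is true when
  // the requested endianness matches the host byte order, in which case the
  // ASCII byte leads each 16-bit code unit in memory; otherwise the zero byte
  // leads.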

  // In places where the table can be reused, which covers most uses in
  // simdutf, it is worth doing 4 table lookups, as there is no direct zero
  // extension from u8 to u32.
  simdutf_really_inline void store_ascii_as_utf32_tbl(char32_t *p) const {
    const simd8<uint8_t> tb1{0, 255, 255, 255, 1, 255, 255, 255,
                             2, 255, 255, 255, 3, 255, 255, 255};
    const simd8<uint8_t> tb2{4, 255, 255, 255, 5, 255, 255, 255,
                             6, 255, 255, 255, 7, 255, 255, 255};
    const simd8<uint8_t> tb3{8,  255, 255, 255, 9,  255, 255, 255,
                             10, 255, 255, 255, 11, 255, 255, 255};
    const simd8<uint8_t> tb4{12, 255, 255, 255, 13, 255, 255, 255,
                             14, 255, 255, 255, 15, 255, 255, 255};

    // encourage store pairing and interleaving
    const auto shuf1 = this->apply_lookup_16_to(tb1);
    const auto shuf2 = this->apply_lookup_16_to(tb2);
    shuf1.store(reinterpret_cast<int8_t *>(p));
    shuf2.store(reinterpret_cast<int8_t *>(p + 4));

    const auto shuf3 = this->apply_lookup_16_to(tb3);
    const auto shuf4 = this->apply_lookup_16_to(tb4);
    shuf3.store(reinterpret_cast<int8_t *>(p + 8));
    shuf4.store(reinterpret_cast<int8_t *>(p + 12));
  }
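  // Note on the tables above: each tbX selects one source byte per 32-bit
  // lane, and index 255 is out of range for vqtbl1q_u8, so it yields zero,
  // which provides the zero extension.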
  // Conversion from/to SIMD register
  simdutf_really_inline simd8(const int8x16_t _value) : value{_value} {}
  simdutf_really_inline operator const int8x16_t &() const {
    return this->value;
  }
#ifndef SIMDUTF_REGULAR_VISUAL_STUDIO
  simdutf_really_inline operator const uint8x16_t() const {
    return vreinterpretq_u8_s8(this->value);
  }
#endif
  simdutf_really_inline operator int8x16_t &() { return this->value; }

  // Zero constructor
  simdutf_really_inline simd8() : simd8(zero()) {}
  // Splat constructor
  simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
  // Array constructor
  simdutf_really_inline simd8(const int8_t *values) : simd8(load(values)) {}
  // Member-by-member initialization
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
  simdutf_really_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3,
                              int8_t v4, int8_t v5, int8_t v6, int8_t v7,
                              int8_t v8, int8_t v9, int8_t v10, int8_t v11,
                              int8_t v12, int8_t v13, int8_t v14, int8_t v15)
      : simd8(simdutf_make_int8x16_t(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
                                     v10, v11, v12, v13, v14, v15)) {}
#else
  simdutf_really_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3,
                              int8_t v4, int8_t v5, int8_t v6, int8_t v7,
                              int8_t v8, int8_t v9, int8_t v10, int8_t v11,
                              int8_t v12, int8_t v13, int8_t v14, int8_t v15)
      : simd8(int8x16_t{v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
                        v13, v14, v15}) {}
#endif

  // Store to array
  simdutf_really_inline void store(int8_t dst[16]) const {
    return vst1q_s8(dst, value);
  }

  // Explicit conversion to/from unsigned
  //
  // Under Visual Studio/ARM64, uint8x16_t and int8x16_t are apparently the
  // same type. In theory, we could detect this case with std::is_same and
  // std::enable_if, but it is relatively ugly and hard to read.
#ifndef SIMDUTF_REGULAR_VISUAL_STUDIO
  simdutf_really_inline explicit simd8(const uint8x16_t other)
      : simd8(vreinterpretq_s8_u8(other)) {}
#endif
  simdutf_really_inline operator simd8<uint8_t>() const {
    return vreinterpretq_u8_s8(this->value);
  }

  simdutf_really_inline simd8<int8_t>
  operator|(const simd8<int8_t> other) const {
    return vorrq_s8(value, other.value);
  }

  simdutf_really_inline int8_t max_val() const { return vmaxvq_s8(value); }
  simdutf_really_inline int8_t min_val() const { return vminvq_s8(value); }
  simdutf_really_inline bool is_ascii() const { return this->min_val() >= 0; }

  // Order-sensitive comparisons
  simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const {
    return vcgtq_s8(value, other.value);
  }
  simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const {
    return vcltq_s8(value, other.value);
  }

  template <typename T>
  simdutf_really_inline simd8<int8_t>
  apply_lookup_16_to(const simd8<T> original) const {
    return vqtbl1q_s8(*this, simd8<uint8_t>(original));
  }
};

template <typename T> struct simd8x64 {
  static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
  static_assert(NUM_CHUNKS == 4,
                "ARM kernel should use four registers per 64-byte block.");
  simd8<T> chunks[NUM_CHUNKS];

  simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
  simd8x64<T> &
  operator=(const simd8<T> other) = delete; // no assignment allowed
  simd8x64() = delete;                      // no default constructor allowed

  simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1,
                                 const simd8<T> chunk2, const simd8<T> chunk3)
      : chunks{chunk0, chunk1, chunk2, chunk3} {}
  simdutf_really_inline simd8x64(const T *ptr)
      : chunks{simd8<T>::load(ptr),
               simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T)),
               simd8<T>::load(ptr + 2 * sizeof(simd8<T>) / sizeof(T)),
               simd8<T>::load(ptr + 3 * sizeof(simd8<T>) / sizeof(T))} {}

  simdutf_really_inline void store(T *ptr) const {
    this->chunks[0].store(ptr + sizeof(simd8<T>) * 0 / sizeof(T));
    this->chunks[1].store(ptr + sizeof(simd8<T>) * 1 / sizeof(T));
    this->chunks[2].store(ptr + sizeof(simd8<T>) * 2 / sizeof(T));
    this->chunks[3].store(ptr + sizeof(simd8<T>) * 3 / sizeof(T));
  }

  simdutf_really_inline simd8x64<T> &operator|=(const simd8x64<T> &other) {
    this->chunks[0] |= other.chunks[0];
    this->chunks[1] |= other.chunks[1];
    this->chunks[2] |= other.chunks[2];
    this->chunks[3] |= other.chunks[3];
    return *this;
  }

  simdutf_really_inline simd8<T> reduce_or() const {
    return (this->chunks[0] | this->chunks[1]) |
           (this->chunks[2] | this->chunks[3]);
  }

  simdutf_really_inline bool is_ascii() const { return reduce_or().is_ascii(); }

  template <endianness endian>
  simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
    this->chunks[0].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 0);
    this->chunks[1].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 1);
    this->chunks[2].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 2);
    this->chunks[3].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 3);
  }

  simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const {
    this->chunks[0].store_ascii_as_utf32_tbl(ptr + sizeof(simd8<T>) * 0);
    this->chunks[1].store_ascii_as_utf32_tbl(ptr + sizeof(simd8<T>) * 1);
    this->chunks[2].store_ascii_as_utf32_tbl(ptr + sizeof(simd8<T>) * 2);
    this->chunks[3].store_ascii_as_utf32_tbl(ptr + sizeof(simd8<T>) * 3);
  }

  simdutf_really_inline uint64_t to_bitmask() const {
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
    const uint8x16_t bit_mask =
        simdutf_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
#else
    const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
#endif
    // Add each of the elements next to each other, successively, to stuff each
    // 8 byte mask into one.
    uint8x16_t sum0 =
        vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[0]), bit_mask),
                  vandq_u8(uint8x16_t(this->chunks[1]), bit_mask));
    uint8x16_t sum1 =
        vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[2]), bit_mask),
                  vandq_u8(uint8x16_t(this->chunks[3]), bit_mask));
    sum0 = vpaddq_u8(sum0, sum1);
    sum0 = vpaddq_u8(sum0, sum0);
    return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
  }

  simdutf_really_inline uint64_t lt(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] < mask, this->chunks[1] < mask,
                          this->chunks[2] < mask, this->chunks[3] < mask)
        .to_bitmask();
  }
  simdutf_really_inline uint64_t gt(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] > mask, this->chunks[1] > mask,
                          this->chunks[2] > mask, this->chunks[3] > mask)
        .to_bitmask();
  }
  simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
    const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
    return simd8x64<bool>(simd8<uint8_t>(uint8x16_t(this->chunks[0])) >= mask,
                          simd8<uint8_t>(uint8x16_t(this->chunks[1])) >= mask,
                          simd8<uint8_t>(uint8x16_t(this->chunks[2])) >= mask,
                          simd8<uint8_t>(uint8x16_t(this->chunks[3])) >= mask)
        .to_bitmask();
  }
}; // struct simd8x64<T>
/* begin file src/simdutf/arm64/simd16-inl.h */
template <typename T> struct simd16;

template <typename T, typename Mask = simd16<bool>> struct base_u16 {
  uint16x8_t value;
  /// the size of vector in bytes
  static const int SIZE = sizeof(value);
  /// the number of elements of type T a vector can hold
  static const int ELEMENTS = SIZE / sizeof(T);
  // Conversion from/to SIMD register
  simdutf_really_inline base_u16() = default;
  simdutf_really_inline base_u16(const uint16x8_t _value) : value(_value) {}
  simdutf_really_inline operator const uint16x8_t &() const {
    return this->value;
  }
  simdutf_really_inline operator uint16x8_t &() { return this->value; }
  // Bit operations
  simdutf_really_inline simd16<T> operator|(const simd16<T> other) const {
    return vorrq_u16(*this, other);
  }
  simdutf_really_inline simd16<T> operator&(const simd16<T> other) const {
    return vandq_u16(*this, other);
  }
  simdutf_really_inline simd16<T> operator^(const simd16<T> other) const {
    return veorq_u16(*this, other);
  }
  simdutf_really_inline simd16<T> bit_andnot(const simd16<T> other) const {
    return vbicq_u16(*this, other);
  }
  // Flip all 16 bits of each lane, not just the low byte.
  simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFFFu; }
  simdutf_really_inline simd16<T> &operator|=(const simd16<T> other) {
    auto this_cast = static_cast<simd16<T> *>(this);
    *this_cast = *this_cast | other;
    return *this_cast;
  }
  simdutf_really_inline simd16<T> &operator&=(const simd16<T> other) {
    auto this_cast = static_cast<simd16<T> *>(this);
    *this_cast = *this_cast & other;
    return *this_cast;
  }
  simdutf_really_inline simd16<T> &operator^=(const simd16<T> other) {
    auto this_cast = static_cast<simd16<T> *>(this);
    *this_cast = *this_cast ^ other;
    return *this_cast;
  }

  friend simdutf_really_inline Mask operator==(const simd16<T> lhs,
                                               const simd16<T> rhs) {
    return vceqq_u16(lhs, rhs);
  }

  template <int N = 1>
  simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
    return vextq_u16(prev_chunk, *this, 8 - N);
  }
};

template <typename T, typename Mask = simd16<bool>>
struct base16 : base_u16<T> {
  typedef uint16_t bitmask_t;
  typedef uint32_t bitmask2_t;

  simdutf_really_inline base16() : base_u16<T>() {}
  simdutf_really_inline base16(const uint16x8_t _value) : base_u16<T>(_value) {}
  template <typename Pointer>
  simdutf_really_inline base16(const Pointer *ptr) : base16(vld1q_u16(ptr)) {}

  static const int SIZE = sizeof(base_u16<T>::value);
  void dump() const {
    uint16_t temp[8];
    vst1q_u16(temp, *this);
    printf("[%04x, %04x, %04x, %04x, %04x, %04x, %04x, %04x]\n", temp[0],
           temp[1], temp[2], temp[3], temp[4], temp[5], temp[6], temp[7]);
  }
  template <int N = 1>
  simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
    return vextq_u16(prev_chunk, *this, 8 - N);
  }
};

// SIMD byte mask type (returned by things like eq and gt)
template <> struct simd16<bool> : base16<bool> {
  static simdutf_really_inline simd16<bool> splat(bool _value) {
    return vmovq_n_u16(uint16_t(-(!!_value)));
  }

  simdutf_really_inline simd16() : base16() {}
  simdutf_really_inline simd16(const uint16x8_t _value)
      : base16<bool>(_value) {}
  // Splat constructor
  simdutf_really_inline simd16(bool _value) : base16<bool>(splat(_value)) {}
};

template <typename T> struct base16_numeric : base16<T> {
  static simdutf_really_inline simd16<T> splat(T _value) {
    return vmovq_n_u16(_value);
  }
  static simdutf_really_inline simd16<T> zero() { return vdupq_n_u16(0); }
  static simdutf_really_inline simd16<T> load(const T values[8]) {
    return vld1q_u16(reinterpret_cast<const uint16_t *>(values));
  }

  simdutf_really_inline base16_numeric() : base16<T>() {}
  simdutf_really_inline base16_numeric(const uint16x8_t _value)
      : base16<T>(_value) {}

  // Store to array
  simdutf_really_inline void store(T dst[8]) const {
    return vst1q_u16(reinterpret_cast<uint16_t *>(dst), *this);
  }

  // Override to distinguish from bool version
  simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFFFu; }

  // Addition/subtraction are the same for signed and unsigned
  simdutf_really_inline simd16<T> operator+(const simd16<T> other) const {
    return vaddq_u16(*this, other);
  }
  simdutf_really_inline simd16<T> operator-(const simd16<T> other) const {
    return vsubq_u16(*this, other);
  }
  simdutf_really_inline simd16<T> &operator+=(const simd16<T> other) {
    *this = *this + other;
    return *static_cast<simd16<T> *>(this);
  }
  simdutf_really_inline simd16<T> &operator-=(const simd16<T> other) {
    *this = *this - other;
    return *static_cast<simd16<T> *>(this);
  }
};

// Signed code units
template <> struct simd16<int16_t> : base16_numeric<int16_t> {
  simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
#ifndef SIMDUTF_REGULAR_VISUAL_STUDIO
  simdutf_really_inline simd16(const uint16x8_t _value)
      : base16_numeric<int16_t>(_value) {}
#endif
  simdutf_really_inline simd16(const int16x8_t _value)
      : base16_numeric<int16_t>(vreinterpretq_u16_s16(_value)) {}

  // Splat constructor
  simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
  // Array constructor
  simdutf_really_inline simd16(const int16_t *values) : simd16(load(values)) {}
  simdutf_really_inline simd16(const char16_t *values)
      : simd16(load(reinterpret_cast<const int16_t *>(values))) {}
  simdutf_really_inline operator simd16<uint16_t>() const;
  simdutf_really_inline operator const uint16x8_t &() const {
    return this->value;
  }
  simdutf_really_inline operator const int16x8_t() const {
    return vreinterpretq_s16_u16(this->value);
  }

  simdutf_really_inline int16_t max_val() const {
    return vmaxvq_s16(vreinterpretq_s16_u16(this->value));
  }
  simdutf_really_inline int16_t min_val() const {
    return vminvq_s16(vreinterpretq_s16_u16(this->value));
  }
  // Order-sensitive comparisons
  simdutf_really_inline simd16<int16_t>
  max_val(const simd16<int16_t> other) const {
    return vmaxq_s16(vreinterpretq_s16_u16(this->value),
                     vreinterpretq_s16_u16(other.value));
  }
  simdutf_really_inline simd16<int16_t>
  min_val(const simd16<int16_t> other) const {
    return vminq_s16(vreinterpretq_s16_u16(this->value),
                     vreinterpretq_s16_u16(other.value));
  }
  simdutf_really_inline simd16<bool>
  operator>(const simd16<int16_t> other) const {
    return vcgtq_s16(vreinterpretq_s16_u16(this->value),
                     vreinterpretq_s16_u16(other.value));
  }
  simdutf_really_inline simd16<bool>
  operator<(const simd16<int16_t> other) const {
    return vcltq_s16(vreinterpretq_s16_u16(this->value),
                     vreinterpretq_s16_u16(other.value));
  }
};

// Unsigned code units
template <> struct simd16<uint16_t> : base16_numeric<uint16_t> {
  simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
  simdutf_really_inline simd16(const uint16x8_t _value)
      : base16_numeric<uint16_t>(_value) {}

  // Splat constructor
  simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
  // Array constructor
  simdutf_really_inline simd16(const uint16_t *values) : simd16(load(values)) {}
  simdutf_really_inline simd16(const char16_t *values)
      : simd16(load(reinterpret_cast<const uint16_t *>(values))) {}

  simdutf_really_inline uint16_t max_val() const { return vmaxvq_u16(*this); }
  simdutf_really_inline uint16_t min_val() const { return vminvq_u16(*this); }
  // Saturated math
  simdutf_really_inline simd16<uint16_t>
  saturating_add(const simd16<uint16_t> other) const {
    return vqaddq_u16(*this, other);
  }
  simdutf_really_inline simd16<uint16_t>
  saturating_sub(const simd16<uint16_t> other) const {
    return vqsubq_u16(*this, other);
  }

  // Order-specific operations
  simdutf_really_inline simd16<uint16_t>
  max_val(const simd16<uint16_t> other) const {
    return vmaxq_u16(*this, other);
  }
  simdutf_really_inline simd16<uint16_t>
  min_val(const simd16<uint16_t> other) const {
    return vminq_u16(*this, other);
  }
  // Same as >, but only guarantees true is nonzero (> guarantees true = -1)
  simdutf_really_inline simd16<uint16_t>
  gt_bits(const simd16<uint16_t> other) const {
    return this->saturating_sub(other);
  }
  // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
  simdutf_really_inline simd16<uint16_t>
  lt_bits(const simd16<uint16_t> other) const {
    return other.saturating_sub(*this);
  }
  simdutf_really_inline simd16<bool>
  operator<=(const simd16<uint16_t> other) const {
    return vcleq_u16(*this, other);
  }
  simdutf_really_inline simd16<bool>
  operator>=(const simd16<uint16_t> other) const {
    return vcgeq_u16(*this, other);
  }
  simdutf_really_inline simd16<bool>
  operator>(const simd16<uint16_t> other) const {
    return vcgtq_u16(*this, other);
  }
  simdutf_really_inline simd16<bool>
  operator<(const simd16<uint16_t> other) const {
    return vcltq_u16(*this, other);
  }

  // Bit-specific operations
  simdutf_really_inline simd16<bool> bits_not_set() const {
    return *this == uint16_t(0);
  }
  template <int N> simdutf_really_inline simd16<uint16_t> shr() const {
    return simd16<uint16_t>(vshrq_n_u16(*this, N));
  }
  template <int N> simdutf_really_inline simd16<uint16_t> shl() const {
    return simd16<uint16_t>(vshlq_n_u16(*this, N));
  }

  // Pack two vectors of uint16_t code units into a single uint8_t vector,
  // using unsigned saturation.
  static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t> &v0,
                                                   const simd16<uint16_t> &v1) {
    return vqmovn_high_u16(vqmovn_u16(v0), v1);
  }
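  // Note on pack above: vqmovn_u16 narrows each 16-bit lane with unsigned
  // saturation, clamping values above 0xFF to 0xFF instead of wrapping;
  // vqmovn_high_u16 places the second narrowed vector in the upper half of
  // the result.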

  // Change the endianness
  simdutf_really_inline simd16<uint16_t> swap_bytes() const {
    return vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(*this)));
  }
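  // Note on swap_bytes above: vrev16q_u8 reverses the two bytes inside every
  // 16-bit lane, converting between UTF-16LE and UTF-16BE code units.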

  void dump() const {
    uint16_t temp[8];
    vst1q_u16(temp, *this);
    printf("[%04x, %04x, %04x, %04x, %04x, %04x, %04x, %04x]\n", temp[0],
           temp[1], temp[2], temp[3], temp[4], temp[5], temp[6], temp[7]);
  }

  simdutf_really_inline uint32_t sum() const { return vaddlvq_u16(value); }
};

simdutf_really_inline simd16<int16_t>::operator simd16<uint16_t>() const {
  return this->value;
}

template <typename T> struct simd16x32 {
  static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
  static_assert(NUM_CHUNKS == 4,
                "ARM kernel should use four registers per 64-byte block.");
  simd16<T> chunks[NUM_CHUNKS];

  simd16x32(const simd16x32<T> &o) = delete; // no copy allowed
  simd16x32<T> &
  operator=(const simd16<T> other) = delete; // no assignment allowed
  simd16x32() = delete;                      // no default constructor allowed

  simdutf_really_inline
  simd16x32(const simd16<T> chunk0, const simd16<T> chunk1,
            const simd16<T> chunk2, const simd16<T> chunk3)
      : chunks{chunk0, chunk1, chunk2, chunk3} {}
  simdutf_really_inline simd16x32(const T *ptr)
      : chunks{simd16<T>::load(ptr),
               simd16<T>::load(ptr + sizeof(simd16<T>) / sizeof(T)),
               simd16<T>::load(ptr + 2 * sizeof(simd16<T>) / sizeof(T)),
               simd16<T>::load(ptr + 3 * sizeof(simd16<T>) / sizeof(T))} {}

  simdutf_really_inline void store(T *ptr) const {
    this->chunks[0].store(ptr + sizeof(simd16<T>) * 0 / sizeof(T));
    this->chunks[1].store(ptr + sizeof(simd16<T>) * 1 / sizeof(T));
    this->chunks[2].store(ptr + sizeof(simd16<T>) * 2 / sizeof(T));
    this->chunks[3].store(ptr + sizeof(simd16<T>) * 3 / sizeof(T));
  }

  simdutf_really_inline simd16<T> reduce_or() const {
    return (this->chunks[0] | this->chunks[1]) |
           (this->chunks[2] | this->chunks[3]);
  }

  simdutf_really_inline bool is_ascii() const { return reduce_or().is_ascii(); }

  simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
    this->chunks[0].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 0);
    this->chunks[1].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 1);
    this->chunks[2].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 2);
    this->chunks[3].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 3);
  }

  simdutf_really_inline uint64_t to_bitmask() const {
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
    const uint8x16_t bit_mask =
        simdutf_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
#else
    const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
#endif
    // Add each of the elements next to each other, successively, to stuff each
    // 8 byte mask into one.
    uint8x16_t sum0 = vpaddq_u8(
        vreinterpretq_u8_u16(this->chunks[0] & vreinterpretq_u16_u8(bit_mask)),
        vreinterpretq_u8_u16(this->chunks[1] & vreinterpretq_u16_u8(bit_mask)));
    uint8x16_t sum1 = vpaddq_u8(
        vreinterpretq_u8_u16(this->chunks[2] & vreinterpretq_u16_u8(bit_mask)),
        vreinterpretq_u8_u16(this->chunks[3] & vreinterpretq_u16_u8(bit_mask)));
    sum0 = vpaddq_u8(sum0, sum1);
    sum0 = vpaddq_u8(sum0, sum0);
    return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
  }

  simdutf_really_inline void swap_bytes() {
    this->chunks[0] = this->chunks[0].swap_bytes();
    this->chunks[1] = this->chunks[1].swap_bytes();
    this->chunks[2] = this->chunks[2].swap_bytes();
    this->chunks[3] = this->chunks[3].swap_bytes();
  }

  simdutf_really_inline uint64_t lteq(const T m) const {
    const simd16<T> mask = simd16<T>::splat(m);
    return simd16x32<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask,
                           this->chunks[2] <= mask, this->chunks[3] <= mask)
        .to_bitmask();
  }

  simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
    const simd16<T> mask_low = simd16<T>::splat(low);
    const simd16<T> mask_high = simd16<T>::splat(high);
    return simd16x32<bool>(
               (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
               (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
               (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
               (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low))
        .to_bitmask();
  }
}; // struct simd16x32<T>
template <>
simdutf_really_inline uint64_t simd16x32<uint16_t>::not_in_range(
    const uint16_t low, const uint16_t high) const {
  const simd16<uint16_t> mask_low = simd16<uint16_t>::splat(low);
  const simd16<uint16_t> mask_high = simd16<uint16_t>::splat(high);
  simd16x32<uint16_t> x(simd16<uint16_t>((this->chunks[0] > mask_high) |
                                         (this->chunks[0] < mask_low)),
                        simd16<uint16_t>((this->chunks[1] > mask_high) |
                                         (this->chunks[1] < mask_low)),
                        simd16<uint16_t>((this->chunks[2] > mask_high) |
                                         (this->chunks[2] < mask_low)),
                        simd16<uint16_t>((this->chunks[3] > mask_high) |
                                         (this->chunks[3] < mask_low)));
  return x.to_bitmask();
}

simdutf_really_inline simd16<uint16_t> min(const simd16<uint16_t> a,
                                           simd16<uint16_t> b) {
  return vminq_u16(a.value, b.value);
}
/* end file src/simdutf/arm64/simd16-inl.h */
/* begin file src/simdutf/arm64/simd32-inl.h */
template <typename T> struct simd32;

template <> struct simd32<uint32_t> {
  static const size_t SIZE = sizeof(uint32x4_t);
  static const size_t ELEMENTS = SIZE / sizeof(uint32_t);

  uint32x4_t value;

  simdutf_really_inline simd32(const uint32x4_t v) : value(v) {}

  template <typename Pointer>
  simdutf_really_inline simd32(const Pointer *ptr)
      : value(vld1q_u32(reinterpret_cast<const uint32_t *>(ptr))) {}

  // Use the widening sum (vaddlvq_u32) so that the four 32-bit lanes cannot
  // wrap before reaching the uint64_t return type.
  simdutf_really_inline uint64_t sum() const { return vaddlvq_u32(value); }

  simdutf_really_inline simd32<uint32_t> swap_bytes() const {
    return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(value)));
  }
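  // Note on swap_bytes above: vrev32q_u8 reverses the four bytes inside
  // every 32-bit lane, flipping the endianness of each UTF-32 code unit.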

  template <int N> simdutf_really_inline simd32<uint32_t> shr() const {
    return vshrq_n_u32(value, N);
  }

  template <int N> simdutf_really_inline simd32<uint32_t> shl() const {
    return vshlq_n_u32(value, N);
  }

  void dump() const {
    uint32_t temp[4];
    vst1q_u32(temp, value);
    printf("[%08x, %08x, %08x, %08x]\n", temp[0], temp[1], temp[2], temp[3]);
  }

  // operators
  simdutf_really_inline simd32 &operator+=(const simd32 other) {
    value = vaddq_u32(value, other.value);
    return *this;
  }

  // static members
  simdutf_really_inline static simd32<uint32_t> zero() {
    return vdupq_n_u32(0);
  }

  simdutf_really_inline static simd32<uint32_t> splat(uint32_t v) {
    return vdupq_n_u32(v);
  }
};

//----------------------------------------------------------------------

template <> struct simd32<bool> {
  uint32x4_t value;

  simdutf_really_inline simd32(const uint32x4_t v) : value(v) {}

  simdutf_really_inline bool any() const { return vmaxvq_u32(value) != 0; }
};

//----------------------------------------------------------------------

template <typename T>
simdutf_really_inline simd32<T> operator|(const simd32<T> a,
                                          const simd32<T> b) {
  return vorrq_u32(a.value, b.value);
}

simdutf_really_inline simd32<uint32_t> min(const simd32<uint32_t> a,
                                           const simd32<uint32_t> b) {
  return vminq_u32(a.value, b.value);
}

simdutf_really_inline simd32<uint32_t> max(const simd32<uint32_t> a,
                                           const simd32<uint32_t> b) {
  return vmaxq_u32(a.value, b.value);
}

simdutf_really_inline simd32<bool> operator==(const simd32<uint32_t> a,
                                              uint32_t b) {
  return vceqq_u32(a.value, vdupq_n_u32(b));
}

simdutf_really_inline simd32<uint32_t> operator&(const simd32<uint32_t> a,
                                                 const simd32<uint32_t> b) {
  return vandq_u32(a.value, b.value);
}

simdutf_really_inline simd32<uint32_t> operator&(const simd32<uint32_t> a,
                                                 uint32_t b) {
  return vandq_u32(a.value, vdupq_n_u32(b));
}

simdutf_really_inline simd32<uint32_t> operator|(const simd32<uint32_t> a,
                                                 uint32_t b) {
  return vorrq_u32(a.value, vdupq_n_u32(b));
}

simdutf_really_inline simd32<uint32_t> operator+(const simd32<uint32_t> a,
                                                 const simd32<uint32_t> b) {
  return vaddq_u32(a.value, b.value);
}

simdutf_really_inline simd32<uint32_t> operator-(const simd32<uint32_t> a,
                                                 uint32_t b) {
  return vsubq_u32(a.value, vdupq_n_u32(b));
}

simdutf_really_inline simd32<bool> operator>=(const simd32<uint32_t> a,
                                              const simd32<uint32_t> b) {
  return vcgeq_u32(a.value, b.value);
}

simdutf_really_inline simd32<bool> operator!(const simd32<bool> v) {
  return vmvnq_u32(v.value);
}

simdutf_really_inline simd32<bool> operator>(const simd32<uint32_t> a,
                                             const simd32<uint32_t> b) {
  return vcgtq_u32(a.value, b.value);
}

simdutf_really_inline simd32<uint32_t> select(const simd32<bool> cond,
                                              const simd32<uint32_t> v_true,
                                              const simd32<uint32_t> v_false) {
  return vbslq_u32(cond.value, v_true.value, v_false.value);
}
/* end file src/simdutf/arm64/simd32-inl.h */
/* begin file src/simdutf/arm64/simd64-inl.h */
template <typename T> struct simd64;

template <> struct simd64<uint64_t> {
  uint64x2_t value;

  simdutf_really_inline simd64(const uint64x2_t v) : value(v) {}

  template <typename Pointer>
  simdutf_really_inline simd64(const Pointer *ptr)
      : value(vld1q_u64(reinterpret_cast<const uint64_t *>(ptr))) {}

  simdutf_really_inline uint64_t sum() const { return vaddvq_u64(value); }

  // operators
  simdutf_really_inline simd64 &operator+=(const simd64 other) {
    value = vaddq_u64(value, other.value);
    return *this;
  }

  // static members
  simdutf_really_inline static simd64<uint64_t> zero() {
    return vdupq_n_u64(0);
  }

  simdutf_really_inline static simd64<uint64_t> splat(uint64_t v) {
    return vdupq_n_u64(v);
  }
};
/* end file src/simdutf/arm64/simd64-inl.h */

simdutf_really_inline simd64<uint64_t> sum_8bytes(const simd8<uint8_t> v) {
  // We do it as 3 instructions, which we hope are cheap; there might be a
  // faster way.
  uint16x8_t first_sum = vpaddlq_u8(v);
  uint32x4_t second_sum = vpaddlq_u16(first_sum);
  return vpaddlq_u32(second_sum);
}
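// Note on sum_8bytes above: each vpaddlq step widens and sums adjacent lanes
// (u8 pairs -> u16, u16 pairs -> u32, u32 pairs -> u64), so the result holds
// two 64-bit partial sums that together cover the 16 input bytes.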

} // namespace simd
} // unnamed namespace
} // namespace arm64
} // namespace simdutf

#endif // SIMDUTF_ARM64_SIMD_H
/* end file src/simdutf/arm64/simd.h */

/* begin file src/simdutf/arm64/end.h */
#undef SIMDUTF_SIMD_HAS_BYTEMASK
/* end file src/simdutf/arm64/end.h */

#endif // SIMDUTF_IMPLEMENTATION_ARM64

#endif // SIMDUTF_ARM64_H
/* end file src/simdutf/arm64.h */
/* begin file src/simdutf/icelake.h */
#ifndef SIMDUTF_ICELAKE_H
#define SIMDUTF_ICELAKE_H

#ifdef __has_include
// How do we detect that a compiler supports VBMI2? If the following header
// is found, we assume that the compiler supports it.
#if __has_include(<avx512vbmi2intrin.h>)
#define SIMDUTF_COMPILER_SUPPORTS_VBMI2 1
#endif
#endif

#ifdef _MSC_VER
#if _MSC_VER >= 1930
// Visual Studio 2022 and up support VBMI2 under x64 even if the header
// avx512vbmi2intrin.h is not found.
// Visual Studio 2019 technically supports VBMI2, but the implementation
// might be unreliable. Search for visualstudio2019icelakeissue in our
// tests.
#define SIMDUTF_COMPILER_SUPPORTS_VBMI2 1
#endif
#endif

// We allow icelake on x64 as long as the compiler is known to support VBMI2.
#ifndef SIMDUTF_IMPLEMENTATION_ICELAKE
#define SIMDUTF_IMPLEMENTATION_ICELAKE                                        \
  ((SIMDUTF_IS_X86_64) && (SIMDUTF_COMPILER_SUPPORTS_VBMI2))
#endif

// To see why (__BMI__) && (__LZCNT__) are not part of this next line, see
// https://github.com/simdutf/simdutf/issues/1247
#if ((SIMDUTF_IMPLEMENTATION_ICELAKE) && (SIMDUTF_IS_X86_64) && (__AVX2__) && \
     (SIMDUTF_HAS_AVX512F && SIMDUTF_HAS_AVX512DQ && SIMDUTF_HAS_AVX512VL &&  \
      SIMDUTF_HAS_AVX512VBMI2) &&                                             \
     (!SIMDUTF_IS_32BITS))
#define SIMDUTF_CAN_ALWAYS_RUN_ICELAKE 1
#else
#define SIMDUTF_CAN_ALWAYS_RUN_ICELAKE 0
#endif

#if SIMDUTF_IMPLEMENTATION_ICELAKE
#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
#define SIMDUTF_TARGET_ICELAKE
#else
#define SIMDUTF_TARGET_ICELAKE                                                \
  SIMDUTF_TARGET_REGION(                                                      \
      "avx512f,avx512dq,avx512cd,avx512bw,avx512vbmi,avx512vbmi2,"            \
      "avx512vl,avx2,bmi,bmi2,pclmul,lzcnt,popcnt,avx512vpopcntdq")
#endif

namespace simdutf {
namespace icelake {} // namespace icelake
} // namespace simdutf

//
// These two need to be included outside SIMDUTF_TARGET_REGION
//
/* begin file src/simdutf/icelake/intrinsics.h */
#ifndef SIMDUTF_ICELAKE_INTRINSICS_H
#define SIMDUTF_ICELAKE_INTRINSICS_H


#ifdef SIMDUTF_VISUAL_STUDIO
// under clang within visual studio, this will include <x86intrin.h>
#include <intrin.h> // visual studio or clang
#include <immintrin.h>
#else

#if SIMDUTF_GCC11ORMORE
// We should not get warnings while including <x86intrin.h>, yet we do under
// some versions of GCC. If the x86intrin.h header has uninitialized values
// that are problematic, it is a GCC issue; we want to ignore these warnings.
SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
#endif

#include <x86intrin.h> // elsewhere

#if SIMDUTF_GCC11ORMORE
// cancels the suppression of the -Wuninitialized
SIMDUTF_POP_DISABLE_WARNINGS
#endif

#ifndef _tzcnt_u64
#define _tzcnt_u64(x) __tzcnt_u64(x)
#endif // _tzcnt_u64
#endif // SIMDUTF_VISUAL_STUDIO

#ifdef SIMDUTF_CLANG_VISUAL_STUDIO
/**
 * Normally, you are not supposed to include these headers directly; you
 * should include intrin.h or x86intrin.h instead. However, when compiling
 * with clang under Windows (i.e., when _MSC_VER is set), these headers only
 * get included *if* the corresponding features are detected from macros,
 * e.g., if __AVX2__ is set. In turn, we normally set these macros by
 * compiling against the corresponding architecture (e.g., arch:AVX2, -mavx2,
 * etc.), which compiles the whole software with these advanced instructions.
 * In simdutf, we want to compile the whole program for a generic target and
 * enable the advanced instructions only in our specific kernels. As a
 * workaround, we directly include the needed headers. These headers would
 * normally guard against such usage, but we carefully included <x86intrin.h>
 * (or <intrin.h>) before, so the headers are fooled.
 */
#include <bmiintrin.h>   // for _blsr_u64
#include <bmi2intrin.h>  // for _pext_u64, _pdep_u64
#include <lzcntintrin.h> // for __lzcnt64
#include <immintrin.h>   // for most things (AVX2, AVX512, _popcnt64)
#include <smmintrin.h>
#include <tmmintrin.h>
#include <avxintrin.h>
#include <avx2intrin.h>
// Important: we need the AVX-512 headers:
#include <avx512fintrin.h>
#include <avx512dqintrin.h>
#include <avx512cdintrin.h>
#include <avx512bwintrin.h>
#include <avx512vlintrin.h>
#include <avx512vlbwintrin.h>
#include <avx512vbmiintrin.h>
#include <avx512vbmi2intrin.h>
#include <avx512vpopcntdqintrin.h>
#include <avx512vpopcntdqvlintrin.h>
// unfortunately, we may not get _blsr_u64, but, thankfully, clang
// has it as a macro.
#ifndef _blsr_u64
// we roll our own
#define _blsr_u64(n) ((n - 1) & n)
#endif // _blsr_u64
#endif // SIMDUTF_CLANG_VISUAL_STUDIO

#if defined(__GNUC__) && !defined(__clang__)

#if __GNUC__ == 8
#define SIMDUTF_GCC8 1
#elif __GNUC__ == 9
#define SIMDUTF_GCC9 1
#endif // __GNUC__ == 8 || __GNUC__ == 9

#endif // defined(__GNUC__) && !defined(__clang__)

#if SIMDUTF_GCC8
#pragma GCC push_options
#pragma GCC target("avx512f")
/**
 * GCC 8 fails to provide _mm512_set_epi8. We roll our own.
 */
inline __m512i
_mm512_set_epi8(uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4,
                uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8, uint8_t a9,
                uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14,
                uint8_t a15, uint8_t a16, uint8_t a17, uint8_t a18, uint8_t a19,
                uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24,
                uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29,
                uint8_t a30, uint8_t a31, uint8_t a32, uint8_t a33, uint8_t a34,
                uint8_t a35, uint8_t a36, uint8_t a37, uint8_t a38, uint8_t a39,
                uint8_t a40, uint8_t a41, uint8_t a42, uint8_t a43, uint8_t a44,
                uint8_t a45, uint8_t a46, uint8_t a47, uint8_t a48, uint8_t a49,
                uint8_t a50, uint8_t a51, uint8_t a52, uint8_t a53, uint8_t a54,
                uint8_t a55, uint8_t a56, uint8_t a57, uint8_t a58, uint8_t a59,
                uint8_t a60, uint8_t a61, uint8_t a62, uint8_t a63) {
  return _mm512_set_epi64(
      uint64_t(a7) + (uint64_t(a6) << 8) + (uint64_t(a5) << 16) +
          (uint64_t(a4) << 24) + (uint64_t(a3) << 32) + (uint64_t(a2) << 40) +
          (uint64_t(a1) << 48) + (uint64_t(a0) << 56),
      uint64_t(a15) + (uint64_t(a14) << 8) + (uint64_t(a13) << 16) +
          (uint64_t(a12) << 24) + (uint64_t(a11) << 32) +
          (uint64_t(a10) << 40) + (uint64_t(a9) << 48) + (uint64_t(a8) << 56),
      uint64_t(a23) + (uint64_t(a22) << 8) + (uint64_t(a21) << 16) +
          (uint64_t(a20) << 24) + (uint64_t(a19) << 32) +
          (uint64_t(a18) << 40) + (uint64_t(a17) << 48) + (uint64_t(a16) << 56),
      uint64_t(a31) + (uint64_t(a30) << 8) + (uint64_t(a29) << 16) +
          (uint64_t(a28) << 24) + (uint64_t(a27) << 32) +
          (uint64_t(a26) << 40) + (uint64_t(a25) << 48) + (uint64_t(a24) << 56),
      uint64_t(a39) + (uint64_t(a38) << 8) + (uint64_t(a37) << 16) +
          (uint64_t(a36) << 24) + (uint64_t(a35) << 32) +
          (uint64_t(a34) << 40) + (uint64_t(a33) << 48) + (uint64_t(a32) << 56),
      uint64_t(a47) + (uint64_t(a46) << 8) + (uint64_t(a45) << 16) +
          (uint64_t(a44) << 24) + (uint64_t(a43) << 32) +
          (uint64_t(a42) << 40) + (uint64_t(a41) << 48) + (uint64_t(a40) << 56),
      uint64_t(a55) + (uint64_t(a54) << 8) + (uint64_t(a53) << 16) +
          (uint64_t(a52) << 24) + (uint64_t(a51) << 32) +
          (uint64_t(a50) << 40) + (uint64_t(a49) << 48) + (uint64_t(a48) << 56),
      uint64_t(a63) + (uint64_t(a62) << 8) + (uint64_t(a61) << 16) +
          (uint64_t(a60) << 24) + (uint64_t(a59) << 32) +
          (uint64_t(a58) << 40) + (uint64_t(a57) << 48) +
          (uint64_t(a56) << 56));
}
#pragma GCC pop_options
#endif // SIMDUTF_GCC8

#endif // SIMDUTF_ICELAKE_INTRINSICS_H
/* end file src/simdutf/icelake/intrinsics.h */
/* begin file src/simdutf/icelake/implementation.h */
#ifndef SIMDUTF_ICELAKE_IMPLEMENTATION_H
#define SIMDUTF_ICELAKE_IMPLEMENTATION_H


namespace simdutf {
namespace icelake {

namespace {
using namespace simdutf;
}

class implementation final : public simdutf::implementation {
public:
  simdutf_really_inline implementation()
      : simdutf::implementation(
            "icelake",
            "Intel AVX512 (AVX-512BW, AVX-512CD, AVX-512VL, AVX-512VBMI2 "
            "extensions)",
            internal::instruction_set::AVX2 | internal::instruction_set::BMI1 |
                internal::instruction_set::BMI2 |
                internal::instruction_set::AVX512BW |
                internal::instruction_set::AVX512CD |
                internal::instruction_set::AVX512VL |
                internal::instruction_set::AVX512VBMI2 |
                internal::instruction_set::AVX512VPOPCNTDQ) {}

#if SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused int detect_encodings(const char *input,
                                           size_t length) const noexcept final;
#endif // SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused bool validate_utf8(const char *buf,
                                         size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF8
  simdutf_warn_unused result
  validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_ASCII
  simdutf_warn_unused bool validate_ascii(const char *buf,
                                          size_t len) const noexcept final;
  simdutf_warn_unused result
  validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_ASCII

#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
                                            size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
                                            size_t len) const noexcept final;
  simdutf_warn_unused result validate_utf16le_with_errors(
      const char16_t *buf, size_t len) const noexcept final;
  simdutf_warn_unused result validate_utf16be_with_errors(
      const char16_t *buf, size_t len) const noexcept final;
  void to_well_formed_utf16be(const char16_t *input, size_t len,
                              char16_t *output) const noexcept final;
  void to_well_formed_utf16le(const char16_t *input, size_t len,
                              char16_t *output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused bool validate_utf32(const char32_t *buf,
                                          size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused result validate_utf32_with_errors(
      const char32_t *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_latin1_to_utf8(
      const char *buf, size_t len, char *utf8_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_latin1_to_utf16le(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_latin1_to_utf16be(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_latin1_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
      const char *buf, size_t len, char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t convert_utf8_to_utf16le(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused size_t convert_utf8_to_utf16be(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t convert_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  convert_utf16le_to_latin1(const char16_t *buf, size_t len,
                            char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf16be_to_latin1(const char16_t *buf, size_t len,
                            char *latin1_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
      const char16_t *buf, size_t len,
      char *latin1_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
      const char16_t *buf, size_t len,
      char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len,
                                  char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len,
                                  char *latin1_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t convert_utf16le_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_utf16be_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t convert_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  convert_utf32_to_latin1(const char32_t *buf, size_t len,
                          char *latin1_output) const noexcept final;
  simdutf_warn_unused result
  convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
                                      char *latin1_output) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_latin1(const char32_t *buf, size_t len,
                                char *latin1_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  convert_utf32_to_utf16le(const char32_t *buf, size_t len,
                           char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf32_to_utf16be(const char32_t *buf, size_t len,
                           char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
      const char32_t *buf, size_t len,
      char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
      const char32_t *buf, size_t len,
      char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len,
                                 char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len,
                                 char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf16le_to_utf32(const char16_t *buf, size_t len,
                           char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf16be_to_utf32(const char16_t *buf, size_t len,
                           char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
      const char16_t *buf, size_t len,
      char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
      const char16_t *buf, size_t len,
      char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len,
                                 char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len,
                                 char32_t *utf32_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16
  void change_endianness_utf16(const char16_t *buf, size_t length,
                               char16_t *output) const noexcept final;
  simdutf_warn_unused size_t count_utf16le(const char16_t *buf,
                                           size_t length) const noexcept;
  simdutf_warn_unused size_t count_utf16be(const char16_t *buf,
                                           size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8
  simdutf_warn_unused size_t count_utf8(const char *buf,
                                        size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t
  utf8_length_from_utf16le(const char16_t *input, size_t length) const noexcept;
  simdutf_warn_unused size_t
  utf8_length_from_utf16be(const char16_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t utf32_length_from_utf16le(
|
|
const char16_t *input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf32_length_from_utf16be(
|
|
const char16_t *input, size_t length) const noexcept;
|
|
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
|
|
simdutf_warn_unused size_t
|
|
utf16_length_from_utf8(const char *input, size_t length) const noexcept;
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
|
|
simdutf_warn_unused size_t
|
|
utf8_length_from_utf32(const char32_t *input, size_t length) const noexcept;
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
|
|
|
|
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
|
|
simdutf_warn_unused size_t
|
|
utf16_length_from_utf32(const char32_t *input, size_t length) const noexcept;
|
|
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
|
|
simdutf_warn_unused size_t
|
|
utf32_length_from_utf8(const char *input, size_t length) const noexcept;
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
|
|
simdutf_warn_unused size_t
|
|
latin1_length_from_utf8(const char *input, size_t length) const noexcept;
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
|
|
simdutf_warn_unused size_t
|
|
utf8_length_from_latin1(const char *input, size_t length) const noexcept;
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
|
|
|
|
#if SIMDUTF_FEATURE_BASE64
|
|
simdutf_warn_unused result base64_to_binary(
|
|
const char *input, size_t length, char *output, base64_options options,
|
|
last_chunk_handling_options last_chunk_options =
|
|
last_chunk_handling_options::loose) const noexcept;
|
|
simdutf_warn_unused full_result base64_to_binary_details(
|
|
const char *input, size_t length, char *output, base64_options options,
|
|
last_chunk_handling_options last_chunk_options =
|
|
last_chunk_handling_options::loose) const noexcept;
|
|
simdutf_warn_unused result
|
|
base64_to_binary(const char16_t *input, size_t length, char *output,
|
|
base64_options options,
|
|
last_chunk_handling_options last_chunk_options =
|
|
last_chunk_handling_options::loose) const noexcept;
|
|
simdutf_warn_unused full_result base64_to_binary_details(
|
|
const char16_t *input, size_t length, char *output,
|
|
base64_options options,
|
|
last_chunk_handling_options last_chunk_options =
|
|
last_chunk_handling_options::loose) const noexcept;
|
|
size_t binary_to_base64(const char *input, size_t length, char *output,
|
|
base64_options options) const noexcept;
|
|
#endif // SIMDUTF_FEATURE_BASE64
|
|
};
|
|
|
|
} // namespace icelake
|
|
} // namespace simdutf
|
|
|
|
#endif // SIMDUTF_ICELAKE_IMPLEMENTATION_H
|
|
/* end file src/simdutf/icelake/implementation.h */
|
|
|
|
//
|
|
// The rest need to be inside the region
|
|
//
|
|
/* begin file src/simdutf/icelake/begin.h */
|
|
// redefining SIMDUTF_IMPLEMENTATION to "icelake"
|
|
// #define SIMDUTF_IMPLEMENTATION icelake
|
|
|
|
#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
|
|
// nothing needed.
|
|
#else
|
|
SIMDUTF_TARGET_ICELAKE
|
|
#endif
|
|
|
|
#if SIMDUTF_GCC11ORMORE // workaround for
|
|
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
|
|
// clang-format off
|
|
SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
|
|
// clang-format on
|
|
#endif // end of workaround
|
|
/* end file src/simdutf/icelake/begin.h */
|
|
// Declarations
/* begin file src/simdutf/icelake/bitmanipulation.h */
#ifndef SIMDUTF_ICELAKE_BITMANIPULATION_H
#define SIMDUTF_ICELAKE_BITMANIPULATION_H

namespace simdutf {
namespace icelake {
namespace {

#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
  // note: we do not support legacy 32-bit Windows
  return __popcnt64(input_num); // Visual Studio wants two underscores
}
#else
simdutf_really_inline long long int count_ones(uint64_t input_num) {
  return _popcnt64(input_num);
}
#endif

#if SIMDUTF_NEED_TRAILING_ZEROES
// simdutf_really_inline int trailing_zeroes(uint64_t input_num) {
// #if SIMDUTF_REGULAR_VISUAL_STUDIO
//   return (int)_tzcnt_u64(input_num);
// #else // SIMDUTF_REGULAR_VISUAL_STUDIO
//   return __builtin_ctzll(input_num);
// #endif // SIMDUTF_REGULAR_VISUAL_STUDIO
// }
#endif

} // unnamed namespace
} // namespace icelake
} // namespace simdutf

#endif // SIMDUTF_ICELAKE_BITMANIPULATION_H
/* end file src/simdutf/icelake/bitmanipulation.h */
/* begin file src/simdutf/icelake/simd.h */
#ifndef SIMDUTF_ICELAKE_SIMD_H
#define SIMDUTF_ICELAKE_SIMD_H

namespace simdutf {
namespace icelake {
namespace {
namespace simd {

/* begin file src/simdutf/icelake/simd16-inl.h */
template <typename T> struct simd16;

template <> struct simd16<uint16_t> {
  static const size_t SIZE = sizeof(__m512i);
  static const size_t ELEMENTS = SIZE / sizeof(uint16_t);

  template <typename Pointer>
  static simdutf_really_inline simd16<uint16_t> load(const Pointer *ptr) {
    return simd16<uint16_t>(ptr);
  }

  __m512i value;

  simdutf_really_inline simd16(const __m512i v) : value(v) {}

  template <typename Pointer>
  simdutf_really_inline simd16(const Pointer *ptr)
      : value(_mm512_loadu_si512(reinterpret_cast<const __m512i *>(ptr))) {}

  // operators
  simdutf_really_inline simd16 &operator+=(const simd16 other) {
    value = _mm512_add_epi32(value, other.value);
    return *this;
  }

  simdutf_really_inline simd16 &operator-=(const simd16 other) {
    value = _mm512_sub_epi32(value, other.value);
    return *this;
  }

  // methods
  simdutf_really_inline simd16 swap_bytes() const {
    const __m512i byteflip = _mm512_setr_epi64(
        0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001,
        0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809,
        0x0607040502030001, 0x0e0f0c0d0a0b0809);

    return _mm512_shuffle_epi8(value, byteflip);
  }

  simdutf_really_inline uint64_t sum() const {
    const auto lo = _mm512_and_si512(value, _mm512_set1_epi32(0xffff));
    const auto hi = _mm512_srli_epi32(value, 16);
    const auto sum32 = _mm512_add_epi32(lo, hi);

    return _mm512_reduce_add_epi32(sum32);
  }
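  // Note on sum() above: this is a horizontal sum of the 32 uint16_t code
  // units that cannot overflow. The 0xffff mask keeps the even-indexed code
  // units, the 16-bit shift extracts the odd-indexed ones, and the 32-bit
  // addition leaves pairwise sums that _mm512_reduce_add_epi32 then folds
  // into a single scalar.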

  // static members
  simdutf_really_inline static simd16<uint16_t> zero() {
    return _mm512_setzero_si512();
  }

  simdutf_really_inline static simd16<uint16_t> splat(uint16_t v) {
    return _mm512_set1_epi16(v);
  }
};

template <> struct simd16<bool> {
  __mmask32 value;

  simdutf_really_inline simd16(const __mmask32 v) : value(v) {}
};

// ------------------------------------------------------------

simdutf_really_inline simd16<uint16_t> min(const simd16<uint16_t> b,
                                           const simd16<uint16_t> a) {
  return _mm512_min_epu16(a.value, b.value);
}

simdutf_really_inline simd16<uint16_t> operator&(const simd16<uint16_t> a,
                                                 uint16_t b) {
  return _mm512_and_si512(a.value, _mm512_set1_epi16(b));
}

simdutf_really_inline simd16<uint16_t> operator^(const simd16<uint16_t> a,
                                                 uint16_t b) {
  return _mm512_xor_si512(a.value, _mm512_set1_epi16(b));
}

simdutf_really_inline simd16<uint16_t> operator^(const simd16<uint16_t> a,
                                                 const simd16<uint16_t> b) {
  return _mm512_xor_si512(a.value, b.value);
}

simdutf_really_inline simd16<bool> operator==(const simd16<uint16_t> a,
                                              uint16_t b) {
  return _mm512_cmpeq_epi16_mask(a.value, _mm512_set1_epi16(b));
}
/* end file src/simdutf/icelake/simd16-inl.h */
/* begin file src/simdutf/icelake/simd32-inl.h */
template <typename T> struct simd32;

template <> struct simd32<uint32_t> {
  static const size_t SIZE = sizeof(__m512i);
  static const size_t ELEMENTS = SIZE / sizeof(uint32_t);

  __m512i value;

  simdutf_really_inline simd32(const __m512i v) : value(v) {}

  template <typename Pointer>
  simdutf_really_inline simd32(const Pointer *ptr)
      : value(_mm512_loadu_si512(reinterpret_cast<const __m512i *>(ptr))) {}

  uint64_t sum() const {
    const __m512i mask = _mm512_set1_epi64(0xffffffff);
    const __m512i t0 = _mm512_and_si512(value, mask);
    const __m512i t1 = _mm512_srli_epi64(value, 32);
    const __m512i t2 = _mm512_add_epi64(t0, t1);
    return _mm512_reduce_add_epi64(t2);
  }

  // operators
  simdutf_really_inline simd32 &operator+=(const simd32 other) {
    value = _mm512_add_epi32(value, other.value);
    return *this;
  }

  // static members
  simdutf_really_inline static simd32<uint32_t> zero() {
    return _mm512_setzero_si512();
  }

  simdutf_really_inline static simd32<uint32_t> splat(uint32_t v) {
    return _mm512_set1_epi32(v);
  }
};

simdutf_really_inline simd32<uint32_t> min(const simd32<uint32_t> b,
                                           const simd32<uint32_t> a) {
  return _mm512_min_epu32(a.value, b.value);
}

simdutf_really_inline simd32<uint32_t> operator&(const simd32<uint32_t> b,
                                                 const simd32<uint32_t> a) {
  return _mm512_and_si512(a.value, b.value);
}
/* end file src/simdutf/icelake/simd32-inl.h */

} // namespace simd
} // unnamed namespace
} // namespace icelake
} // namespace simdutf

#endif // SIMDUTF_ICELAKE_SIMD_H
/* end file src/simdutf/icelake/simd.h */

/* begin file src/simdutf/icelake/end.h */
#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
// nothing needed.
#else
SIMDUTF_UNTARGET_REGION
#endif

#if SIMDUTF_GCC11ORMORE // workaround for
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
SIMDUTF_POP_DISABLE_WARNINGS
#endif // end of workaround
/* end file src/simdutf/icelake/end.h */

#endif // SIMDUTF_IMPLEMENTATION_ICELAKE
#endif // SIMDUTF_ICELAKE_H
/* end file src/simdutf/icelake.h */
/* begin file src/simdutf/haswell.h */
#ifndef SIMDUTF_HASWELL_H
#define SIMDUTF_HASWELL_H

#ifdef SIMDUTF_WESTMERE_H
#error "haswell.h must be included before westmere.h"
#endif
#ifdef SIMDUTF_FALLBACK_H
#error "haswell.h must be included before fallback.h"
#endif

// Default Haswell to on if this is x86-64. Even if we are not compiled for it,
// it could be selected at runtime.
#ifndef SIMDUTF_IMPLEMENTATION_HASWELL
//
// You do not want to restrict it like so: SIMDUTF_IS_X86_64 && __AVX2__
// because we want to rely on *runtime dispatch*.
//
#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
#define SIMDUTF_IMPLEMENTATION_HASWELL 0
#else
#define SIMDUTF_IMPLEMENTATION_HASWELL (SIMDUTF_IS_X86_64)
#endif

#endif
// To see why (__BMI__) && (__LZCNT__) are not part of this next line, see
// https://github.com/simdutf/simdutf/issues/1247
#if ((SIMDUTF_IMPLEMENTATION_HASWELL) && (SIMDUTF_IS_X86_64) && (__AVX2__))
#define SIMDUTF_CAN_ALWAYS_RUN_HASWELL 1
#else
#define SIMDUTF_CAN_ALWAYS_RUN_HASWELL 0
#endif
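// Note: SIMDUTF_CAN_ALWAYS_RUN_HASWELL means the whole program is already
// being compiled for AVX2 (e.g., with -mavx2 or /arch:AVX2), so the Haswell
// kernel is unconditionally safe to call and runtime CPU detection can be
// bypassed for it. Otherwise the kernel is still compiled (via
// SIMDUTF_TARGET_HASWELL below) but should only be selected at runtime on
// CPUs that support it.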

#if SIMDUTF_IMPLEMENTATION_HASWELL

#define SIMDUTF_TARGET_HASWELL SIMDUTF_TARGET_REGION("avx2,bmi,lzcnt,popcnt")

namespace simdutf {
/**
 * Implementation for Haswell (Intel AVX2).
 */
namespace haswell {} // namespace haswell
} // namespace simdutf

//
// These two need to be included outside SIMDUTF_TARGET_REGION
//
/* begin file src/simdutf/haswell/implementation.h */
#ifndef SIMDUTF_HASWELL_IMPLEMENTATION_H
#define SIMDUTF_HASWELL_IMPLEMENTATION_H

// The constructor may be executed on any host, so we take care not to use
// SIMDUTF_TARGET_REGION
namespace simdutf {
namespace haswell {

using namespace simdutf;

class implementation final : public simdutf::implementation {
public:
  simdutf_really_inline implementation()
      : simdutf::implementation("haswell", "Intel/AMD AVX2",
                                internal::instruction_set::AVX2 |
                                    internal::instruction_set::BMI1 |
                                    internal::instruction_set::BMI2) {}

#if SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused int detect_encodings(const char *input,
                                           size_t length) const noexcept final;
#endif // SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused bool validate_utf8(const char *buf,
                                         size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF8
  simdutf_warn_unused result
  validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_ASCII
  simdutf_warn_unused bool validate_ascii(const char *buf,
                                          size_t len) const noexcept final;
  simdutf_warn_unused result
  validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_ASCII

#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
                                            size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
                                            size_t len) const noexcept final;
  simdutf_warn_unused result validate_utf16le_with_errors(
      const char16_t *buf, size_t len) const noexcept final;
  simdutf_warn_unused result validate_utf16be_with_errors(
      const char16_t *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused bool validate_utf32(const char32_t *buf,
                                          size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused result validate_utf32_with_errors(
      const char32_t *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_latin1_to_utf8(
      const char *buf, size_t len, char *utf8_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_latin1_to_utf16le(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_latin1_to_utf16be(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_latin1_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
      const char *buf, size_t len, char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t convert_utf8_to_utf16le(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused size_t convert_utf8_to_utf16be(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t convert_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  convert_utf16le_to_latin1(const char16_t *buf, size_t len,
                            char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf16be_to_latin1(const char16_t *buf, size_t len,
                            char *latin1_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
      const char16_t *buf, size_t len,
      char *latin1_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
      const char16_t *buf, size_t len,
      char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len,
                                  char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len,
                                  char *latin1_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t convert_utf16le_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_utf16be_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t convert_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  convert_utf32_to_latin1(const char32_t *buf, size_t len,
                          char *latin1_output) const noexcept final;
  simdutf_warn_unused result
  convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
                                      char *latin1_output) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_latin1(const char32_t *buf, size_t len,
                                char *latin1_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  convert_utf32_to_utf16le(const char32_t *buf, size_t len,
                           char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf32_to_utf16be(const char32_t *buf, size_t len,
                           char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
      const char32_t *buf, size_t len,
      char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
      const char32_t *buf, size_t len,
      char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len,
                                 char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len,
                                 char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf16le_to_utf32(const char16_t *buf, size_t len,
                           char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf16be_to_utf32(const char16_t *buf, size_t len,
                           char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
      const char16_t *buf, size_t len,
      char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
      const char16_t *buf, size_t len,
      char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len,
                                 char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len,
                                 char32_t *utf32_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16
  void change_endianness_utf16(const char16_t *buf, size_t length,
                               char16_t *output) const noexcept final;
  simdutf_warn_unused size_t count_utf16le(const char16_t *buf,
                                           size_t length) const noexcept;
  simdutf_warn_unused size_t count_utf16be(const char16_t *buf,
                                           size_t length) const noexcept;
  void to_well_formed_utf16be(const char16_t *input, size_t len,
                              char16_t *output) const noexcept final;
  void to_well_formed_utf16le(const char16_t *input, size_t len,
                              char16_t *output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8
  simdutf_warn_unused size_t count_utf8(const char *buf,
                                        size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t
  utf8_length_from_utf16le(const char16_t *input, size_t length) const noexcept;
  simdutf_warn_unused size_t
  utf8_length_from_utf16be(const char16_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t utf32_length_from_utf16le(
      const char16_t *input, size_t length) const noexcept;
  simdutf_warn_unused size_t utf32_length_from_utf16be(
      const char16_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t
  utf16_length_from_utf8(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  utf8_length_from_utf32(const char32_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  utf16_length_from_utf32(const char32_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  utf32_length_from_utf8(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  latin1_length_from_utf8(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  utf8_length_from_latin1(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_BASE64
  simdutf_warn_unused result base64_to_binary(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept;
  simdutf_warn_unused result
  base64_to_binary(const char16_t *input, size_t length, char *output,
                   base64_options options,
                   last_chunk_handling_options last_chunk_options =
                       last_chunk_handling_options::loose) const noexcept;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept;
  size_t binary_to_base64(const char *input, size_t length, char *output,
                          base64_options options) const noexcept;
#endif // SIMDUTF_FEATURE_BASE64
};

} // namespace haswell
} // namespace simdutf

#endif // SIMDUTF_HASWELL_IMPLEMENTATION_H
/* end file src/simdutf/haswell/implementation.h */
/* begin file src/simdutf/haswell/intrinsics.h */
#ifndef SIMDUTF_HASWELL_INTRINSICS_H
#define SIMDUTF_HASWELL_INTRINSICS_H

#ifdef SIMDUTF_VISUAL_STUDIO
// under clang within visual studio, this will include <x86intrin.h>
#include <intrin.h> // visual studio or clang
#else

#if SIMDUTF_GCC11ORMORE
// We should not get warnings while including <x86intrin.h> yet we do
// under some versions of GCC.
// If the x86intrin.h header has uninitialized values that are problematic,
// it is a GCC issue, we want to ignore these warnings.
SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
#endif

#include <x86intrin.h> // elsewhere

#if SIMDUTF_GCC11ORMORE
// cancels the suppression of the -Wuninitialized
SIMDUTF_POP_DISABLE_WARNINGS
#endif

#endif // SIMDUTF_VISUAL_STUDIO

#ifdef SIMDUTF_CLANG_VISUAL_STUDIO
/**
 * You are not supposed, normally, to include these
 * headers directly. Instead you should either include intrin.h
 * or x86intrin.h. However, when compiling with clang
 * under Windows (i.e., when _MSC_VER is set), these headers
 * only get included *if* the corresponding features are detected
 * from macros:
 * e.g., if __AVX2__ is set... in turn, we normally set these
 * macros by compiling against the corresponding architecture
 * (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole
 * software with these advanced instructions. In simdutf, we
 * want to compile the whole program for a generic target,
 * and only target our specific kernels. As a workaround,
 * we directly include the needed headers. These headers would
 * normally guard against such usage, but we carefully included
 * <x86intrin.h> (or <intrin.h>) before, so the headers
 * are fooled.
 */
#include <bmiintrin.h>   // for _blsr_u64
#include <lzcntintrin.h> // for __lzcnt64
#include <immintrin.h>   // for most things (AVX2, AVX512, _popcnt64)
#include <smmintrin.h>
#include <tmmintrin.h>
#include <avxintrin.h>
#include <avx2intrin.h>
// unfortunately, we may not get _blsr_u64, but, thankfully, clang
// has it as a macro.
#ifndef _blsr_u64
// we roll our own
#define _blsr_u64(n) (((n) - 1) & (n))
#endif // _blsr_u64
// Same issue with _blsmsk_u32:
#ifndef _blsmsk_u32
// we roll our own
#define _blsmsk_u32(n) (((n) - 1) ^ (n))
#endif // _blsmsk_u32
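// Note: these fallbacks match the BMI1 instructions bit for bit.
// _blsr_u64 clears the lowest set bit: with n = 0b01011000,
// (n - 1) & n == 0b01010111 & 0b01011000 == 0b01010000.
// _blsmsk_u32 builds a mask up to and including the lowest set bit:
// (n - 1) ^ n == 0b01010111 ^ 0b01011000 == 0b00001111.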
#endif // SIMDUTF_CLANG_VISUAL_STUDIO

#endif // SIMDUTF_HASWELL_INTRINSICS_H
/* end file src/simdutf/haswell/intrinsics.h */

//
// The rest need to be inside the region
//
/* begin file src/simdutf/haswell/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "haswell"
// #define SIMDUTF_IMPLEMENTATION haswell
#define SIMDUTF_SIMD_HAS_BYTEMASK 1

#if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
// nothing needed.
#else
SIMDUTF_TARGET_HASWELL
#endif

#if SIMDUTF_GCC11ORMORE // workaround for
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
// clang-format off
SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
// clang-format on
#endif // end of workaround
/* end file src/simdutf/haswell/begin.h */
// Declarations
/* begin file src/simdutf/haswell/bitmanipulation.h */
#ifndef SIMDUTF_HASWELL_BITMANIPULATION_H
#define SIMDUTF_HASWELL_BITMANIPULATION_H

namespace simdutf {
namespace haswell {
namespace {

#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
  // note: we do not support legacy 32-bit Windows
  return __popcnt64(input_num); // Visual Studio wants two underscores
}
#else
simdutf_really_inline long long int count_ones(uint64_t input_num) {
  return _popcnt64(input_num);
}
#endif

#if SIMDUTF_NEED_TRAILING_ZEROES
simdutf_really_inline int trailing_zeroes(uint64_t input_num) {
#if SIMDUTF_REGULAR_VISUAL_STUDIO
  return (int)_tzcnt_u64(input_num);
#else // SIMDUTF_REGULAR_VISUAL_STUDIO
  return __builtin_ctzll(input_num);
#endif // SIMDUTF_REGULAR_VISUAL_STUDIO
}
#endif

template <typename T> bool is_power_of_two(T x) { return (x & (x - 1)) == 0; }
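// Note: the classic bit trick above works because x - 1 flips the lowest set
// bit and every bit below it; a power of two has a single set bit, so the AND
// is zero. Beware that x == 0 also yields true, so callers must not rely on
// this function to reject zero.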

} // unnamed namespace
} // namespace haswell
} // namespace simdutf

#endif // SIMDUTF_HASWELL_BITMANIPULATION_H
/* end file src/simdutf/haswell/bitmanipulation.h */
/* begin file src/simdutf/haswell/simd.h */
#ifndef SIMDUTF_HASWELL_SIMD_H
#define SIMDUTF_HASWELL_SIMD_H

namespace simdutf {
namespace haswell {
namespace {
namespace simd {

// Forward-declared so they can be used by splat and friends.
template <typename Child> struct base {
  __m256i value;

  // Zero constructor
  simdutf_really_inline base() : value{__m256i()} {}

  // Conversion from SIMD register
  simdutf_really_inline base(const __m256i _value) : value(_value) {}

  simdutf_really_inline operator const __m256i &() const { return this->value; }

  template <endianness big_endian>
  simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
    __m256i first = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(*this));
    __m256i second = _mm256_cvtepu8_epi16(_mm256_extractf128_si256(*this, 1));
    if (big_endian) {
      const __m256i swap = _mm256_setr_epi8(
          1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
          21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
      first = _mm256_shuffle_epi8(first, swap);
      second = _mm256_shuffle_epi8(second, swap);
    }
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), first);
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 16), second);
  }

  simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const {
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr),
                        _mm256_cvtepu8_epi32(_mm256_castsi256_si128(*this)));
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 8),
                        _mm256_cvtepu8_epi32(_mm256_castsi256_si128(
                            _mm256_srli_si256(*this, 8))));
    _mm256_storeu_si256(
        reinterpret_cast<__m256i *>(ptr + 16),
        _mm256_cvtepu8_epi32(_mm256_extractf128_si256(*this, 1)));
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 24),
                        _mm256_cvtepu8_epi32(_mm_srli_si128(
                            _mm256_extractf128_si256(*this, 1), 8)));
  }
  // Bit operations
  simdutf_really_inline Child operator|(const Child other) const {
    return _mm256_or_si256(*this, other);
  }
  simdutf_really_inline Child operator&(const Child other) const {
    return _mm256_and_si256(*this, other);
  }
  simdutf_really_inline Child operator^(const Child other) const {
    return _mm256_xor_si256(*this, other);
  }
  simdutf_really_inline Child &operator|=(const Child other) {
    auto this_cast = static_cast<Child *>(this);
    *this_cast = *this_cast | other;
    return *this_cast;
  }
};

// Forward-declared so they can be used by splat and friends.
template <typename T> struct simd8;

template <typename T, typename Mask = simd8<bool>>
struct base8 : base<simd8<T>> {
  simdutf_really_inline base8() : base<simd8<T>>() {}

  simdutf_really_inline base8(const __m256i _value) : base<simd8<T>>(_value) {}

  friend simdutf_always_inline Mask operator==(const simd8<T> lhs,
                                               const simd8<T> rhs) {
    return _mm256_cmpeq_epi8(lhs, rhs);
  }

  static const int SIZE = sizeof(base<T>::value);

  template <int N = 1>
  simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
    return _mm256_alignr_epi8(
        *this, _mm256_permute2x128_si256(prev_chunk, *this, 0x21), 16 - N);
  }
};
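// Note on prev() above: _mm256_alignr_epi8 concatenates bytes only within
// each 128-bit lane, so it cannot shift a whole 256-bit vector by itself.
// _mm256_permute2x128_si256(prev_chunk, *this, 0x21) first builds the helper
// vector [high lane of prev_chunk, low lane of *this]; aligning *this against
// it with an offset of 16 - N then yields the current chunk shifted by N
// bytes, with the last N bytes of the previous chunk shifted in, as if the
// two 32-byte chunks were contiguous in memory.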

// SIMD byte mask type (returned by things like eq and gt)
template <> struct simd8<bool> : base8<bool> {
  static simdutf_really_inline simd8<bool> splat(bool _value) {
    return _mm256_set1_epi8(uint8_t(-(!!_value)));
  }

  simdutf_really_inline simd8(const __m256i _value) : base8<bool>(_value) {}

  simdutf_really_inline simd8(bool _value) : base8<bool>(splat(_value)) {}

  simdutf_really_inline uint32_t to_bitmask() const {
    return uint32_t(_mm256_movemask_epi8(value));
  }
};

template <typename T> struct base8_numeric : base8<T> {
  static simdutf_really_inline simd8<T> splat(T _value) {
    return _mm256_set1_epi8(_value);
  }
  static simdutf_really_inline simd8<T> zero() {
    return _mm256_setzero_si256();
  }
  static simdutf_really_inline simd8<T> load(const T values[32]) {
    return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values));
  }
  // Repeat 16 values as many times as necessary (usually for lookup tables)
  static simdutf_really_inline simd8<T> repeat_16(T v0, T v1, T v2, T v3, T v4,
                                                  T v5, T v6, T v7, T v8, T v9,
                                                  T v10, T v11, T v12, T v13,
                                                  T v14, T v15) {
    return simd8<T>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
                    v14, v15, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
                    v12, v13, v14, v15);
  }

  simdutf_really_inline base8_numeric() : base8<T>() {}
  simdutf_really_inline base8_numeric(const __m256i _value)
      : base8<T>(_value) {}

  // Store to array
  simdutf_really_inline void store(T dst[32]) const {
    return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this);
  }

  // Addition/subtraction are the same for signed and unsigned
  simdutf_really_inline simd8<T> operator-(const simd8<T> other) const {
    return _mm256_sub_epi8(*this, other);
  }
  simdutf_really_inline simd8<T> &operator-=(const simd8<T> other) {
    *this = *this - other;
    return *static_cast<simd8<T> *>(this);
  }

  // Override to distinguish from bool version
  simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }

  // Perform a lookup assuming the value is between 0 and 16 (undefined behavior
  // for out of range values)
  template <typename L>
  simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
    return _mm256_shuffle_epi8(lookup_table, *this);
  }

  template <typename L>
  simdutf_really_inline simd8<L>
  lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
            L replace5, L replace6, L replace7, L replace8, L replace9,
            L replace10, L replace11, L replace12, L replace13, L replace14,
            L replace15) const {
    return lookup_16(simd8<L>::repeat_16(
        replace0, replace1, replace2, replace3, replace4, replace5, replace6,
        replace7, replace8, replace9, replace10, replace11, replace12,
        replace13, replace14, replace15));
  }
};
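// Note on lookup_16 above: _mm256_shuffle_epi8 (vpshufb) indexes within each
// 128-bit lane independently, using only the low four bits of every byte (and
// zeroing the byte when the high bit of the index is set). That is why
// repeat_16 duplicates the 16-entry table into both lanes: each byte then
// finds the same table no matter which lane it sits in.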

// Signed bytes
template <> struct simd8<int8_t> : base8_numeric<int8_t> {
  simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
  simdutf_really_inline simd8(const __m256i _value)
      : base8_numeric<int8_t>(_value) {}

  // Splat constructor
  simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
  // Array constructor
  simdutf_really_inline simd8(const int8_t values[32]) : simd8(load(values)) {}
  simdutf_really_inline operator simd8<uint8_t>() const;

  simdutf_really_inline bool is_ascii() const {
    return _mm256_movemask_epi8(*this) == 0;
  }
  // Order-sensitive comparisons
  simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const {
    return _mm256_cmpgt_epi8(*this, other);
  }
  simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const {
    return _mm256_cmpgt_epi8(other, *this);
  }
};

// Unsigned bytes
template <> struct simd8<uint8_t> : base8_numeric<uint8_t> {
  simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
  simdutf_really_inline simd8(const __m256i _value)
      : base8_numeric<uint8_t>(_value) {}
  // Splat constructor
  simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
  // Array constructor
  simdutf_really_inline simd8(const uint8_t values[32]) : simd8(load(values)) {}
  // Member-by-member initialization
  simdutf_really_inline
  simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
        uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
        uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15,
        uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20,
        uint8_t v21, uint8_t v22, uint8_t v23, uint8_t v24, uint8_t v25,
        uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30,
        uint8_t v31)
      : simd8(_mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
                               v12, v13, v14, v15, v16, v17, v18, v19, v20, v21,
                               v22, v23, v24, v25, v26, v27, v28, v29, v30,
                               v31)) {}

  // Saturated math
  simdutf_really_inline simd8<uint8_t>
  saturating_sub(const simd8<uint8_t> other) const {
    return _mm256_subs_epu8(*this, other);
  }

  // Order-specific operations
  simdutf_really_inline simd8<uint8_t>
  min_val(const simd8<uint8_t> other) const {
    return _mm256_min_epu8(other, *this);
  }
  // Same as >, but only guarantees true is nonzero (> guarantees true = -1)
  simdutf_really_inline simd8<uint8_t>
  gt_bits(const simd8<uint8_t> other) const {
    return this->saturating_sub(other);
  }
  simdutf_really_inline simd8<bool>
  operator>=(const simd8<uint8_t> other) const {
    return other.min_val(*this) == other;
  }

  // Bit-specific operations
  simdutf_really_inline bool is_ascii() const {
    return _mm256_movemask_epi8(*this) == 0;
  }
  simdutf_really_inline bool bits_not_set_anywhere() const {
    return _mm256_testz_si256(*this, *this);
  }

  simdutf_really_inline bool any_bits_set_anywhere() const {
    return !bits_not_set_anywhere();
  }

  template <int N> simdutf_really_inline simd8<uint8_t> shr() const {
    return simd8<uint8_t>(_mm256_srli_epi16(*this, N)) & uint8_t(0xFFu >> N);
  }

  simdutf_really_inline uint64_t sum_bytes() const {
    const auto tmp = _mm256_sad_epu8(value, _mm256_setzero_si256());

    return _mm256_extract_epi64(tmp, 0) + _mm256_extract_epi64(tmp, 1) +
           _mm256_extract_epi64(tmp, 2) + _mm256_extract_epi64(tmp, 3);
  }
};
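// Note on sum_bytes above: _mm256_sad_epu8 against an all-zero vector
// computes, for each 64-bit lane, the sum of absolute differences of its
// eight bytes against zero, which is simply the sum of those bytes. Four
// extractions and scalar additions then complete the horizontal sum.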

simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const {
  return this->value;
}

template <typename T> struct simd8x64 {
  static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
  static_assert(NUM_CHUNKS == 2,
                "Haswell kernel should use two registers per 64-byte block.");
  simd8<T> chunks[NUM_CHUNKS];

  simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
  simd8x64<T> &
  operator=(const simd8<T> other) = delete; // no assignment allowed
  simd8x64() = delete;                      // no default constructor allowed

  simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1)
      : chunks{chunk0, chunk1} {}
  simdutf_really_inline simd8x64(const T *ptr)
      : chunks{simd8<T>::load(ptr),
               simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T))} {}

  simdutf_really_inline void store(T *ptr) const {
    this->chunks[0].store(ptr + sizeof(simd8<T>) * 0 / sizeof(T));
    this->chunks[1].store(ptr + sizeof(simd8<T>) * 1 / sizeof(T));
  }

  simdutf_really_inline uint64_t to_bitmask() const {
    uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
    uint64_t r_hi = this->chunks[1].to_bitmask();
    return r_lo | (r_hi << 32);
  }

  simdutf_really_inline simd8x64<T> &operator|=(const simd8x64<T> &other) {
    this->chunks[0] |= other.chunks[0];
    this->chunks[1] |= other.chunks[1];
    return *this;
  }

  simdutf_really_inline simd8<T> reduce_or() const {
    return this->chunks[0] | this->chunks[1];
  }

  simdutf_really_inline bool is_ascii() const {
    return this->reduce_or().is_ascii();
  }

  template <endianness endian>
  simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
    this->chunks[0].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 0);
    this->chunks[1].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 1);
  }

  simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const {
    this->chunks[0].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 0);
    this->chunks[1].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 1);
  }

  simdutf_really_inline uint64_t in_range(const T low, const T high) const {
    const simd8<T> mask_low = simd8<T>::splat(low);
    const simd8<T> mask_high = simd8<T>::splat(high);

    return simd8x64<bool>(
               (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
               (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low))
        .to_bitmask();
  }

  simdutf_really_inline uint64_t lt(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] < mask, this->chunks[1] < mask)
        .to_bitmask();
  }

  simdutf_really_inline uint64_t gt(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] > mask, this->chunks[1] > mask)
        .to_bitmask();
  }

  simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
    const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
    return simd8x64<bool>((simd8<uint8_t>(__m256i(this->chunks[0])) >= mask),
                          (simd8<uint8_t>(__m256i(this->chunks[1])) >= mask))
        .to_bitmask();
  }
}; // struct simd8x64<T>
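// Note on simd8x64 above: each 64-byte block is held in two AVX2 registers,
// and to_bitmask() concatenates the two 32-bit movemask results so that bit i
// of the returned uint64_t corresponds to byte i of the block. Predicates
// such as lt(), gt() and in_range() therefore report one bit per input byte,
// which callers can consume with scalar bit tricks (count_ones and friends).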

/* begin file src/simdutf/haswell/simd16-inl.h */
#ifdef __GNUC__
#if __GNUC__ < 8
#define _mm256_set_m128i(xmm1, xmm2)                                           \
  _mm256_permute2f128_si256(_mm256_castsi128_si256(xmm1),                      \
                            _mm256_castsi128_si256(xmm2), 2)
#define _mm256_setr_m128i(xmm2, xmm1)                                          \
  _mm256_permute2f128_si256(_mm256_castsi128_si256(xmm1),                      \
                            _mm256_castsi128_si256(xmm2), 2)
#endif
#endif

template <typename T> struct simd16;

template <typename T, typename Mask = simd16<bool>>
struct base16 : base<simd16<T>> {
  using bitmask_type = uint32_t;

  simdutf_really_inline base16() : base<simd16<T>>() {}
  simdutf_really_inline base16(const __m256i _value)
      : base<simd16<T>>(_value) {}
  template <typename Pointer>
  simdutf_really_inline base16(const Pointer *ptr)
      : base16(_mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr))) {}

  friend simdutf_always_inline Mask operator==(const simd16<T> lhs,
                                               const simd16<T> rhs) {
    return _mm256_cmpeq_epi16(lhs, rhs);
  }

  /// the size of vector in bytes
  static const int SIZE = sizeof(base<simd16<T>>::value);

  /// the number of elements of type T a vector can hold
  static const int ELEMENTS = SIZE / sizeof(T);
};

// SIMD byte mask type (returned by things like eq and gt)
template <> struct simd16<bool> : base16<bool> {
  static simdutf_really_inline simd16<bool> splat(bool _value) {
    return _mm256_set1_epi16(uint16_t(-(!!_value)));
  }

  simdutf_really_inline simd16() : base16() {}

  simdutf_really_inline simd16(const __m256i _value) : base16<bool>(_value) {}

  // Splat constructor
  simdutf_really_inline simd16(bool _value) : base16<bool>(splat(_value)) {}

  simdutf_really_inline bitmask_type to_bitmask() const {
    return _mm256_movemask_epi8(*this);
  }

  simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; }
};

template <typename T> struct base16_numeric : base16<T> {
  static simdutf_really_inline simd16<T> splat(T _value) {
    return _mm256_set1_epi16(_value);
  }

  static simdutf_really_inline simd16<T> zero() {
    return _mm256_setzero_si256();
  }

  static simdutf_really_inline simd16<T> load(const T values[8]) {
    return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values));
  }

  simdutf_really_inline base16_numeric() : base16<T>() {}

  simdutf_really_inline base16_numeric(const __m256i _value)
      : base16<T>(_value) {}

  // Store to array
  simdutf_really_inline void store(T dst[8]) const {
    return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this);
  }

  // Override to distinguish from bool version
  simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFFFu; }

  // Addition/subtraction are the same for signed and unsigned
  simdutf_really_inline simd16<T> operator+(const simd16<T> other) const {
    return _mm256_add_epi16(*this, other);
  }
  simdutf_really_inline simd16<T> &operator+=(const simd16<T> other) {
    *this = *this + other;
    return *static_cast<simd16<T> *>(this);
  }
};

// Unsigned code units
template <> struct simd16<uint16_t> : base16_numeric<uint16_t> {
  simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
  simdutf_really_inline simd16(const __m256i _value)
      : base16_numeric<uint16_t>(_value) {}

  // Splat constructor
  simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
  // Array constructor
  simdutf_really_inline simd16(const uint16_t *values) : simd16(load(values)) {}
  simdutf_really_inline simd16(const char16_t *values)
      : simd16(load(reinterpret_cast<const uint16_t *>(values))) {}

  // Order-specific operations
  simdutf_really_inline simd16<uint16_t>
  max_val(const simd16<uint16_t> other) const {
    return _mm256_max_epu16(*this, other);
  }
  simdutf_really_inline simd16<uint16_t>
  min_val(const simd16<uint16_t> other) const {
    return _mm256_min_epu16(*this, other);
  }
  // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
  simdutf_really_inline simd16<bool>
  operator<=(const simd16<uint16_t> other) const {
    return other.max_val(*this) == other;
  }
  simdutf_really_inline simd16<bool>
  operator>=(const simd16<uint16_t> other) const {
    return other.min_val(*this) == other;
  }

  // Bit-specific operations
  simdutf_really_inline simd16<bool> bits_not_set() const {
    return *this == uint16_t(0);
  }

  simdutf_really_inline simd16<bool> any_bits_set() const {
    return ~this->bits_not_set();
  }

  template <int N> simdutf_really_inline simd16<uint16_t> shr() const {
    return simd16<uint16_t>(_mm256_srli_epi16(*this, N));
  }

  // Change the endianness
  simdutf_really_inline simd16<uint16_t> swap_bytes() const {
    const __m256i swap = _mm256_setr_epi8(
        1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
        21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
    return _mm256_shuffle_epi8(*this, swap);
  }

  // Pack, with unsigned saturation, two vectors of uint16_t code units into a
  // single uint8_t vector
  static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t> &v0,
                                                   const simd16<uint16_t> &v1) {
    // Note: the AVX2 variant of pack operates on 128-bit lanes, thus
    // we have to shuffle lanes in order to produce bytes in the
    // correct order.

    // get the 0th lanes
    const __m128i lo_0 = _mm256_extracti128_si256(v0, 0);
    const __m128i lo_1 = _mm256_extracti128_si256(v1, 0);

    // get the 1st lanes
    const __m128i hi_0 = _mm256_extracti128_si256(v0, 1);
    const __m128i hi_1 = _mm256_extracti128_si256(v1, 1);

    // build new vectors (shuffle lanes)
    const __m256i t0 = _mm256_set_m128i(lo_1, lo_0);
    const __m256i t1 = _mm256_set_m128i(hi_1, hi_0);

    // pack code units in linear order from v0 and v1
    return _mm256_packus_epi16(t0, t1);
  }
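  // Note on pack() above: _mm256_packus_epi16 packs within each 128-bit lane,
  // so packing v0 and v1 directly would interleave their lanes as
  // [v0.lo, v1.lo, v0.hi, v1.hi]. Regrouping the four lanes into t0 and t1
  // first makes the bytes come out in linear order:
  // pack(t0, t1) == [v0.lo, v0.hi, v1.lo, v1.hi].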

  simdutf_really_inline uint64_t sum() const {
    const auto lo_u16 = _mm256_and_si256(value, _mm256_set1_epi32(0x0000ffff));
    const auto hi_u16 = _mm256_srli_epi32(value, 16);
    const auto sum_u32 = _mm256_add_epi32(lo_u16, hi_u16);

    const auto lo_u32 =
        _mm256_and_si256(sum_u32, _mm256_set1_epi64x(0xffffffff));
    const auto hi_u32 = _mm256_srli_epi64(sum_u32, 32);
    const auto sum_u64 = _mm256_add_epi64(lo_u32, hi_u32);

    return uint64_t(_mm256_extract_epi64(sum_u64, 0)) +
           uint64_t(_mm256_extract_epi64(sum_u64, 1)) +
           uint64_t(_mm256_extract_epi64(sum_u64, 2)) +
           uint64_t(_mm256_extract_epi64(sum_u64, 3));
  }
};

template <typename T> struct simd16x32 {
  static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
  static_assert(NUM_CHUNKS == 2,
                "Haswell kernel should use two registers per 64-byte block.");
  simd16<T> chunks[NUM_CHUNKS];

  simd16x32(const simd16x32<T> &o) = delete; // no copy allowed
  simd16x32<T> &
  operator=(const simd16<T> other) = delete; // no assignment allowed
  simd16x32() = delete;                      // no default constructor allowed

  simdutf_really_inline simd16x32(const simd16<T> chunk0,
                                  const simd16<T> chunk1)
      : chunks{chunk0, chunk1} {}
  simdutf_really_inline simd16x32(const T *ptr)
      : chunks{simd16<T>::load(ptr),
               simd16<T>::load(ptr + sizeof(simd16<T>) / sizeof(T))} {}

  simdutf_really_inline void store(T *ptr) const {
    this->chunks[0].store(ptr + sizeof(simd16<T>) * 0 / sizeof(T));
    this->chunks[1].store(ptr + sizeof(simd16<T>) * 1 / sizeof(T));
  }

  simdutf_really_inline uint64_t to_bitmask() const {
    uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
    uint64_t r_hi = this->chunks[1].to_bitmask();
    return r_lo | (r_hi << 32);
  }

  simdutf_really_inline simd16<T> reduce_or() const {
    return this->chunks[0] | this->chunks[1];
  }

  simdutf_really_inline bool is_ascii() const {
    return this->reduce_or().is_ascii();
  }

  simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
    this->chunks[0].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 0);
    this->chunks[1].store_ascii_as_utf16(ptr + sizeof(simd16<T>));
  }

  simdutf_really_inline void swap_bytes() {
    this->chunks[0] = this->chunks[0].swap_bytes();
    this->chunks[1] = this->chunks[1].swap_bytes();
  }

  simdutf_really_inline uint64_t lteq(const T m) const {
    const simd16<T> mask = simd16<T>::splat(m);
    return simd16x32<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask)
        .to_bitmask();
  }

  simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
    const simd16<T> mask_low = simd16<T>::splat(static_cast<T>(low - 1));
    const simd16<T> mask_high = simd16<T>::splat(static_cast<T>(high + 1));
    return simd16x32<bool>(
               (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
               (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low))
        .to_bitmask();
  }
}; // struct simd16x32<T>
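// Note on not_in_range above: rather than computing (x < low) | (x > high)
// directly, it widens the bounds to low - 1 and high + 1 so that the
// inclusive comparisons >= and <=, which AVX2 derives cheaply from unsigned
// min/max, can be used. Mind the unsigned wraparound when low == 0: low - 1
// becomes the maximum value and the test then flags every code unit, so this
// helper is only meaningful for low >= 1.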
|
|
|
|
simd16<uint16_t> min(const simd16<uint16_t> a, simd16<uint16_t> b) {
|
|
return _mm256_min_epu16(a.value, b.value);
|
|
}
|
|
/* end file src/simdutf/haswell/simd16-inl.h */
|
|
/* begin file src/simdutf/haswell/simd32-inl.h */
|
|
template <typename T> struct simd32;
|
|
|
|
template <> struct simd32<uint32_t> {
|
|
static const size_t SIZE = sizeof(__m256i);
|
|
static const size_t ELEMENTS = SIZE / sizeof(uint32_t);
|
|
|
|
__m256i value;
|
|
|
|
simdutf_really_inline simd32(const __m256i v) : value(v) {}
|
|
|
|
template <typename Pointer>
|
|
simdutf_really_inline simd32(const Pointer *ptr)
|
|
: value(_mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr))) {}
|
|
|
|
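  // Sums the eight 32-bit lanes: even and odd lanes are separated with a
  // mask and a shift so each 64-bit partial sum cannot overflow.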
simdutf_really_inline uint64_t sum() const {
    const __m256i mask = _mm256_set1_epi64x(0xffffffff);
    const __m256i t0 = _mm256_and_si256(value, mask);
    const __m256i t1 = _mm256_srli_epi64(value, 32);
    const __m256i t2 = _mm256_add_epi64(t0, t1);

    return uint64_t(_mm256_extract_epi64(t2, 0)) +
           uint64_t(_mm256_extract_epi64(t2, 1)) +
           uint64_t(_mm256_extract_epi64(t2, 2)) +
           uint64_t(_mm256_extract_epi64(t2, 3));
  }

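  // The 16-byte shuffle pattern is repeated because _mm256_shuffle_epi8
  // shuffles each 128-bit half independently.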
simdutf_really_inline simd32<uint32_t> swap_bytes() const {
    const __m256i shuffle =
        _mm256_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 15, 14, 13, 12,
                         3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 15, 14, 13, 12);

    return _mm256_shuffle_epi8(value, shuffle);
  }

  // operators
  simdutf_really_inline simd32 &operator+=(const simd32 other) {
    value = _mm256_add_epi32(value, other.value);
    return *this;
  }

  // static members
  simdutf_really_inline static simd32<uint32_t> zero() {
    return _mm256_setzero_si256();
  }

  simdutf_really_inline static simd32<uint32_t> splat(uint32_t v) {
    return _mm256_set1_epi32(v);
  }
};

//----------------------------------------------------------------------

template <> struct simd32<bool> {
  // static const size_t SIZE = sizeof(__m128i);
  // static const size_t ELEMENTS = SIZE / sizeof(uint32_t);

  __m256i value;

  simdutf_really_inline simd32(const __m256i v) : value(v) {}

  simdutf_really_inline bool any() const {
    return _mm256_movemask_epi8(value) != 0;
  }
};

//----------------------------------------------------------------------

template <typename T>
simdutf_really_inline simd32<T> operator|(const simd32<T> a,
                                          const simd32<T> b) {
  return _mm256_or_si256(a.value, b.value);
}

simdutf_really_inline simd32<uint32_t> min(const simd32<uint32_t> a,
                                           const simd32<uint32_t> b) {
  return _mm256_min_epu32(a.value, b.value);
}

simdutf_really_inline simd32<uint32_t> max(const simd32<uint32_t> a,
                                           const simd32<uint32_t> b) {
  return _mm256_max_epu32(a.value, b.value);
}

simdutf_really_inline simd32<uint32_t> operator&(const simd32<uint32_t> a,
                                                 const simd32<uint32_t> b) {
  return _mm256_and_si256(a.value, b.value);
}

simdutf_really_inline simd32<uint32_t> operator+(const simd32<uint32_t> a,
                                                 const simd32<uint32_t> b) {
  return _mm256_add_epi32(a.value, b.value);
}

simdutf_really_inline simd32<bool> operator==(const simd32<uint32_t> a,
                                              const simd32<uint32_t> b) {
  return _mm256_cmpeq_epi32(a.value, b.value);
}

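// AVX2 has no unsigned 32-bit comparison, so a >= b is computed as
// max(a, b) == a.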
simdutf_really_inline simd32<bool> operator>=(const simd32<uint32_t> a,
                                              const simd32<uint32_t> b) {
  return _mm256_cmpeq_epi32(_mm256_max_epu32(a.value, b.value), a.value);
}

simdutf_really_inline simd32<bool> operator!(const simd32<bool> v) {
  return _mm256_xor_si256(v.value, _mm256_set1_epi8(-1));
}

simdutf_really_inline simd32<bool> operator>(const simd32<uint32_t> a,
                                             const simd32<uint32_t> b) {
  return !(b >= a);
}
/* end file src/simdutf/haswell/simd32-inl.h */
/* begin file src/simdutf/haswell/simd64-inl.h */
template <typename T> struct simd64;

template <> struct simd64<uint64_t> {
  // static const size_t SIZE = sizeof(__m256i);
  // static const size_t ELEMENTS = SIZE / sizeof(uint64_t);

  __m256i value;

  simdutf_really_inline simd64(const __m256i v) : value(v) {}

  template <typename Pointer>
  simdutf_really_inline simd64(const Pointer *ptr)
      : value(_mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr))) {}

  simdutf_really_inline uint64_t sum() const {
    return _mm256_extract_epi64(value, 0) + _mm256_extract_epi64(value, 1) +
           _mm256_extract_epi64(value, 2) + _mm256_extract_epi64(value, 3);
  }

  // operators
  simdutf_really_inline simd64 &operator+=(const simd64 other) {
    value = _mm256_add_epi64(value, other.value);
    return *this;
  }

  // static members
  simdutf_really_inline static simd64<uint64_t> zero() {
    return _mm256_setzero_si256();
  }

  simdutf_really_inline static simd64<uint64_t> splat(uint64_t v) {
    return _mm256_set1_epi64x(v);
  }
};
/* end file src/simdutf/haswell/simd64-inl.h */

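// _mm256_sad_epu8 against zero adds each group of eight bytes into the
// corresponding 64-bit lane, producing four partial byte sums.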
simdutf_really_inline simd64<uint64_t> sum_8bytes(const simd8<uint8_t> v) {
  return _mm256_sad_epu8(v.value, simd8<uint8_t>::zero());
}

} // namespace simd

} // unnamed namespace
} // namespace haswell
} // namespace simdutf

#endif // SIMDUTF_HASWELL_SIMD_H
/* end file src/simdutf/haswell/simd.h */

/* begin file src/simdutf/haswell/end.h */
#if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
// nothing needed.
#else
SIMDUTF_UNTARGET_REGION
#endif

#undef SIMDUTF_SIMD_HAS_BYTEMASK

#if SIMDUTF_GCC11ORMORE // workaround for
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
SIMDUTF_POP_DISABLE_WARNINGS
#endif // end of workaround
/* end file src/simdutf/haswell/end.h */

#endif // SIMDUTF_IMPLEMENTATION_HASWELL
#endif // SIMDUTF_HASWELL_COMMON_H
/* end file src/simdutf/haswell.h */
/* begin file src/simdutf/westmere.h */
#ifndef SIMDUTF_WESTMERE_H
#define SIMDUTF_WESTMERE_H

#ifdef SIMDUTF_FALLBACK_H
#error "westmere.h must be included before fallback.h"
#endif


// Default Westmere to on if this is x86-64, unless we'll always select Haswell.
#ifndef SIMDUTF_IMPLEMENTATION_WESTMERE
//
// You do not want to set it to (SIMDUTF_IS_X86_64 &&
// !SIMDUTF_REQUIRES_HASWELL) because you want to rely on runtime dispatch!
//
#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE || SIMDUTF_CAN_ALWAYS_RUN_HASWELL
#define SIMDUTF_IMPLEMENTATION_WESTMERE 0
#else
#define SIMDUTF_IMPLEMENTATION_WESTMERE (SIMDUTF_IS_X86_64)
#endif

#endif

#if (SIMDUTF_IMPLEMENTATION_WESTMERE && SIMDUTF_IS_X86_64 && __SSE4_2__)
#define SIMDUTF_CAN_ALWAYS_RUN_WESTMERE 1
#else
#define SIMDUTF_CAN_ALWAYS_RUN_WESTMERE 0
#endif

#if SIMDUTF_IMPLEMENTATION_WESTMERE

#define SIMDUTF_TARGET_WESTMERE SIMDUTF_TARGET_REGION("sse4.2,popcnt")

namespace simdutf {
/**
 * Implementation for Westmere (Intel SSE4.2).
 */
namespace westmere {} // namespace westmere
} // namespace simdutf

//
// These two need to be included outside SIMDUTF_TARGET_REGION
//
/* begin file src/simdutf/westmere/implementation.h */
#ifndef SIMDUTF_WESTMERE_IMPLEMENTATION_H
#define SIMDUTF_WESTMERE_IMPLEMENTATION_H


// The constructor may be executed on any host, so we take care not to use
// SIMDUTF_TARGET_REGION
namespace simdutf {
namespace westmere {

namespace {
using namespace simdutf;
}

class implementation final : public simdutf::implementation {
public:
  simdutf_really_inline implementation()
      : simdutf::implementation("westmere", "Intel/AMD SSE4.2",
                                internal::instruction_set::SSE42) {}

#if SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused int detect_encodings(const char *input,
                                           size_t length) const noexcept final;
#endif // SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused bool validate_utf8(const char *buf,
                                         size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF8
  simdutf_warn_unused result
  validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_ASCII
  simdutf_warn_unused bool validate_ascii(const char *buf,
                                          size_t len) const noexcept final;
  simdutf_warn_unused result
  validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_ASCII

#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
                                            size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
                                            size_t len) const noexcept final;
  simdutf_warn_unused result validate_utf16le_with_errors(
      const char16_t *buf, size_t len) const noexcept final;
  simdutf_warn_unused result validate_utf16be_with_errors(
      const char16_t *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused bool validate_utf32(const char32_t *buf,
                                          size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused result validate_utf32_with_errors(
      const char32_t *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_latin1_to_utf8(
      const char *buf, size_t len, char *utf8_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_latin1_to_utf16le(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_latin1_to_utf16be(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_latin1_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
      const char *buf, size_t len, char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t convert_utf8_to_utf16le(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused size_t convert_utf8_to_utf16be(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t convert_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  convert_utf16le_to_latin1(const char16_t *buf, size_t len,
                            char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf16be_to_latin1(const char16_t *buf, size_t len,
                            char *latin1_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
      const char16_t *buf, size_t len,
      char *latin1_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
      const char16_t *buf, size_t len,
      char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len,
                                  char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len,
                                  char *latin1_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t convert_utf16le_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_utf16be_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t convert_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  convert_utf32_to_latin1(const char32_t *buf, size_t len,
                          char *latin1_output) const noexcept final;
  simdutf_warn_unused result
  convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
                                      char *latin1_output) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_latin1(const char32_t *buf, size_t len,
                                char *latin1_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  convert_utf32_to_utf16le(const char32_t *buf, size_t len,
                           char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf32_to_utf16be(const char32_t *buf, size_t len,
                           char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
      const char32_t *buf, size_t len,
      char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
      const char32_t *buf, size_t len,
      char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len,
                                 char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len,
                                 char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf16le_to_utf32(const char16_t *buf, size_t len,
                           char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf16be_to_utf32(const char16_t *buf, size_t len,
                           char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
      const char16_t *buf, size_t len,
      char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
      const char16_t *buf, size_t len,
      char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len,
                                 char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len,
                                 char32_t *utf32_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16
  void change_endianness_utf16(const char16_t *buf, size_t length,
                               char16_t *output) const noexcept final;
  simdutf_warn_unused size_t count_utf16le(const char16_t *buf,
                                           size_t length) const noexcept;
  simdutf_warn_unused size_t count_utf16be(const char16_t *buf,
                                           size_t length) const noexcept;
  void to_well_formed_utf16be(const char16_t *input, size_t len,
                              char16_t *output) const noexcept final;
  void to_well_formed_utf16le(const char16_t *input, size_t len,
                              char16_t *output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8
  simdutf_warn_unused size_t count_utf8(const char *buf,
                                        size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t
  utf8_length_from_utf16le(const char16_t *input, size_t length) const noexcept;
  simdutf_warn_unused size_t
  utf8_length_from_utf16be(const char16_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t utf32_length_from_utf16le(
      const char16_t *input, size_t length) const noexcept;
  simdutf_warn_unused size_t utf32_length_from_utf16be(
      const char16_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t
  utf16_length_from_utf8(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  utf8_length_from_utf32(const char32_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  utf16_length_from_utf32(const char32_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  utf32_length_from_utf8(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  latin1_length_from_utf8(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  utf8_length_from_latin1(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_BASE64
  simdutf_warn_unused result base64_to_binary(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept;
  simdutf_warn_unused result
  base64_to_binary(const char16_t *input, size_t length, char *output,
                   base64_options options,
                   last_chunk_handling_options last_chunk_options =
                       last_chunk_handling_options::loose) const noexcept;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept;
  size_t binary_to_base64(const char *input, size_t length, char *output,
                          base64_options options) const noexcept;
#endif // SIMDUTF_FEATURE_BASE64
};

} // namespace westmere
} // namespace simdutf

#endif // SIMDUTF_WESTMERE_IMPLEMENTATION_H
/* end file src/simdutf/westmere/implementation.h */
/* begin file src/simdutf/westmere/intrinsics.h */
#ifndef SIMDUTF_WESTMERE_INTRINSICS_H
#define SIMDUTF_WESTMERE_INTRINSICS_H

#ifdef SIMDUTF_VISUAL_STUDIO
// under clang within visual studio, this will include <x86intrin.h>
#include <intrin.h> // visual studio or clang
#else

#if SIMDUTF_GCC11ORMORE
// We should not get warnings while including <x86intrin.h> yet we do
// under some versions of GCC.
// If the x86intrin.h header has uninitialized values that are problematic,
// it is a GCC issue, we want to ignore these warnings.
SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
#endif

#include <x86intrin.h> // elsewhere

#if SIMDUTF_GCC11ORMORE
// cancels the suppression of the -Wuninitialized
SIMDUTF_POP_DISABLE_WARNINGS
#endif

#endif // SIMDUTF_VISUAL_STUDIO

#ifdef SIMDUTF_CLANG_VISUAL_STUDIO
/**
 * You are not supposed, normally, to include these
 * headers directly. Instead you should either include intrin.h
 * or x86intrin.h. However, when compiling with clang
 * under Windows (i.e., when _MSC_VER is set), these headers
 * only get included *if* the corresponding features are detected
 * from macros:
 */
#include <smmintrin.h> // for _mm_alignr_epi8
#endif

#endif // SIMDUTF_WESTMERE_INTRINSICS_H
/* end file src/simdutf/westmere/intrinsics.h */

//
// The rest need to be inside the region
//
/* begin file src/simdutf/westmere/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "westmere"
// #define SIMDUTF_IMPLEMENTATION westmere
#define SIMDUTF_SIMD_HAS_BYTEMASK 1

#if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
// nothing needed.
#else
SIMDUTF_TARGET_WESTMERE
#endif
/* end file src/simdutf/westmere/begin.h */

// Declarations
/* begin file src/simdutf/westmere/bitmanipulation.h */
#ifndef SIMDUTF_WESTMERE_BITMANIPULATION_H
#define SIMDUTF_WESTMERE_BITMANIPULATION_H

namespace simdutf {
namespace westmere {
namespace {

#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
  // note: we do not support legacy 32-bit Windows
  return __popcnt64(input_num); // Visual Studio wants two underscores
}
#else
simdutf_really_inline long long int count_ones(uint64_t input_num) {
  return _popcnt64(input_num);
}
#endif

#if SIMDUTF_NEED_TRAILING_ZEROES
simdutf_really_inline int trailing_zeroes(uint64_t input_num) {
#if SIMDUTF_REGULAR_VISUAL_STUDIO
  unsigned long ret;
  _BitScanForward64(&ret, input_num);
  return (int)ret;
#else // SIMDUTF_REGULAR_VISUAL_STUDIO
  return __builtin_ctzll(input_num);
#endif // SIMDUTF_REGULAR_VISUAL_STUDIO
}
#endif

template <typename T> bool is_power_of_two(T x) { return (x & (x - 1)) == 0; }

} // unnamed namespace
} // namespace westmere
} // namespace simdutf

#endif // SIMDUTF_WESTMERE_BITMANIPULATION_H
/* end file src/simdutf/westmere/bitmanipulation.h */
/* begin file src/simdutf/westmere/simd.h */
#ifndef SIMDUTF_WESTMERE_SIMD_H
#define SIMDUTF_WESTMERE_SIMD_H

namespace simdutf {
namespace westmere {
namespace {
namespace simd {

template <typename Child> struct base {
  __m128i value;

  // Zero constructor
  simdutf_really_inline base() : value{__m128i()} {}

  // Conversion from SIMD register
  simdutf_really_inline base(const __m128i _value) : value(_value) {}
  // Conversion to SIMD register
  simdutf_really_inline operator const __m128i &() const { return this->value; }
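  // Zero-extends the 16 ASCII bytes into 16 UTF-16 code units, byte-swapping
  // each unit when the target is big-endian.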
template <endianness big_endian>
  simdutf_really_inline void store_ascii_as_utf16(char16_t *p) const {
    __m128i first = _mm_cvtepu8_epi16(*this);
    __m128i second = _mm_cvtepu8_epi16(_mm_srli_si128(*this, 8));
    if (big_endian) {
      const __m128i swap =
          _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
      first = _mm_shuffle_epi8(first, swap);
      second = _mm_shuffle_epi8(second, swap);
    }
    _mm_storeu_si128(reinterpret_cast<__m128i *>(p), first);
    _mm_storeu_si128(reinterpret_cast<__m128i *>(p + 8), second);
  }
  simdutf_really_inline void store_ascii_as_utf32(char32_t *p) const {
    _mm_storeu_si128(reinterpret_cast<__m128i *>(p), _mm_cvtepu8_epi32(*this));
    _mm_storeu_si128(reinterpret_cast<__m128i *>(p + 4),
                     _mm_cvtepu8_epi32(_mm_srli_si128(*this, 4)));
    _mm_storeu_si128(reinterpret_cast<__m128i *>(p + 8),
                     _mm_cvtepu8_epi32(_mm_srli_si128(*this, 8)));
    _mm_storeu_si128(reinterpret_cast<__m128i *>(p + 12),
                     _mm_cvtepu8_epi32(_mm_srli_si128(*this, 12)));
  }
  // Bit operations
  simdutf_really_inline Child operator|(const Child other) const {
    return _mm_or_si128(*this, other);
  }
  simdutf_really_inline Child operator&(const Child other) const {
    return _mm_and_si128(*this, other);
  }
  simdutf_really_inline Child operator^(const Child other) const {
    return _mm_xor_si128(*this, other);
  }
  simdutf_really_inline Child &operator|=(const Child other) {
    auto this_cast = static_cast<Child *>(this);
    *this_cast = *this_cast | other;
    return *this_cast;
  }
};

// Forward-declared so they can be used by splat and friends.
template <typename T> struct simd8;

template <typename T, typename Mask = simd8<bool>>
struct base8 : base<simd8<T>> {
  typedef uint16_t bitmask_t;
  typedef uint32_t bitmask2_t;

  simdutf_really_inline T first() const { return _mm_extract_epi8(*this, 0); }
  simdutf_really_inline T last() const { return _mm_extract_epi8(*this, 15); }
  simdutf_really_inline base8() : base<simd8<T>>() {}
  simdutf_really_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}

  friend simdutf_really_inline Mask operator==(const simd8<T> lhs,
                                               const simd8<T> rhs) {
    return _mm_cmpeq_epi8(lhs, rhs);
  }

  static const int SIZE = sizeof(base<simd8<T>>::value);

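  // Brings in the last N bytes of prev_chunk ahead of this vector: a sliding
  // window over two consecutive 16-byte blocks.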
template <int N = 1>
  simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
    return _mm_alignr_epi8(*this, prev_chunk, 16 - N);
  }
};

// SIMD byte mask type (returned by things like eq and gt)
template <> struct simd8<bool> : base8<bool> {
  static simdutf_really_inline simd8<bool> splat(bool _value) {
    return _mm_set1_epi8(uint8_t(-(!!_value)));
  }

  simdutf_really_inline simd8() : base8() {}
  simdutf_really_inline simd8(const __m128i _value) : base8<bool>(_value) {}
  // Splat constructor
  simdutf_really_inline simd8(bool _value) : base8<bool>(splat(_value)) {}

  simdutf_really_inline int to_bitmask() const {
    return _mm_movemask_epi8(*this);
  }
  simdutf_really_inline simd8<bool> operator~() const { return *this ^ true; }
};

template <typename T> struct base8_numeric : base8<T> {
  static simdutf_really_inline simd8<T> splat(T _value) {
    return _mm_set1_epi8(_value);
  }
  static simdutf_really_inline simd8<T> zero() { return _mm_setzero_si128(); }
  static simdutf_really_inline simd8<T> load(const T values[16]) {
    return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
  }
  // Repeat 16 values as many times as necessary (usually for lookup tables)
  static simdutf_really_inline simd8<T> repeat_16(T v0, T v1, T v2, T v3, T v4,
                                                  T v5, T v6, T v7, T v8, T v9,
                                                  T v10, T v11, T v12, T v13,
                                                  T v14, T v15) {
    return simd8<T>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
                    v14, v15);
  }

  simdutf_really_inline base8_numeric() : base8<T>() {}
  simdutf_really_inline base8_numeric(const __m128i _value)
      : base8<T>(_value) {}

  // Store to array
  simdutf_really_inline void store(T dst[16]) const {
    return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this);
  }

  // Override to distinguish from bool version
  simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }

  // Addition/subtraction are the same for signed and unsigned
  simdutf_really_inline simd8<T> operator-(const simd8<T> other) const {
    return _mm_sub_epi8(*this, other);
  }
  simdutf_really_inline simd8<T> &operator-=(const simd8<T> other) {
    *this = *this - other;
    return *static_cast<simd8<T> *>(this);
  }

  // Perform a lookup assuming the value is between 0 and 16 (undefined behavior
  // for out of range values)
  template <typename L>
  simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
    return _mm_shuffle_epi8(lookup_table, *this);
  }

  template <typename L>
  simdutf_really_inline simd8<L>
  lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
            L replace5, L replace6, L replace7, L replace8, L replace9,
            L replace10, L replace11, L replace12, L replace13, L replace14,
            L replace15) const {
    return lookup_16(simd8<L>::repeat_16(
        replace0, replace1, replace2, replace3, replace4, replace5, replace6,
        replace7, replace8, replace9, replace10, replace11, replace12,
        replace13, replace14, replace15));
  }
};

// Signed bytes
template <> struct simd8<int8_t> : base8_numeric<int8_t> {
  simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
  simdutf_really_inline simd8(const __m128i _value)
      : base8_numeric<int8_t>(_value) {}
  // Splat constructor
  simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
  // Member-by-member initialization
  simdutf_really_inline operator simd8<uint8_t>() const;
  simdutf_really_inline bool is_ascii() const {
    return _mm_movemask_epi8(*this) == 0;
  }

  // Order-sensitive comparisons
  simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const {
    return _mm_cmpgt_epi8(*this, other);
  }
  simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const {
    return _mm_cmpgt_epi8(other, *this);
  }
};

// Unsigned bytes
template <> struct simd8<uint8_t> : base8_numeric<uint8_t> {
  simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
  simdutf_really_inline simd8(const __m128i _value)
      : base8_numeric<uint8_t>(_value) {}

  // Splat constructor
  simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
  // Array constructor
  simdutf_really_inline simd8(const uint8_t *values) : simd8(load(values)) {}
  // Member-by-member initialization
  simdutf_really_inline
  simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
        uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
        uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
      : simd8(_mm_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
                            v12, v13, v14, v15)) {}

  // Saturated math
  simdutf_really_inline simd8<uint8_t>
  saturating_sub(const simd8<uint8_t> other) const {
    return _mm_subs_epu8(*this, other);
  }

  // Order-specific operations
  simdutf_really_inline simd8<uint8_t>
  min_val(const simd8<uint8_t> other) const {
    return _mm_min_epu8(*this, other);
  }
  // Same as >, but only guarantees true is nonzero (> guarantees true = -1)
  simdutf_really_inline simd8<uint8_t>
  gt_bits(const simd8<uint8_t> other) const {
    return this->saturating_sub(other);
  }
  // True if *this >= other (unsigned comparison): min with other equals other
  simdutf_really_inline simd8<bool>
  operator>=(const simd8<uint8_t> other) const {
    return other.min_val(*this) == other;
  }

  // Bit-specific operations
  simdutf_really_inline simd8<bool> bits_not_set() const {
    return *this == uint8_t(0);
  }
  simdutf_really_inline simd8<bool> any_bits_set() const {
    return ~this->bits_not_set();
  }
  simdutf_really_inline bool is_ascii() const {
    return _mm_movemask_epi8(*this) == 0;
  }

  simdutf_really_inline bool bits_not_set_anywhere() const {
    return _mm_testz_si128(*this, *this);
  }
  simdutf_really_inline bool any_bits_set_anywhere() const {
    return !bits_not_set_anywhere();
  }
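  // SSE has no 8-bit shifts: shift the 16-bit lanes and mask away the bits
  // that crossed a byte boundary.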
template <int N> simdutf_really_inline simd8<uint8_t> shr() const {
    return simd8<uint8_t>(_mm_srli_epi16(*this, N)) & uint8_t(0xFFu >> N);
  }
  template <int N> simdutf_really_inline simd8<uint8_t> shl() const {
    return simd8<uint8_t>(_mm_slli_epi16(*this, N)) & uint8_t(0xFFu << N);
  }

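  // _mm_sad_epu8 against zero produces two 64-bit partial sums of eight
  // bytes each; adding them yields the total of all 16 bytes.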
simdutf_really_inline uint64_t sum_bytes() const {
    const auto tmp = _mm_sad_epu8(value, _mm_setzero_si128());
    return _mm_extract_epi64(tmp, 0) + _mm_extract_epi64(tmp, 1);
  }
};

simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const {
  return this->value;
}

template <typename T> struct simd8x64 {
  static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
  static_assert(NUM_CHUNKS == 4,
                "Westmere kernel should use four registers per 64-byte block.");
  simd8<T> chunks[NUM_CHUNKS];

  simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
  simd8x64<T> &
  operator=(const simd8<T> other) = delete; // no assignment allowed
  simd8x64() = delete; // no default constructor allowed

  simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1,
                                 const simd8<T> chunk2, const simd8<T> chunk3)
      : chunks{chunk0, chunk1, chunk2, chunk3} {}
  simdutf_really_inline simd8x64(const T *ptr)
      : chunks{simd8<T>::load(ptr),
               simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T)),
               simd8<T>::load(ptr + 2 * sizeof(simd8<T>) / sizeof(T)),
               simd8<T>::load(ptr + 3 * sizeof(simd8<T>) / sizeof(T))} {}

  simdutf_really_inline void store(T *ptr) const {
    this->chunks[0].store(ptr + sizeof(simd8<T>) * 0 / sizeof(T));
    this->chunks[1].store(ptr + sizeof(simd8<T>) * 1 / sizeof(T));
    this->chunks[2].store(ptr + sizeof(simd8<T>) * 2 / sizeof(T));
    this->chunks[3].store(ptr + sizeof(simd8<T>) * 3 / sizeof(T));
  }

  simdutf_really_inline simd8x64<T> &operator|=(const simd8x64<T> &other) {
    this->chunks[0] |= other.chunks[0];
    this->chunks[1] |= other.chunks[1];
    this->chunks[2] |= other.chunks[2];
    this->chunks[3] |= other.chunks[3];
    return *this;
  }

  simdutf_really_inline simd8<T> reduce_or() const {
    return (this->chunks[0] | this->chunks[1]) |
           (this->chunks[2] | this->chunks[3]);
  }

  simdutf_really_inline bool is_ascii() const {
    return this->reduce_or().is_ascii();
  }

  template <endianness endian>
  simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
    this->chunks[0].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 0);
    this->chunks[1].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 1);
    this->chunks[2].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 2);
    this->chunks[3].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 3);
  }

  simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const {
    this->chunks[0].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 0);
    this->chunks[1].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 1);
    this->chunks[2].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 2);
    this->chunks[3].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 3);
  }

  simdutf_really_inline uint64_t to_bitmask() const {
    uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
    uint64_t r1 = this->chunks[1].to_bitmask();
    uint64_t r2 = this->chunks[2].to_bitmask();
    uint64_t r3 = this->chunks[3].to_bitmask();
    return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
  }

  simdutf_really_inline uint64_t lt(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] < mask, this->chunks[1] < mask,
                          this->chunks[2] < mask, this->chunks[3] < mask)
        .to_bitmask();
  }

  simdutf_really_inline uint64_t gt(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] > mask, this->chunks[1] > mask,
                          this->chunks[2] > mask, this->chunks[3] > mask)
        .to_bitmask();
  }
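  // Reinterprets each chunk as unsigned bytes so the comparison is unsigned
  // regardless of T.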
simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
    const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
    return simd8x64<bool>(simd8<uint8_t>(__m128i(this->chunks[0])) >= mask,
                          simd8<uint8_t>(__m128i(this->chunks[1])) >= mask,
                          simd8<uint8_t>(__m128i(this->chunks[2])) >= mask,
                          simd8<uint8_t>(__m128i(this->chunks[3])) >= mask)
        .to_bitmask();
  }
}; // struct simd8x64<T>

/* begin file src/simdutf/westmere/simd16-inl.h */
template <typename T> struct simd16;

template <typename T, typename Mask = simd16<bool>>
struct base16 : base<simd16<T>> {
  simdutf_really_inline base16() : base<simd16<T>>() {}

  simdutf_really_inline base16(const __m128i _value)
      : base<simd16<T>>(_value) {}

  friend simdutf_really_inline Mask operator==(const simd16<T> lhs,
                                               const simd16<T> rhs) {
    return _mm_cmpeq_epi16(lhs, rhs);
  }

  /// the size of vector in bytes
  static const int SIZE = sizeof(base<simd16<T>>::value);

  /// the number of elements of type T a vector can hold
  static const int ELEMENTS = SIZE / sizeof(T);
};

// SIMD byte mask type (returned by things like eq and gt)
template <> struct simd16<bool> : base16<bool> {
  static simdutf_really_inline simd16<bool> splat(bool _value) {
    return _mm_set1_epi16(uint16_t(-(!!_value)));
  }

  simdutf_really_inline simd16(const __m128i _value) : base16<bool>(_value) {}

  // Splat constructor
  simdutf_really_inline simd16(bool _value) : base16<bool>(splat(_value)) {}

  simdutf_really_inline int to_bitmask() const {
    return _mm_movemask_epi8(*this);
  }

  simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; }
};

template <typename T> struct base16_numeric : base16<T> {
  static simdutf_really_inline simd16<T> splat(T _value) {
    return _mm_set1_epi16(_value);
  }

  static simdutf_really_inline simd16<T> zero() { return _mm_setzero_si128(); }

  static simdutf_really_inline simd16<T> load(const T values[8]) {
    return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
  }

  simdutf_really_inline base16_numeric() : base16<T>() {}

  simdutf_really_inline base16_numeric(const __m128i _value)
      : base16<T>(_value) {}

  // Store to array
  simdutf_really_inline void store(T dst[8]) const {
    return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this);
  }

  // Override to distinguish from bool version
  simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFu; }

  // Addition/subtraction are the same for signed and unsigned
  simdutf_really_inline simd16<T> operator+(const simd16<T> other) const {
    return _mm_add_epi16(*this, other);
  }
  simdutf_really_inline simd16<T> &operator+=(const simd16<T> other) {
    *this = *this + other;
    return *static_cast<simd16<T> *>(this);
  }
};

// Unsigned code units
template <> struct simd16<uint16_t> : base16_numeric<uint16_t> {
  simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}

  simdutf_really_inline simd16(const __m128i _value)
      : base16_numeric<uint16_t>(_value) {}

  // Splat constructor
  simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}

  // Array constructor
  simdutf_really_inline simd16(const char16_t *values)
      : simd16(load(reinterpret_cast<const uint16_t *>(values))) {}

  // Order-specific operations
  simdutf_really_inline simd16<uint16_t>
  max_val(const simd16<uint16_t> other) const {
    return _mm_max_epu16(*this, other);
  }

  simdutf_really_inline simd16<uint16_t>
  min_val(const simd16<uint16_t> other) const {
    return _mm_min_epu16(*this, other);
  }

  simdutf_really_inline simd16<bool>
  operator<=(const simd16<uint16_t> other) const {
    return other.max_val(*this) == other;
  }
  simdutf_really_inline simd16<bool>
  operator>=(const simd16<uint16_t> other) const {
    return other.min_val(*this) == other;
  }

  // Bit-specific operations
  simdutf_really_inline simd16<bool> bits_not_set() const {
    return *this == uint16_t(0);
  }

  simdutf_really_inline simd16<bool> any_bits_set() const {
    return ~this->bits_not_set();
  }

  template <int N> simdutf_really_inline simd16<uint16_t> shr() const {
    return simd16<uint16_t>(_mm_srli_epi16(*this, N));
  }

  // Change the endianness
  simdutf_really_inline simd16<uint16_t> swap_bytes() const {
    const __m128i swap =
        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
    return _mm_shuffle_epi8(*this, swap);
  }

  // Pack with the unsigned saturation of two uint16_t code units into single
  // uint8_t vector
  static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t> &v0,
                                                   const simd16<uint16_t> &v1) {
    return _mm_packus_epi16(v0, v1);
  }

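  // Same widening reduction as the Haswell kernel: 16-bit lanes to 32-bit,
  // then to 64-bit, finished by two scalar extractions.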
simdutf_really_inline uint64_t sum() const {
    const auto lo_u16 = _mm_and_si128(value, _mm_set1_epi32(0x0000ffff));
    const auto hi_u16 = _mm_srli_epi32(value, 16);
    const auto sum_u32 = _mm_add_epi32(lo_u16, hi_u16);

    const auto lo_u32 = _mm_and_si128(sum_u32, _mm_set1_epi64x(0xffffffff));
    const auto hi_u32 = _mm_srli_epi64(sum_u32, 32);
    const auto sum_u64 = _mm_add_epi64(lo_u32, hi_u32);

    return uint64_t(_mm_extract_epi64(sum_u64, 0)) +
           uint64_t(_mm_extract_epi64(sum_u64, 1));
  }
};

template <typename T> struct simd16x32 {
  static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
  static_assert(NUM_CHUNKS == 4,
                "Westmere kernel should use four registers per 64-byte block.");
  simd16<T> chunks[NUM_CHUNKS];

  simd16x32(const simd16x32<T> &o) = delete; // no copy allowed
  simd16x32<T> &
  operator=(const simd16<T> other) = delete; // no assignment allowed
  simd16x32() = delete; // no default constructor allowed

  simdutf_really_inline
  simd16x32(const simd16<T> chunk0, const simd16<T> chunk1,
            const simd16<T> chunk2, const simd16<T> chunk3)
      : chunks{chunk0, chunk1, chunk2, chunk3} {}
  simdutf_really_inline simd16x32(const T *ptr)
      : chunks{simd16<T>::load(ptr),
               simd16<T>::load(ptr + sizeof(simd16<T>) / sizeof(T)),
               simd16<T>::load(ptr + 2 * sizeof(simd16<T>) / sizeof(T)),
               simd16<T>::load(ptr + 3 * sizeof(simd16<T>) / sizeof(T))} {}

  simdutf_really_inline void store(T *ptr) const {
    this->chunks[0].store(ptr + sizeof(simd16<T>) * 0 / sizeof(T));
    this->chunks[1].store(ptr + sizeof(simd16<T>) * 1 / sizeof(T));
    this->chunks[2].store(ptr + sizeof(simd16<T>) * 2 / sizeof(T));
    this->chunks[3].store(ptr + sizeof(simd16<T>) * 3 / sizeof(T));
  }

  simdutf_really_inline simd16<T> reduce_or() const {
    return (this->chunks[0] | this->chunks[1]) |
           (this->chunks[2] | this->chunks[3]);
  }

  simdutf_really_inline bool is_ascii() const {
    return this->reduce_or().is_ascii();
  }

  simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
    this->chunks[0].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 0);
    this->chunks[1].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 1);
    this->chunks[2].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 2);
    this->chunks[3].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 3);
  }

  simdutf_really_inline uint64_t to_bitmask() const {
    uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
    uint64_t r1 = this->chunks[1].to_bitmask();
    uint64_t r2 = this->chunks[2].to_bitmask();
    uint64_t r3 = this->chunks[3].to_bitmask();
    return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
  }

  simdutf_really_inline void swap_bytes() {
    this->chunks[0] = this->chunks[0].swap_bytes();
    this->chunks[1] = this->chunks[1].swap_bytes();
    this->chunks[2] = this->chunks[2].swap_bytes();
    this->chunks[3] = this->chunks[3].swap_bytes();
  }

  simdutf_really_inline uint64_t lteq(const T m) const {
    const simd16<T> mask = simd16<T>::splat(m);
    return simd16x32<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask,
                           this->chunks[2] <= mask, this->chunks[3] <= mask)
        .to_bitmask();
  }

  simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
    const simd16<T> mask_low = simd16<T>::splat(static_cast<T>(low - 1));
    const simd16<T> mask_high = simd16<T>::splat(static_cast<T>(high + 1));
    return simd16x32<bool>(
               (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
               (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low),
               (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low),
               (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low))
        .to_bitmask();
  }
}; // struct simd16x32<T>

simd16<uint16_t> min(const simd16<uint16_t> a, simd16<uint16_t> b) {
  return _mm_min_epu16(a.value, b.value);
}
/* end file src/simdutf/westmere/simd16-inl.h */
/* begin file src/simdutf/westmere/simd32-inl.h */
template <typename T> struct simd32;

template <> struct simd32<uint32_t> {
  static const size_t SIZE = sizeof(__m128i);
  static const size_t ELEMENTS = SIZE / sizeof(uint32_t);

  __m128i value;

  simdutf_really_inline simd32(const __m128i v) : value(v) {}

  template <typename Pointer>
  simdutf_really_inline simd32(const Pointer *ptr)
      : value(_mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr))) {}

  simdutf_really_inline uint64_t sum() const {
    return uint64_t(_mm_extract_epi32(value, 0)) +
           uint64_t(_mm_extract_epi32(value, 1)) +
           uint64_t(_mm_extract_epi32(value, 2)) +
           uint64_t(_mm_extract_epi32(value, 3));
  }

  simdutf_really_inline simd32<uint32_t> swap_bytes() const {
    const __m128i shuffle =
        _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 15, 14, 13, 12);

    return _mm_shuffle_epi8(value, shuffle);
  }

  template <int N> simdutf_really_inline simd32<uint32_t> shr() const {
    return _mm_srli_epi32(value, N);
  }

  template <int N> simdutf_really_inline simd32<uint32_t> shl() const {
    return _mm_slli_epi32(value, N);
  }

  void dump() const {
    printf("[%08x, %08x, %08x, %08x]\n", uint32_t(_mm_extract_epi32(value, 0)),
           uint32_t(_mm_extract_epi32(value, 1)),
           uint32_t(_mm_extract_epi32(value, 2)),
           uint32_t(_mm_extract_epi32(value, 3)));
  }

  // operators
  simdutf_really_inline simd32 &operator+=(const simd32 other) {
    value = _mm_add_epi32(value, other.value);
    return *this;
  }

  // static members
  simdutf_really_inline static simd32<uint32_t> zero() {
    return _mm_setzero_si128();
  }

  simdutf_really_inline static simd32<uint32_t> splat(uint32_t v) {
    return _mm_set1_epi32(v);
  }
};

//----------------------------------------------------------------------

template <> struct simd32<bool> {
  // static const size_t SIZE = sizeof(__m128i);
  // static const size_t ELEMENTS = SIZE / sizeof(uint32_t);

  __m128i value;

  simdutf_really_inline simd32(const __m128i v) : value(v) {}

  simdutf_really_inline bool any() const {
    return _mm_movemask_epi8(value) != 0;
  }

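  // Collapses each 32-bit lane to one bit (its sign bit) via movmskps.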
simdutf_really_inline uint8_t to_4bit_bitmask() const {
    return uint8_t(_mm_movemask_ps(_mm_castsi128_ps(value)));
  }
};

//----------------------------------------------------------------------

template <typename T>
simdutf_really_inline simd32<T> operator|(const simd32<T> a,
                                          const simd32<T> b) {
  return _mm_or_si128(a.value, b.value);
}

simdutf_really_inline simd32<uint32_t> min(const simd32<uint32_t> a,
                                           const simd32<uint32_t> b) {
  return _mm_min_epu32(a.value, b.value);
}

simdutf_really_inline simd32<uint32_t> max(const simd32<uint32_t> a,
                                           const simd32<uint32_t> b) {
  return _mm_max_epu32(a.value, b.value);
}

simdutf_really_inline simd32<bool> operator==(const simd32<uint32_t> a,
                                              uint32_t b) {
  return _mm_cmpeq_epi32(a.value, _mm_set1_epi32(b));
}

simdutf_really_inline simd32<uint32_t> operator&(const simd32<uint32_t> a,
                                                 const simd32<uint32_t> b) {
  return _mm_and_si128(a.value, b.value);
}

simdutf_really_inline simd32<uint32_t> operator&(const simd32<uint32_t> a,
                                                 uint32_t b) {
  return _mm_and_si128(a.value, _mm_set1_epi32(b));
}

simdutf_really_inline simd32<uint32_t> operator|(const simd32<uint32_t> a,
                                                 uint32_t b) {
  return _mm_or_si128(a.value, _mm_set1_epi32(b));
}

simdutf_really_inline simd32<uint32_t> operator+(const simd32<uint32_t> a,
                                                 const simd32<uint32_t> b) {
  return _mm_add_epi32(a.value, b.value);
}

simdutf_really_inline simd32<uint32_t> operator-(const simd32<uint32_t> a,
                                                 uint32_t b) {
  return _mm_sub_epi32(a.value, _mm_set1_epi32(b));
}

simdutf_really_inline simd32<bool> operator==(const simd32<uint32_t> a,
                                              const simd32<uint32_t> b) {
  return _mm_cmpeq_epi32(a.value, b.value);
}

simdutf_really_inline simd32<bool> operator>=(const simd32<uint32_t> a,
                                              const simd32<uint32_t> b) {
  return _mm_cmpeq_epi32(_mm_max_epu32(a.value, b.value), a.value);
}

simdutf_really_inline simd32<bool> operator!(const simd32<bool> v) {
  return _mm_xor_si128(v.value, _mm_set1_epi8(-1));
}

simdutf_really_inline simd32<bool> operator>(const simd32<uint32_t> a,
                                             const simd32<uint32_t> b) {
  return !(b >= a);
}

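// Branchless per-byte blend: where the mask is set, take v_true, otherwise
// v_false.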
simdutf_really_inline simd32<uint32_t> select(const simd32<bool> cond,
                                              const simd32<uint32_t> v_true,
                                              const simd32<uint32_t> v_false) {
  return _mm_blendv_epi8(v_false.value, v_true.value, cond.value);
}
/* end file src/simdutf/westmere/simd32-inl.h */
/* begin file src/simdutf/westmere/simd64-inl.h */
template <typename T> struct simd64;

template <> struct simd64<uint64_t> {
  // static const size_t SIZE = sizeof(__m128i);
  // static const size_t ELEMENTS = SIZE / sizeof(uint64_t);

  __m128i value;

  simdutf_really_inline simd64(const __m128i v) : value(v) {}

  template <typename Pointer>
  simdutf_really_inline simd64(const Pointer *ptr)
      : value(_mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr))) {}

  simdutf_really_inline uint64_t sum() const {
    return _mm_extract_epi64(value, 0) + _mm_extract_epi64(value, 1);
  }

  // operators
  simdutf_really_inline simd64 &operator+=(const simd64 other) {
    value = _mm_add_epi64(value, other.value);
    return *this;
  }

  // static members
  simdutf_really_inline static simd64<uint64_t> zero() {
    return _mm_setzero_si128();
  }

  simdutf_really_inline static simd64<uint64_t> splat(uint64_t v) {
    return _mm_set1_epi64x(v);
  }
};
/* end file src/simdutf/westmere/simd64-inl.h */

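// As in the Haswell kernel, _mm_sad_epu8 against zero sums each group of
// eight bytes into a 64-bit lane.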
simdutf_really_inline simd64<uint64_t> sum_8bytes(const simd8<uint8_t> v) {
|
|
return _mm_sad_epu8(v.value, simd8<uint8_t>::zero());
|
|
}
|
|
|
|
simdutf_really_inline simd8<uint8_t> as_vector_u8(const simd32<uint32_t> v) {
|
|
return simd8<uint8_t>(v.value);
|
|
}
|
|
|
|
} // namespace simd
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdutf
|
|
|
|
#endif // SIMDUTF_WESTMERE_SIMD_INPUT_H
|
|
/* end file src/simdutf/westmere/simd.h */
|
|
|
|
/* begin file src/simdutf/westmere/end.h */
|
|
#if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
|
|
// nothing needed.
|
|
#else
|
|
SIMDUTF_UNTARGET_REGION
|
|
#endif
|
|
|
|
#undef SIMDUTF_SIMD_HAS_BYTEMASK
|
|
/* end file src/simdutf/westmere/end.h */
|
|
|
|
#endif // SIMDUTF_IMPLEMENTATION_WESTMERE
|
|
#endif // SIMDUTF_WESTMERE_COMMON_H
|
|
/* end file src/simdutf/westmere.h */
|
|
/* begin file src/simdutf/ppc64.h */
|
|
#ifndef SIMDUTF_PPC64_H
|
|
#define SIMDUTF_PPC64_H
|
|
|
|
#ifdef SIMDUTF_FALLBACK_H
|
|
#error "ppc64.h must be included before fallback.h"
|
|
#endif
|
|
|
|
|
|
#ifndef SIMDUTF_IMPLEMENTATION_PPC64
|
|
#define SIMDUTF_IMPLEMENTATION_PPC64 (SIMDUTF_IS_PPC64)
|
|
#endif
|
|
#define SIMDUTF_CAN_ALWAYS_RUN_PPC64 \
|
|
SIMDUTF_IMPLEMENTATION_PPC64 &&SIMDUTF_IS_PPC64
|
|
|
|
|
|
#if SIMDUTF_IMPLEMENTATION_PPC64
|
|
|
|
namespace simdutf {
|
|
/**
|
|
* Implementation for ALTIVEC (PPC64).
|
|
*/
|
|
namespace ppc64 {} // namespace ppc64
|
|
} // namespace simdutf
|
|
|
|
/* begin file src/simdutf/ppc64/implementation.h */
|
|
#ifndef SIMDUTF_PPC64_IMPLEMENTATION_H
|
|
#define SIMDUTF_PPC64_IMPLEMENTATION_H
|
|
|
|
|
|
namespace simdutf {
|
|
namespace ppc64 {
|
|
|
|
namespace {
|
|
using namespace simdutf;
|
|
|
|
template <size_t N> simdutf_really_inline size_t align_down(size_t size) {
|
|
return N * (size / N);
|
|
}
} // namespace

class implementation final : public simdutf::implementation {
public:
  simdutf_really_inline implementation()
      : simdutf::implementation("ppc64", "PPC64 ALTIVEC",
                                internal::instruction_set::ALTIVEC) {}

#if SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused int detect_encodings(const char *input,
                                           size_t length) const noexcept final;
#endif // SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused bool validate_utf8(const char *buf,
                                         size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF8
  simdutf_warn_unused result
  validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_ASCII
  simdutf_warn_unused bool validate_ascii(const char *buf,
                                          size_t len) const noexcept final;
  simdutf_warn_unused result
  validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_ASCII

#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
                                            size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
                                            size_t len) const noexcept final;
  simdutf_warn_unused result validate_utf16le_with_errors(
      const char16_t *buf, size_t len) const noexcept final;
  simdutf_warn_unused result validate_utf16be_with_errors(
      const char16_t *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused bool validate_utf32(const char32_t *buf,
                                          size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused result validate_utf32_with_errors(
      const char32_t *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_latin1_to_utf8(
      const char *buf, size_t len, char *utf8_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_latin1_to_utf16le(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_latin1_to_utf16be(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_latin1_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
      const char *buf, size_t len, char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t convert_utf8_to_utf16le(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused size_t convert_utf8_to_utf16be(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t convert_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  convert_utf16le_to_latin1(const char16_t *buf, size_t len,
                            char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf16be_to_latin1(const char16_t *buf, size_t len,
                            char *latin1_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
      const char16_t *buf, size_t len,
      char *latin1_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
      const char16_t *buf, size_t len,
      char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len,
                                  char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len,
                                  char *latin1_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t convert_utf16le_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_utf16be_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t convert_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  convert_utf32_to_latin1(const char32_t *buf, size_t len,
                          char *latin1_output) const noexcept final;
  simdutf_warn_unused result
  convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
                                      char *latin1_output) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_latin1(const char32_t *buf, size_t len,
                                char *latin1_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  convert_utf32_to_utf16le(const char32_t *buf, size_t len,
                           char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf32_to_utf16be(const char32_t *buf, size_t len,
                           char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
      const char32_t *buf, size_t len,
      char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
      const char32_t *buf, size_t len,
      char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len,
                                 char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len,
                                 char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf16le_to_utf32(const char16_t *buf, size_t len,
                           char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf16be_to_utf32(const char16_t *buf, size_t len,
                           char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
      const char16_t *buf, size_t len,
      char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
      const char16_t *buf, size_t len,
      char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len,
                                 char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len,
                                 char32_t *utf32_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16
  void change_endianness_utf16(const char16_t *buf, size_t length,
                               char16_t *output) const noexcept final;
  simdutf_warn_unused size_t count_utf16le(const char16_t *buf,
                                           size_t length) const noexcept;
  simdutf_warn_unused size_t count_utf16be(const char16_t *buf,
                                           size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8
  simdutf_warn_unused size_t count_utf8(const char *buf,
                                        size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t
  utf8_length_from_utf16le(const char16_t *input, size_t length) const noexcept;
  simdutf_warn_unused size_t
  utf8_length_from_utf16be(const char16_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t utf32_length_from_utf16le(
      const char16_t *input, size_t length) const noexcept;
  simdutf_warn_unused size_t utf32_length_from_utf16be(
      const char16_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t
  utf16_length_from_utf8(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  utf8_length_from_utf32(const char32_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  utf16_length_from_utf32(const char32_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  utf32_length_from_utf8(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  latin1_length_from_utf8(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  utf8_length_from_latin1(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_BASE64
  simdutf_warn_unused size_t maximal_binary_length_from_base64(
      const char *input, size_t length) const noexcept;
  simdutf_warn_unused result base64_to_binary(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept;
  simdutf_warn_unused result
  base64_to_binary(const char16_t *input, size_t length, char *output,
                   base64_options options,
                   last_chunk_handling_options last_chunk_options =
                       last_chunk_handling_options::loose) const noexcept;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept;
  size_t binary_to_base64(const char *input, size_t length, char *output,
                          base64_options options) const noexcept;
#endif // SIMDUTF_FEATURE_BASE64
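  // A note on the defaults above: last_chunk_handling_options::loose accepts a
  // trailing partial base64 chunk and is lenient about padding, while the
  // stricter enumerators (strict, stop_before_partial) reject such input or
  // stop before the partial chunk; consult the simdutf documentation for the
  // exact semantics of each mode.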

#ifdef SIMDUTF_INTERNAL_TESTS
  virtual std::vector<TestProcedure> internal_tests() const override;
#endif
  void to_well_formed_utf16be(const char16_t *input, size_t len,
                              char16_t *output) const noexcept final;
  void to_well_formed_utf16le(const char16_t *input, size_t len,
                              char16_t *output) const noexcept final;
};

} // namespace ppc64
} // namespace simdutf

#endif // SIMDUTF_PPC64_IMPLEMENTATION_H
/* end file src/simdutf/ppc64/implementation.h */

/* begin file src/simdutf/ppc64/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "ppc64"
// #define SIMDUTF_IMPLEMENTATION ppc64
/* end file src/simdutf/ppc64/begin.h */

// Declarations
/* begin file src/simdutf/ppc64/intrinsics.h */
#ifndef SIMDUTF_PPC64_INTRINSICS_H
#define SIMDUTF_PPC64_INTRINSICS_H

// This should be the correct header whether
// you use visual studio or other compilers.
#include <altivec.h>

// These are defined by altivec.h in GCC toolchain, it is safe to undef them.
#ifdef bool
#undef bool
#endif

#ifdef vector
#undef vector
#endif

#endif // SIMDUTF_PPC64_INTRINSICS_H
/* end file src/simdutf/ppc64/intrinsics.h */
/* begin file src/simdutf/ppc64/bitmanipulation.h */
#ifndef SIMDUTF_PPC64_BITMANIPULATION_H
#define SIMDUTF_PPC64_BITMANIPULATION_H

namespace simdutf {
namespace ppc64 {
namespace {

#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
simdutf_really_inline int count_ones(uint64_t input_num) {
  // note: we do not support legacy 32-bit Windows
  return __popcnt64(input_num); // Visual Studio wants two underscores
}
#else
simdutf_really_inline int count_ones(uint64_t input_num) {
  return __builtin_popcountll(input_num);
}
#endif

#if SIMDUTF_NEED_TRAILING_ZEROES
simdutf_really_inline int trailing_zeroes(uint64_t input_num) {
  return __builtin_ctzll(input_num);
}
#endif

} // unnamed namespace
} // namespace ppc64
} // namespace simdutf

#endif // SIMDUTF_PPC64_BITMANIPULATION_H
/* end file src/simdutf/ppc64/bitmanipulation.h */
/* begin file src/simdutf/ppc64/simd.h */
#ifndef SIMDUTF_PPC64_SIMD_H
#define SIMDUTF_PPC64_SIMD_H

#include <type_traits>

namespace simdutf {
namespace ppc64 {
namespace {
namespace simd {

using vec_bool_t = __vector __bool char;
using vec_bool16_t = __vector __bool short;
using vec_bool32_t = __vector __bool int;
using vec_u8_t = __vector unsigned char;
using vec_i8_t = __vector signed char;
using vec_u16_t = __vector unsigned short;
using vec_i16_t = __vector signed short;
using vec_u32_t = __vector unsigned int;
using vec_i32_t = __vector signed int;
using vec_u64_t = __vector unsigned long long;
using vec_i64_t = __vector signed long long;

// clang-format off
template <typename T> struct vector_u8_type_for_element_aux {
  using type = typename std::conditional<std::is_same<T, bool>::value, vec_bool_t,
               typename std::conditional<std::is_same<T, uint8_t>::value, vec_u8_t,
               typename std::conditional<std::is_same<T, int8_t>::value, vec_i8_t, void>::type>::type>::type;

  static_assert(not std::is_same<type, void>::value,
                "accepted element types are 8 bit integers or bool");
};

template <typename T> struct vector_u16_type_for_element_aux {
  using type = typename std::conditional<std::is_same<T, bool>::value, vec_bool16_t,
               typename std::conditional<std::is_same<T, uint16_t>::value, vec_u16_t,
               typename std::conditional<std::is_same<T, int16_t>::value, vec_i16_t, void>::type>::type>::type;

  static_assert(not std::is_same<type, void>::value,
                "accepted element types are 16 bit integers or bool");
};

template <typename T> struct vector_u32_type_for_element_aux {
  using type = typename std::conditional<std::is_same<T, bool>::value, vec_bool32_t,
               typename std::conditional<std::is_same<T, uint32_t>::value, vec_u32_t,
               typename std::conditional<std::is_same<T, int32_t>::value, vec_i32_t, void>::type>::type>::type;

  static_assert(not std::is_same<type, void>::value,
                "accepted element types are 32 bit integers or bool");
};
// clang-format on

template <typename T>
using vector_u8_type_for_element =
    typename vector_u8_type_for_element_aux<T>::type;

template <typename T>
using vector_u16_type_for_element =
    typename vector_u16_type_for_element_aux<T>::type;

template <typename T>
using vector_u32_type_for_element =
    typename vector_u32_type_for_element_aux<T>::type;

template <typename T> uint16_t move_mask_u8(T vec) {
  const vec_u8_t perm_mask = {15 * 8, 14 * 8, 13 * 8, 12 * 8, 11 * 8, 10 * 8,
                              9 * 8,  8 * 8,  7 * 8,  6 * 8,  5 * 8,  4 * 8,
                              3 * 8,  2 * 8,  1 * 8,  0 * 8};

  const auto result = (vec_u64_t)vec_vbpermq((vec_u8_t)vec, perm_mask);
#if SIMDUTF_IS_BIG_ENDIAN
  return static_cast<uint16_t>(result[0]);
#else
  return static_cast<uint16_t>(result[1]);
#endif
}
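// move_mask_u8 above emulates the x86 movemask operation. Each perm_mask entry
// is a bit index into the 128-bit input (bit 0 being the most significant bit
// of byte 0), so the indices 15*8 down to 0*8 gather the most significant bit
// of every byte, and the reversed order makes bit k of the mask correspond to
// byte k. vec_vbpermq leaves the 16 gathered bits in the low bits of one
// 64-bit lane; which array index that lane carries depends on the byte order,
// hence the #if above.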

/* begin file src/simdutf/ppc64/simd8-inl.h */
// file included directly

template <typename T> struct base8 {
  using vector_type = vector_u8_type_for_element<T>;
  vector_type value;
  static const int SIZE = sizeof(vector_type);
  static const int ELEMENTS = sizeof(vector_type) / sizeof(T);

  // Zero constructor
  simdutf_really_inline base8() : value{vec_splats(T(0))} {}

  // Conversion from SIMD register
  simdutf_really_inline base8(const vector_type _value) : value{_value} {}

  // Splat scalar
  simdutf_really_inline base8(T v) : value{vec_splats(v)} {}

  // Conversion to SIMD register
  simdutf_really_inline operator const vector_type &() const {
    return this->value;
  }

  template <typename U> simdutf_really_inline void store(U *ptr) const {
    vec_xst(value, 0, reinterpret_cast<T *>(ptr));
  }

  template <typename SIMD8> void operator|=(const SIMD8 other) {
    this->value = vec_or(this->value, other.value);
  }

  template <int N = 1> vector_type prev_aux(vector_type prev_chunk) const {
    vector_type chunk = this->value;
#if !SIMDUTF_IS_BIG_ENDIAN
    chunk = (vector_type)vec_reve(this->value);
    prev_chunk = (vector_type)vec_reve((vector_type)prev_chunk);
#endif
    chunk = (vector_type)vec_sld((vector_type)prev_chunk, (vector_type)chunk,
                                 16 - N);
#if !SIMDUTF_IS_BIG_ENDIAN
    chunk = (vector_type)vec_reve((vector_type)chunk);
#endif
    return chunk;
  }
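  // prev_aux<N> above produces the window shifted back by N bytes: the last N
  // bytes of prev_chunk followed by the first 16 - N bytes of this chunk.
  // vec_sld selects bytes from the concatenation using big-endian ordering, so
  // on little-endian targets both inputs and the result are byte-reversed with
  // vec_reve; the UTF-8 validator relies on such windows to inspect the bytes
  // preceding each position.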

  simdutf_really_inline bool is_ascii() const {
    return move_mask_u8(this->value) == 0;
  }

  simdutf_really_inline uint16_t to_bitmask() const {
    return move_mask_u8(value);
  }

  template <endianness big_endian>
  simdutf_really_inline void store_bytes_as_utf16(char16_t *p) const {
    const vector_type zero = vec_splats(T(0));

    if (big_endian) {
      const vec_u8_t perm_lo = {16, 0, 16, 1, 16, 2, 16, 3,
                                16, 4, 16, 5, 16, 6, 16, 7};
      const vec_u8_t perm_hi = {16, 8,  16, 9,  16, 10, 16, 11,
                                16, 12, 16, 13, 16, 14, 16, 15};

      const vector_type v0 = vec_perm(value, zero, perm_lo);
      const vector_type v1 = vec_perm(value, zero, perm_hi);

#if defined(__clang__)
      vec_xst(v0, 0, reinterpret_cast<T *>(p));
      vec_xst(v1, 16, reinterpret_cast<T *>(p));
#else
      vec_xst(v0, 0, reinterpret_cast<vector_type *>(p));
      vec_xst(v1, 16, reinterpret_cast<vector_type *>(p));
#endif // defined(__clang__)
    } else {
      const vec_u8_t perm_lo = {0, 16, 1, 16, 2, 16, 3, 16,
                                4, 16, 5, 16, 6, 16, 7, 16};
      const vec_u8_t perm_hi = {8,  16, 9,  16, 10, 16, 11, 16,
                                12, 16, 13, 16, 14, 16, 15, 16};

      const vector_type v0 = vec_perm(value, zero, perm_lo);
      const vector_type v1 = vec_perm(value, zero, perm_hi);

#if defined(__clang__)
      vec_xst(v0, 0, reinterpret_cast<T *>(p));
      vec_xst(v1, 16, reinterpret_cast<T *>(p));
#else
      vec_xst(v0, 0, reinterpret_cast<vector_type *>(p));
      vec_xst(v1, 16, reinterpret_cast<vector_type *>(p));
#endif // defined(__clang__)
    }
  }

  template <endianness big_endian>
  simdutf_really_inline void store_ascii_as_utf16(char16_t *p) const {
    store_bytes_as_utf16<big_endian>(p);
  }

  simdutf_really_inline void store_bytes_as_utf32(char32_t *p) const {
    const vector_type zero = vec_splats(T(0));

#if SIMDUTF_IS_BIG_ENDIAN
    const vec_u8_t perm0 = {16, 16, 16, 0, 16, 16, 16, 1,
                            16, 16, 16, 2, 16, 16, 16, 3};

    const vec_u8_t perm1 = {16, 16, 16, 4, 16, 16, 16, 5,
                            16, 16, 16, 6, 16, 16, 16, 7};

    const vec_u8_t perm2 = {16, 16, 16, 8,  16, 16, 16, 9,
                            16, 16, 16, 10, 16, 16, 16, 11};

    const vec_u8_t perm3 = {16, 16, 16, 12, 16, 16, 16, 13,
                            16, 16, 16, 14, 16, 16, 16, 15};
#else
    const vec_u8_t perm0 = {0, 16, 16, 16, 1, 16, 16, 16,
                            2, 16, 16, 16, 3, 16, 16, 16};

    const vec_u8_t perm1 = {4, 16, 16, 16, 5, 16, 16, 16,
                            6, 16, 16, 16, 7, 16, 16, 16};

    const vec_u8_t perm2 = {8,  16, 16, 16, 9,  16, 16, 16,
                            10, 16, 16, 16, 11, 16, 16, 16};

    const vec_u8_t perm3 = {12, 16, 16, 16, 13, 16, 16, 16,
                            14, 16, 16, 16, 15, 16, 16, 16};
#endif // SIMDUTF_IS_BIG_ENDIAN

    const vector_type v0 = vec_perm(value, zero, perm0);
    const vector_type v1 = vec_perm(value, zero, perm1);
    const vector_type v2 = vec_perm(value, zero, perm2);
    const vector_type v3 = vec_perm(value, zero, perm3);

    constexpr size_t n = base8<T>::SIZE;

#if defined(__clang__)
    vec_xst(v0, 0 * n, reinterpret_cast<T *>(p));
    vec_xst(v1, 1 * n, reinterpret_cast<T *>(p));
    vec_xst(v2, 2 * n, reinterpret_cast<T *>(p));
    vec_xst(v3, 3 * n, reinterpret_cast<T *>(p));
#else
    vec_xst(v0, 0 * n, reinterpret_cast<vector_type *>(p));
    vec_xst(v1, 1 * n, reinterpret_cast<vector_type *>(p));
    vec_xst(v2, 2 * n, reinterpret_cast<vector_type *>(p));
    vec_xst(v3, 3 * n, reinterpret_cast<vector_type *>(p));
#endif // defined(__clang__)
  }
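  // In store_bytes_as_utf32 above, vec_perm indices 0-15 select from `value`
  // and 16-31 select from `zero`, so each permutation interleaves one data
  // byte with three zero bytes. That zero-extends sixteen bytes to sixteen
  // 32-bit code units, emitted as four 16-byte stores.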

  simdutf_really_inline void store_words_as_utf32(char32_t *p) const {
    const vector_type zero = vec_splats(T(0));

#if SIMDUTF_IS_BIG_ENDIAN
    const vec_u8_t perm0 = {16, 16, 0, 1, 16, 16, 2, 3,
                            16, 16, 4, 5, 16, 16, 6, 7};
    const vec_u8_t perm1 = {16, 16, 8,  9,  16, 16, 10, 11,
                            16, 16, 12, 13, 16, 16, 14, 15};
#else
    const vec_u8_t perm0 = {0, 1, 16, 16, 2, 3, 16, 16,
                            4, 5, 16, 16, 6, 7, 16, 16};
    const vec_u8_t perm1 = {8,  9,  16, 16, 10, 11, 16, 16,
                            12, 13, 16, 16, 14, 15, 16, 16};
#endif // SIMDUTF_IS_BIG_ENDIAN

    const vector_type v0 = vec_perm(value, zero, perm0);
    const vector_type v1 = vec_perm(value, zero, perm1);

    constexpr size_t n = base8<T>::SIZE;

#if defined(__clang__)
    vec_xst(v0, 0 * n, reinterpret_cast<T *>(p));
    vec_xst(v1, 1 * n, reinterpret_cast<T *>(p));
#else
    vec_xst(v0, 0 * n, reinterpret_cast<vector_type *>(p));
    vec_xst(v1, 1 * n, reinterpret_cast<vector_type *>(p));
#endif // defined(__clang__)
  }

  simdutf_really_inline void store_ascii_as_utf32(char32_t *p) const {
    store_bytes_as_utf32(p);
  }
};

// Forward declaration
template <typename T> struct simd8;

template <typename T>
simd8<bool> operator==(const simd8<T> a, const simd8<T> b);

template <typename T>
simd8<bool> operator!=(const simd8<T> a, const simd8<T> b);

template <typename T> simd8<T> operator&(const simd8<T> a, const simd8<T> b);

template <typename T> simd8<T> operator|(const simd8<T> a, const simd8<T> b);

template <typename T> simd8<T> operator^(const simd8<T> a, const simd8<T> b);

template <typename T> simd8<T> operator+(const simd8<T> a, const simd8<T> b);

template <typename T> simd8<bool> operator<(const simd8<T> a, const simd8<T> b);

// SIMD byte mask type (returned by things like eq and gt)
template <> struct simd8<bool> : base8<bool> {
  using super = base8<bool>;

  static simdutf_really_inline simd8<bool> splat(bool _value) {
    return (vector_type)vec_splats((unsigned char)(-(!!_value)));
  }

  simdutf_really_inline simd8() : super(vector_type()) {}
  simdutf_really_inline simd8(const vector_type _value) : super(_value) {}
  // Splat constructor
  simdutf_really_inline simd8(bool _value) : base8<bool>(splat(_value)) {}

  template <typename T>
  simdutf_really_inline simd8(simd8<T> other)
      : simd8(vector_type(other.value)) {}

  simdutf_really_inline uint16_t to_bitmask() const {
    return move_mask_u8(value);
  }

  simdutf_really_inline bool any() const {
    return !vec_all_eq(this->value, (vector_type)vec_splats(0));
  }

  simdutf_really_inline bool all() const { return to_bitmask() == 0xffff; }

  simdutf_really_inline simd8<bool> operator~() const {
    return this->value ^ (vector_type)splat(true);
  }
};

template <typename T> struct base8_numeric : base8<T> {
  using super = base8<T>;
  using vector_type = typename super::vector_type;

  static simdutf_really_inline simd8<T> splat(T value) {
    return (vector_type)vec_splats(value);
  }

  static simdutf_really_inline simd8<T> zero() { return splat(0); }

  template <typename U>
  static simdutf_really_inline simd8<T> load(const U *values) {
    return vec_xl(0, reinterpret_cast<const T *>(values));
  }

  // Repeat 16 values as many times as necessary (usually for lookup tables)
  static simdutf_really_inline simd8<T> repeat_16(T v0, T v1, T v2, T v3, T v4,
                                                  T v5, T v6, T v7, T v8, T v9,
                                                  T v10, T v11, T v12, T v13,
                                                  T v14, T v15) {
    return simd8<T>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
                    v14, v15);
  }

  simdutf_really_inline base8_numeric() : base8<T>() {}
  simdutf_really_inline base8_numeric(const vector_type _value)
      : base8<T>(_value) {}

  // Override to distinguish from bool version
  simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }

  simdutf_really_inline simd8<T> &operator-=(const simd8<T> other) {
    this->value = vec_sub(this->value, other.value);
    return *static_cast<simd8<T> *>(this);
  }

  // Perform a lookup assuming the value is between 0 and 16 (undefined
  // behavior for out of range values)
  template <typename L>
  simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
    return (vector_type)vec_perm((vector_type)lookup_table,
                                 (vector_type)lookup_table, this->value);
  }
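  // vec_perm consumes 5-bit indices over the 32 bytes of its two operands, so
  // passing the same table twice makes an index i select table[i & 15]:
  // out-of-range values wrap instead of producing zero the way x86's pshufb
  // does, which is why callers are expected to keep indices in range.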

  template <typename L>
  simdutf_really_inline simd8<L>
  lookup_32(const simd8<L> lookup_table_lo,
            const simd8<L> lookup_table_hi) const {
    return (vector_type)vec_perm(lookup_table_lo.value, lookup_table_hi.value,
                                 this->value);
  }

  template <typename L>
  simdutf_really_inline simd8<L>
  lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
            L replace5, L replace6, L replace7, L replace8, L replace9,
            L replace10, L replace11, L replace12, L replace13, L replace14,
            L replace15) const {
    return lookup_16(simd8<L>::repeat_16(
        replace0, replace1, replace2, replace3, replace4, replace5, replace6,
        replace7, replace8, replace9, replace10, replace11, replace12,
        replace13, replace14, replace15));
  }
};

// Unsigned bytes
template <> struct simd8<uint8_t> : base8_numeric<uint8_t> {
  using Self = simd8<uint8_t>;

  simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
  simdutf_really_inline simd8(const vector_type _value)
      : base8_numeric<uint8_t>(_value) {}
  // Splat constructor
  simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
  // Array constructor
  simdutf_really_inline simd8(const uint8_t *values) : simd8(load(values)) {}
  // Member-by-member initialization
  simdutf_really_inline
  simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
        uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
        uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
      : simd8((vector_type){v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
                            v12, v13, v14, v15}) {}
  // Repeat 16 values as many times as necessary (usually for lookup tables)
  simdutf_really_inline static simd8<uint8_t>
  repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
            uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9,
            uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14,
            uint8_t v15) {
    return simd8<uint8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
                          v12, v13, v14, v15);
  }

  simdutf_really_inline bool is_ascii() const {
    return move_mask_u8(this->value) == 0;
  }

  template <typename T>
  simdutf_really_inline simd8(simd8<T> other)
      : simd8(vector_type(other.value)) {}

  template <int N>
  simdutf_really_inline Self prev(const Self prev_chunk) const {
    return prev_aux<N>(prev_chunk.value);
  }

  // Saturated math
  simdutf_really_inline simd8<uint8_t>
  saturating_sub(const simd8<uint8_t> other) const {
    return (vector_type)vec_subs(this->value, (vector_type)other);
  }

  // Same as >, but only guarantees true is nonzero (> guarantees true = -1)
  simdutf_really_inline simd8<uint8_t>
  gt_bits(const simd8<uint8_t> other) const {
    return this->saturating_sub(other);
  }

  // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
  simdutf_really_inline simd8<uint8_t>
  lt_bits(const simd8<uint8_t> other) const {
    return other.saturating_sub(*this);
  }

  // Bit-specific operations
  simdutf_really_inline bool bits_not_set_anywhere() const {
    return vec_all_eq(this->value, (vector_type)vec_splats(0));
  }

  simdutf_really_inline bool any_bits_set_anywhere() const {
    return !bits_not_set_anywhere();
  }

  template <int N> simdutf_really_inline simd8<uint8_t> shr() const {
    return simd8<uint8_t>(
        (vector_type)vec_sr(this->value, (vector_type)vec_splat_u8(N)));
  }

  template <int N> simdutf_really_inline simd8<uint8_t> shl() const {
    return simd8<uint8_t>(
        (vector_type)vec_sl(this->value, (vector_type)vec_splat_u8(N)));
  }

  void dump() const {
    uint8_t tmp[16];
    store(tmp);
    for (int i = 0; i < 16; i++) {
      if (i == 0) {
        printf("[%02x", tmp[i]);
      } else if (i == 15) {
        printf(" %02x]", tmp[i]);
      } else {
        printf(" %02x", tmp[i]);
      }
    }
    putchar('\n');
  }

  void dump_ascii() const {
    uint8_t tmp[16];
    store(tmp);
    for (int i = 0; i < 16; i++) {
      if (i == 0) {
        printf("[%c", tmp[i]);
      } else if (i == 15) {
        printf("%c]", tmp[i]);
      } else {
        printf("%c", tmp[i]);
      }
    }
    putchar('\n');
  }
};

// Signed bytes
template <> struct simd8<int8_t> : base8_numeric<int8_t> {
  simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
  simdutf_really_inline simd8(const vector_type _value)
      : base8_numeric<int8_t>(_value) {}

  template <typename T>
  simdutf_really_inline simd8(simd8<T> other)
      : simd8(vector_type(other.value)) {}

  // Splat constructor
  simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
  // Array constructor
  simdutf_really_inline simd8(const int8_t *values) : simd8(load(values)) {}

  simdutf_really_inline operator simd8<uint8_t>() const;

  // Saturated math
  simdutf_really_inline simd8<int8_t>
  saturating_add(const simd8<int8_t> other) const {
    return (vector_type)vec_adds(this->value, other.value);
  }

  void dump() const {
    int8_t tmp[16];
    store(tmp);
    for (int i = 0; i < 16; i++) {
      if (i == 0) {
        printf("[%02x", tmp[i]);
      } else if (i == 15) {
        printf("%02x]", tmp[i]);
      } else {
        printf("%02x", tmp[i]);
      }
    }
    putchar('\n');
  }
};

template <typename T>
simd8<bool> operator==(const simd8<T> a, const simd8<T> b) {
  return vec_cmpeq(a.value, b.value);
}

template <typename T>
simd8<bool> operator!=(const simd8<T> a, const simd8<T> b) {
  return vec_cmpne(a.value, b.value);
}

template <typename T> simd8<T> operator&(const simd8<T> a, const simd8<T> b) {
  return vec_and(a.value, b.value);
}

template <typename T, typename U> simd8<T> operator&(const simd8<T> a, U b) {
  return vec_and(a.value, vec_splats(T(b)));
}

template <typename T> simd8<T> operator|(const simd8<T> a, const simd8<T> b) {
  return vec_or(a.value, b.value);
}

template <typename T> simd8<T> operator^(const simd8<T> a, const simd8<T> b) {
  return vec_xor(a.value, b.value);
}

template <typename T, typename U> simd8<T> operator^(const simd8<T> a, U b) {
  return vec_xor(a.value, vec_splats(T(b)));
}

template <typename T> simd8<T> operator+(const simd8<T> a, const simd8<T> b) {
  return vec_add(a.value, b.value);
}

template <typename T, typename U> simd8<T> operator+(const simd8<T> a, U b) {
  return vec_add(a.value, vec_splats(T(b)));
}

simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const {
  return (simd8<uint8_t>::vector_type)value;
}

template <typename T>
simd8<bool> operator<(const simd8<T> a, const simd8<T> b) {
  return vec_cmplt(a.value, b.value);
}

template <typename T>
simd8<bool> operator>(const simd8<T> a, const simd8<T> b) {
  return vec_cmpgt(a.value, b.value);
}

template <typename T>
simd8<bool> operator>=(const simd8<T> a, const simd8<T> b) {
  return vec_cmpge(a.value, b.value);
}

template <typename T> struct simd8x64 {
  static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
  static constexpr size_t ELEMENTS = simd8<T>::ELEMENTS;

  static_assert(NUM_CHUNKS == 4,
                "PPC64 kernel should use four registers per 64-byte block.");
  simd8<T> chunks[NUM_CHUNKS];

  simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
  simd8x64<T> &
  operator=(const simd8<T> other) = delete; // no assignment allowed
  simd8x64() = delete;                      // no default constructor allowed
  simd8x64(simd8x64<T> &&) = default;

  simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1,
                                 const simd8<T> chunk2, const simd8<T> chunk3)
      : chunks{chunk0, chunk1, chunk2, chunk3} {}
  simdutf_really_inline simd8x64(const T *ptr)
      : chunks{simd8<T>::load(ptr),
               simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T)),
               simd8<T>::load(ptr + 2 * sizeof(simd8<T>) / sizeof(T)),
               simd8<T>::load(ptr + 3 * sizeof(simd8<T>) / sizeof(T))} {}

  simdutf_really_inline void store(T *ptr) const {
    this->chunks[0].store(ptr + ELEMENTS * 0);
    this->chunks[1].store(ptr + ELEMENTS * 1);
    this->chunks[2].store(ptr + ELEMENTS * 2);
    this->chunks[3].store(ptr + ELEMENTS * 3);
  }

  simdutf_really_inline simd8x64<T> &operator|=(const simd8x64<T> &other) {
    this->chunks[0] |= other.chunks[0];
    this->chunks[1] |= other.chunks[1];
    this->chunks[2] |= other.chunks[2];
    this->chunks[3] |= other.chunks[3];
    return *this;
  }

  simdutf_really_inline simd8<T> reduce_or() const {
    return (this->chunks[0] | this->chunks[1]) |
           (this->chunks[2] | this->chunks[3]);
  }

  simdutf_really_inline bool is_ascii() const {
    return this->reduce_or().is_ascii();
  }

  template <endianness endian>
  simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
    this->chunks[0].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 0);
    this->chunks[1].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 1);
    this->chunks[2].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 2);
    this->chunks[3].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 3);
  }

  simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const {
    this->chunks[0].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 0);
    this->chunks[1].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 1);
    this->chunks[2].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 2);
    this->chunks[3].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 3);
  }

  simdutf_really_inline uint64_t to_bitmask() const {
    uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
    uint64_t r1 = this->chunks[1].to_bitmask();
    uint64_t r2 = this->chunks[2].to_bitmask();
    uint64_t r3 = this->chunks[3].to_bitmask();
    return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
  }
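  // Each chunk yields a 16-bit mask with one bit per byte, so the four masks
  // above concatenate into a 64-bit mask covering the whole 64-byte block; the
  // uint32_t cast on r0 is merely defensive before the widening shifts.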

  simdutf_really_inline uint64_t lt(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] < mask, this->chunks[1] < mask,
                          this->chunks[2] < mask, this->chunks[3] < mask)
        .to_bitmask();
  }

  simdutf_really_inline uint64_t gt(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] > mask, this->chunks[1] > mask,
                          this->chunks[2] > mask, this->chunks[3] > mask)
        .to_bitmask();
  }

  simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
    const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
    return simd8x64<bool>(simd8<uint8_t>(this->chunks[0]) >= mask,
                          simd8<uint8_t>(this->chunks[1]) >= mask,
                          simd8<uint8_t>(this->chunks[2]) >= mask,
                          simd8<uint8_t>(this->chunks[3]) >= mask)
        .to_bitmask();
  }

  void dump() const {
    puts("");
    for (int i = 0; i < 4; i++) {
      printf("chunk[%d] = ", i);
      this->chunks[i].dump();
    }
  }
}; // struct simd8x64<T>

simdutf_really_inline simd8<uint8_t> avg(const simd8<uint8_t> a,
                                         const simd8<uint8_t> b) {
  return vec_avg(a.value, b.value);
}
/* end file src/simdutf/ppc64/simd8-inl.h */
/* begin file src/simdutf/ppc64/simd16-inl.h */
// file included directly

template <typename T> struct simd16;

template <typename T> struct base16 {
  using vector_type = vector_u16_type_for_element<T>;
  static const int SIZE = sizeof(vector_type);
  static const int ELEMENTS = sizeof(vector_type) / sizeof(T);

  vector_type value;

  // Zero constructor
  simdutf_really_inline base16() : value{vector_type()} {}

  // Conversion from SIMD register
  simdutf_really_inline base16(const vector_type _value) : value{_value} {}

  void dump() const {
    uint16_t tmp[8];
    vec_xst(value, 0, reinterpret_cast<vector_type *>(tmp));
    for (int i = 0; i < 8; i++) {
      if (i == 0) {
        printf("[%04x", tmp[i]);
      } else if (i == 8 - 1) {
        printf(" %04x]", tmp[i]);
      } else {
        printf(" %04x", tmp[i]);
      }
    }
    putchar('\n');
  }
};

// Forward declaration
template <typename> struct simd16;

template <typename T>
simd16<bool> operator==(const simd16<T> a, const simd16<T> b);

template <typename T, typename U>
simd16<bool> operator==(const simd16<T> a, U b);

template <typename T> simd16<T> operator&(const simd16<T> a, const simd16<T> b);

template <typename T> simd16<T> operator|(const simd16<T> a, const simd16<T> b);

template <typename T, typename U> simd16<T> operator|(const simd16<T> a, U b);

template <typename T, typename U> simd16<T> operator^(const simd16<T> a, U b);

// SIMD byte mask type (returned by things like eq and gt)
template <> struct simd16<bool> : base16<bool> {
  static simdutf_really_inline simd16<bool> splat(bool _value) {
    return (vector_type)vec_splats(uint16_t(-(!!_value)));
  }

  simdutf_really_inline simd16() : base16() {}

  simdutf_really_inline simd16(const vector_type _value)
      : base16<bool>(_value) {}

  // Splat constructor
  simdutf_really_inline simd16(bool _value) : base16<bool>(splat(_value)) {}

  simdutf_really_inline uint16_t to_bitmask() const {
    return move_mask_u8(value);
  }

  simdutf_really_inline bool any() const {
    const auto tmp = vec_u64_t(value);

    return tmp[0] || tmp[1]; // Note: logical or, not binary one
  }

  simdutf_really_inline bool is_zero() const {
    const auto tmp = vec_u64_t(value);

    return (tmp[0] | tmp[1]) == 0;
  }

  simdutf_really_inline simd16<bool> &operator|=(const simd16<bool> rhs) {
    value = vec_or(this->value, rhs.value);
    return *this;
  }
};

template <typename T> struct base16_numeric : base16<T> {
  using vector_type = typename base16<T>::vector_type;

  static simdutf_really_inline simd16<T> splat(T _value) {
    return vec_splats(_value);
  }

  static simdutf_really_inline simd16<T> zero() { return splat(0); }

  template <typename U>
  static simdutf_really_inline simd16<T> load(const U *ptr) {
    return vec_xl(0, reinterpret_cast<const T *>(ptr));
  }

  simdutf_really_inline base16_numeric() : base16<T>() {}
  simdutf_really_inline base16_numeric(const vector_type _value)
      : base16<T>(_value) {}

  // Store to array
  template <typename U> simdutf_really_inline void store(U *dst) const {
#if defined(__clang__)
    return vec_xst(this->value, 0, reinterpret_cast<T *>(dst));
#else
    return vec_xst(this->value, 0, reinterpret_cast<vector_type *>(dst));
#endif // defined(__clang__)
  }

  // Override to distinguish from bool version
  simdutf_really_inline simd16<T> operator~() const {
    return vec_xor(this->value, vec_splats(T(0xffff)));
  }
};

// Signed code units
template <> struct simd16<int16_t> : base16_numeric<int16_t> {
  simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
  simdutf_really_inline simd16(const vector_type _value)
      : base16_numeric<int16_t>(_value) {}
  // Splat constructor
  simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
  // Conversion to the unsigned type
  simdutf_really_inline operator simd16<uint16_t>() const;
};

// Unsigned code units
template <> struct simd16<uint16_t> : base16_numeric<uint16_t> {
  simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
  simdutf_really_inline simd16(const vector_type _value)
      : base16_numeric<uint16_t>(_value) {}

  // Splat constructor
  simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}

  // Array constructor
  simdutf_really_inline simd16(const char16_t *values)
      : simd16(load(reinterpret_cast<const uint16_t *>(values))) {}

  simdutf_really_inline bool is_ascii() const {
    return vec_all_lt(value, vec_splats(uint16_t(128)));
  }

  // Order-specific operations
  simdutf_really_inline simd16<uint16_t>
  max_val(const simd16<uint16_t> other) const {
    return vec_max(this->value, other.value);
  }
  simdutf_really_inline simd16<uint16_t>
  min_val(const simd16<uint16_t> other) const {
    return vec_min(this->value, other.value);
  }
  // Lane-wise comparisons; each true lane is all ones
  simdutf_really_inline simd16<bool>
  operator<=(const simd16<uint16_t> other) const {
    return other.max_val(*this) == other;
  }

  simdutf_really_inline simd16<bool>
  operator>=(const simd16<uint16_t> other) const {
    return other.min_val(*this) == other;
  }

  simdutf_really_inline simd16<bool>
  operator<(const simd16<uint16_t> other) const {
    return vec_cmplt(value, other.value);
  }

  // Bit-specific operations
  template <int N> simdutf_really_inline simd16<uint16_t> shr() const {
    return vec_sr(value, vec_splats(uint16_t(N)));
  }

  template <int N> simdutf_really_inline simd16<uint16_t> shl() const {
    return vec_sl(value, vec_splats(uint16_t(N)));
  }

  // Change the endianness
  simdutf_really_inline simd16<uint16_t> swap_bytes() const {
    return vec_revb(value);
  }

  // Pack with the unsigned saturation of two uint16_t code units into single
  // uint8_t vector
  static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t> &v0,
                                                   const simd16<uint16_t> &v1) {
    return vec_packs(v0.value, v1.value);
  }
};
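// The pack() above narrows two vectors of 16-bit code units into one vector of
// sixteen bytes; vec_packs saturates rather than truncates, so values above
// 0xFF clamp to 0xFF instead of aliasing an unrelated byte value.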

template <typename T>
simd16<bool> operator==(const simd16<T> a, const simd16<T> b) {
  return vec_cmpeq(a.value, b.value);
}

template <typename T, typename U>
simd16<bool> operator==(const simd16<T> a, U b) {
  return vec_cmpeq(a.value, vec_splats(T(b)));
}

template <typename T>
simd16<T> operator&(const simd16<T> a, const simd16<T> b) {
  return vec_and(a.value, b.value);
}

template <typename T, typename U> simd16<T> operator&(const simd16<T> a, U b) {
  return vec_and(a.value, vec_splats(T(b)));
}

template <typename T>
simd16<T> operator|(const simd16<T> a, const simd16<T> b) {
  return vec_or(a.value, b.value);
}

template <typename T, typename U> simd16<T> operator|(const simd16<T> a, U b) {
  return vec_or(a.value, vec_splats(T(b)));
}

template <typename T>
simd16<T> operator^(const simd16<T> a, const simd16<T> b) {
  return vec_xor(a.value, b.value);
}

template <typename T, typename U> simd16<T> operator^(const simd16<T> a, U b) {
  return vec_xor(a.value, vec_splats(T(b)));
}

simdutf_really_inline simd16<int16_t>::operator simd16<uint16_t>() const {
  return (vec_u16_t)(value);
}

template <typename T> struct simd16x32 {
  static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
  static_assert(NUM_CHUNKS == 4,
                "AltiVec kernel should use four registers per 64-byte block.");
  simd16<T> chunks[NUM_CHUNKS];

  simd16x32(const simd16x32<T> &o) = delete; // no copy allowed
  simd16x32<T> &
  operator=(const simd16<T> other) = delete; // no assignment allowed
  simd16x32() = delete;                      // no default constructor allowed

  simdutf_really_inline
  simd16x32(const simd16<T> chunk0, const simd16<T> chunk1,
            const simd16<T> chunk2, const simd16<T> chunk3)
      : chunks{chunk0, chunk1, chunk2, chunk3} {}
  simdutf_really_inline simd16x32(const T *ptr)
      : chunks{simd16<T>::load(ptr),
               simd16<T>::load(ptr + sizeof(simd16<T>) / sizeof(T)),
               simd16<T>::load(ptr + 2 * sizeof(simd16<T>) / sizeof(T)),
               simd16<T>::load(ptr + 3 * sizeof(simd16<T>) / sizeof(T))} {}

  simdutf_really_inline void store(T *ptr) const {
    this->chunks[0].store(ptr + sizeof(simd16<T>) * 0 / sizeof(T));
    this->chunks[1].store(ptr + sizeof(simd16<T>) * 1 / sizeof(T));
    this->chunks[2].store(ptr + sizeof(simd16<T>) * 2 / sizeof(T));
    this->chunks[3].store(ptr + sizeof(simd16<T>) * 3 / sizeof(T));
  }

  simdutf_really_inline simd16<T> reduce_or() const {
    return (this->chunks[0] | this->chunks[1]) |
           (this->chunks[2] | this->chunks[3]);
  }

  simdutf_really_inline bool is_ascii() const {
    return this->reduce_or().is_ascii();
  }

  simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
    this->chunks[0].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 0);
    this->chunks[1].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 1);
    this->chunks[2].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 2);
    this->chunks[3].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 3);
  }

  simdutf_really_inline uint64_t to_bitmask() const {
    uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
    uint64_t r1 = this->chunks[1].to_bitmask();
    uint64_t r2 = this->chunks[2].to_bitmask();
    uint64_t r3 = this->chunks[3].to_bitmask();
    return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
  }

  simdutf_really_inline void swap_bytes() {
    this->chunks[0] = this->chunks[0].swap_bytes();
    this->chunks[1] = this->chunks[1].swap_bytes();
    this->chunks[2] = this->chunks[2].swap_bytes();
    this->chunks[3] = this->chunks[3].swap_bytes();
  }

  simdutf_really_inline uint64_t lteq(const T m) const {
    const simd16<T> mask = simd16<T>::splat(m);
    return simd16x32<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask,
                           this->chunks[2] <= mask, this->chunks[3] <= mask)
        .to_bitmask();
  }

  simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
    const simd16<T> mask_low = simd16<T>::splat(static_cast<T>(low - 1));
    const simd16<T> mask_high = simd16<T>::splat(static_cast<T>(high + 1));
    return simd16x32<bool>(
               (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
               (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low),
               (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low),
               (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low))
        .to_bitmask();
  }
}; // struct simd16x32<T>
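// not_in_range above turns the closed interval [low, high] into two inclusive
// comparisons by testing against low - 1 and high + 1; with low = 0xD800 and
// high = 0xDFFF, for example, the set bits would mark non-surrogate code
// units. The static_casts assume low > 0 and high < 0xFFFF, since the adjusted
// bounds would otherwise wrap around.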
|
|
/* end file src/simdutf/ppc64/simd16-inl.h */
|
|
/* begin file src/simdutf/ppc64/simd32-inl.h */
|
|
// file included directly
|
|
|
|
template <typename T> struct simd32;
|
|
|
|
template <typename T> struct base32 {
|
|
using vector_type = vector_u32_type_for_element<T>;
|
|
static const int SIZE = sizeof(vector_type);
|
|
static const int ELEMENTS = sizeof(vector_type) / sizeof(T);
|
|
|
|
vector_type value;
|
|
|
|
// Zero constructor
|
|
simdutf_really_inline base32() : value{vector_type()} {}
|
|
|
|
// Conversion from SIMD register
|
|
simdutf_really_inline base32(const vector_type _value) : value{_value} {}
|
|
|
|
// Splat for scalar
|
|
simdutf_really_inline base32(T scalar) : value{vec_splats(scalar)} {}
|
|
|
|
template <typename Pointer>
|
|
simdutf_really_inline base32(const Pointer *ptr)
|
|
: base32(vec_xl(0, reinterpret_cast<const T *>(ptr))) {}
|
|
|
|
// Store to array
|
|
template <typename U> simdutf_really_inline void store(U *dst) const {
|
|
#if defined(__clang__)
|
|
return vec_xst(this->value, 0, reinterpret_cast<T *>(dst));
|
|
#else
|
|
return vec_xst(this->value, 0, reinterpret_cast<vector_type *>(dst));
|
|
#endif // defined(__clang__)
|
|
}
|
|
|
|
void dump(const char *name = nullptr) const {
|
|
if (name != nullptr) {
|
|
printf("%-10s = ", name);
|
|
}
|
|
|
|
uint32_t tmp[4];
|
|
vec_xst(value, 0, reinterpret_cast<vector_type *>(tmp));
|
|
for (int i = 0; i < 4; i++) {
|
|
if (i == 0) {
|
|
printf("[%08x", tmp[i]);
|
|
} else if (i == 4 - 1) {
|
|
printf(" %08x]", tmp[i]);
|
|
} else {
|
|
printf(" %08x", tmp[i]);
|
|
}
|
|
}
|
|
putchar('\n');
|
|
}
|
|
};
|
|
|
|
template <typename T> struct base32_numeric : base32<T> {
|
|
using super = base32<T>;
|
|
using vector_type = typename super::vector_type;
|
|
|
|
static simdutf_really_inline simd32<T> splat(T _value) {
|
|
return vec_splats(_value);
|
|
}
|
|
|
|
static simdutf_really_inline simd32<T> zero() { return splat(0); }
|
|
|
|
template <typename U>
|
|
static simdutf_really_inline simd32<T> load(const U *values) {
|
|
return vec_xl(0, reinterpret_cast<const T *>(values));
|
|
}
|
|
|
|
simdutf_really_inline base32_numeric() : base32<T>() {}
|
|
|
|
simdutf_really_inline base32_numeric(const vector_type _value)
|
|
: base32<T>(_value) {}
|
|
|
|
// Addition/subtraction are the same for signed and unsigned
|
|
simdutf_really_inline simd32<T> operator+(const simd32<T> other) const {
|
|
return vec_add(this->value, other.value);
|
|
}
|
|
|
|
simdutf_really_inline simd32<T> operator-(const simd32<T> other) const {
|
|
return vec_sub(this->value, other.value);
|
|
}
|
|
|
|
simdutf_really_inline simd32<T> &operator+=(const simd32<T> other) {
|
|
*this = *this + other;
|
|
return *static_cast<simd32<T> *>(this);
|
|
}
|
|
|
|
simdutf_really_inline simd32<T> &operator-=(const simd32<T> other) {
|
|
*this = *this - other;
|
|
return *static_cast<simd32<T> *>(this);
|
|
}
|
|
};
|
|
|
|
// Forward declaration
|
|
template <typename> struct simd32;
|
|
|
|
template <typename T>
|
|
simd32<bool> operator==(const simd32<T> a, const simd32<T> b);
|
|
|
|
template <typename T>
|
|
simd32<bool> operator!=(const simd32<T> a, const simd32<T> b);
|
|
|
|
template <typename T>
|
|
simd32<bool> operator>(const simd32<T> a, const simd32<T> b);
|
|
|
|
template <typename T> simd32<bool> operator==(const simd32<T> a, T b);
|
|
|
|
template <typename T> simd32<bool> operator!=(const simd32<T> a, T b);
|
|
|
|
template <typename T> simd32<T> operator&(const simd32<T> a, const simd32<T> b);
|
|
|
|
template <typename T> simd32<T> operator|(const simd32<T> a, const simd32<T> b);
|
|
|
|
template <typename T> simd32<T> operator^(const simd32<T> a, const simd32<T> b);
|
|
|
|
// SIMD byte mask type (returned by things like eq and gt)
|
|
template <> struct simd32<bool> : base32<bool> {
|
|
static simdutf_really_inline simd32<bool> splat(bool _value) {
|
|
return (vector_type)vec_splats(uint32_t(-(!!_value)));
|
|
}
|
|
|
|
simdutf_really_inline simd32(const vector_type _value)
|
|
: base32<bool>(_value) {}
|
|
|
|
// Splat constructor
|
|
simdutf_really_inline simd32(bool _value) : base32<bool>(splat(_value)) {}
|
|
|
|
simdutf_really_inline uint16_t to_bitmask() const {
|
|
return move_mask_u8(value);
|
|
}
|
|
|
|
simdutf_really_inline bool any() const {
|
|
const vec_u64_t tmp = (vec_u64_t)value;
|
|
|
|
return tmp[0] || tmp[1]; // Note: logical or, not binary one
|
|
}
|
|
|
|
simdutf_really_inline bool is_zero() const {
|
|
const vec_u64_t tmp = (vec_u64_t)value;
|
|
|
|
return (tmp[0] | tmp[1]) == 0;
|
|
}
|
|
|
|
simdutf_really_inline simd32<bool> operator~() const {
|
|
return (vec_bool32_t)vec_xor(this->value, vec_splats(uint32_t(0xffffffff)));
|
|
}
|
|
};
|
|
|
|
// Unsigned code units
|
|
template <> struct simd32<uint32_t> : base32_numeric<uint32_t> {
|
|
simdutf_really_inline simd32() : base32_numeric<uint32_t>() {}
|
|
|
|
simdutf_really_inline simd32(const vector_type _value)
|
|
: base32_numeric<uint32_t>(_value) {}
|
|
|
|
// Splat constructor
|
|
simdutf_really_inline simd32(uint32_t _value) : simd32(splat(_value)) {}
|
|
|
|
// Array constructor
|
|
simdutf_really_inline simd32(const char32_t *values)
|
|
: simd32(load(reinterpret_cast<const uint32_t *>(values))) {}
|
|
|
|
// Bit-specific operations
|
|
template <int N> simdutf_really_inline simd32<uint32_t> shr() const {
|
|
return vec_sr(value, vec_splats(uint32_t(N)));
|
|
}
|
|
|
|
template <int N> simdutf_really_inline simd32<uint32_t> shl() const {
|
|
return vec_sl(value, vec_splats(uint32_t(N)));
|
|
}
|
|
|
|
// Change the endianness
|
|
simdutf_really_inline simd32<uint32_t> swap_bytes() const {
|
|
return vec_revb(value);
|
|
}
|
|
|
|
simdutf_really_inline uint64_t sum() const {
|
|
return uint64_t(value[0]) + uint64_t(value[1]) + uint64_t(value[2]) +
|
|
uint64_t(value[3]);
|
|
}
|
|
|
|
static simdutf_really_inline simd16<uint16_t>
|
|
pack(const simd32<uint32_t> &v0, const simd32<uint32_t> &v1) {
|
|
return vec_packs(v0.value, v1.value);
|
|
}
|
|
};

template <typename T>
simd32<bool> operator==(const simd32<T> a, const simd32<T> b) {
  return vec_cmpeq(a.value, b.value);
}

template <typename T>
simd32<bool> operator!=(const simd32<T> a, const simd32<T> b) {
  return vec_cmpne(a.value, b.value);
}

template <typename T> simd32<bool> operator==(const simd32<T> a, T b) {
  return vec_cmpeq(a.value, vec_splats(b));
}

template <typename T> simd32<bool> operator!=(const simd32<T> a, T b) {
  return vec_cmpne(a.value, vec_splats(b));
}

template <typename T>
simd32<bool> operator>(const simd32<T> a, const simd32<T> b) {
  return vec_cmpgt(a.value, b.value);
}

template <typename T>
simd32<bool> operator>=(const simd32<T> a, const simd32<T> b) {
  return vec_cmpge(a.value, b.value);
}

template <typename T>
simd32<T> operator&(const simd32<T> a, const simd32<T> b) {
  return vec_and(a.value, b.value);
}

template <typename T, typename U> simd32<T> operator&(const simd32<T> a, U b) {
  return vec_and(a.value, vec_splats(T(b)));
}

template <typename T>
simd32<T> operator|(const simd32<T> a, const simd32<T> b) {
  return vec_or(a.value, b.value);
}

template <typename T>
simd32<T> operator^(const simd32<T> a, const simd32<T> b) {
  return vec_xor(a.value, b.value);
}

template <typename T, typename U> simd32<T> operator^(const simd32<T> a, U b) {
  return vec_xor(a.value, vec_splats(T(b)));
}

template <typename T> simd32<T> max_val(const simd32<T> a, const simd32<T> b) {
  return vec_max(a.value, b.value);
}

template <typename T>
simdutf_really_inline simd32<T> min(const simd32<T> b, const simd32<T> a) {
  return vec_min(a.value, b.value);
}
/* end file src/simdutf/ppc64/simd32-inl.h */

template <typename T>
simd8<T> select(const simd8<T> cond, const simd8<T> val_true,
                const simd8<T> val_false) {
  return vec_sel(val_false.value, val_true.value, cond.value);
}

template <typename T>
simd8<T> select(const T cond, const simd8<T> val_true,
                const simd8<T> val_false) {
  return vec_sel(val_false.value, val_true.value, vec_splats(cond));
}

template <typename T>
simd16<T> select(const simd16<T> cond, const simd16<T> val_true,
                 const simd16<T> val_false) {
  return vec_sel(val_false.value, val_true.value, cond.value);
}

template <typename T>
simd16<T> select(const T cond, const simd16<T> val_true,
                 const simd16<T> val_false) {
  return vec_sel(val_false.value, val_true.value, vec_splats(cond));
}

template <typename T>
simd32<T> select(const simd32<T> cond, const simd32<T> val_true,
                 const simd32<T> val_false) {
  return vec_sel(val_false.value, val_true.value, cond.value);
}

template <typename T>
simd32<T> select(const T cond, const simd32<T> val_true,
                 const simd32<T> val_false) {
  return vec_sel(val_false.value, val_true.value, vec_splats(cond));
}
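
// Note: the select() overloads above are branchless blends. vec_sel copies
// each bit from val_true where the corresponding bit of cond is 1 and from
// val_false where it is 0, so an all-ones/all-zeros lane mask (as produced by
// the comparison operators above) selects whole lanes without branching.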

using vector_u8 = simd8<uint8_t>;
using vector_u16 = simd16<uint16_t>;
using vector_u32 = simd32<uint32_t>;
using vector_i8 = simd8<int8_t>;

simdutf_really_inline vector_u8 as_vector_u8(const vector_u16 v) {
  return vector_u8::vector_type(v.value);
}

simdutf_really_inline vector_u8 as_vector_u8(const vector_u32 v) {
  return vector_u8::vector_type(v.value);
}

simdutf_really_inline vector_u8 as_vector_u8(const vector_i8 v) {
  return vector_u8::vector_type(v.value);
}

simdutf_really_inline vector_u8 as_vector_u8(const simd16<bool> v) {
  return vector_u8::vector_type(v.value);
}

simdutf_really_inline vector_i8 as_vector_i8(const vector_u8 v) {
  return vector_i8::vector_type(v.value);
}

simdutf_really_inline vector_u16 as_vector_u16(const vector_u8 v) {
  return vector_u16::vector_type(v.value);
}

simdutf_really_inline vector_u16 as_vector_u16(const simd16<bool> v) {
  return vector_u16::vector_type(v.value);
}

simdutf_really_inline vector_u32 as_vector_u32(const vector_u8 v) {
  return vector_u32::vector_type(v.value);
}

simdutf_really_inline vector_u32 as_vector_u32(const vector_u16 v) {
  return vector_u32::vector_type(v.value);
}

simdutf_really_inline vector_u32 max(vector_u32 a, vector_u32 b) {
  return vec_max(a.value, b.value);
}

simdutf_really_inline vector_u32 max(vector_u32 a, vector_u32 b, vector_u32 c) {
  return max(max(a, b), c);
}

simdutf_really_inline vector_u32 sum4bytes(vector_u8 bytes, vector_u32 acc) {
  return vec_sum4s(bytes.value, acc.value);
}
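
// Note: vec_sum4s adds each aligned group of four bytes of `bytes` into the
// corresponding 32-bit lane of `acc`; e.g. a group of {1, 2, 3, 4} with an
// accumulator lane of 10 yields 20 in that lane, which is likely why it is
// used for byte-counting kernels.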

} // namespace simd
} // unnamed namespace
} // namespace ppc64
} // namespace simdutf

#endif // SIMDUTF_PPC64_SIMD_INPUT_H
/* end file src/simdutf/ppc64/simd.h */

/* begin file src/simdutf/ppc64/end.h */
/* end file src/simdutf/ppc64/end.h */

#endif // SIMDUTF_IMPLEMENTATION_PPC64

#endif // SIMDUTF_PPC64_H
/* end file src/simdutf/ppc64.h */
/* begin file src/simdutf/rvv.h */
#ifndef SIMDUTF_RVV_H
#define SIMDUTF_RVV_H

#ifdef SIMDUTF_FALLBACK_H
#error "rvv.h must be included before fallback.h"
#endif


#define SIMDUTF_CAN_ALWAYS_RUN_RVV SIMDUTF_IS_RVV

#ifndef SIMDUTF_IMPLEMENTATION_RVV
#define SIMDUTF_IMPLEMENTATION_RVV                                             \
  (SIMDUTF_CAN_ALWAYS_RUN_RVV ||                                               \
   (SIMDUTF_IS_RISCV64 && SIMDUTF_HAS_RVV_INTRINSICS &&                        \
    SIMDUTF_HAS_RVV_TARGET_REGION))
#endif

#if SIMDUTF_IMPLEMENTATION_RVV

#if SIMDUTF_CAN_ALWAYS_RUN_RVV
#define SIMDUTF_TARGET_RVV
#else
#define SIMDUTF_TARGET_RVV SIMDUTF_TARGET_REGION("arch=+v")
#endif
#if !SIMDUTF_IS_ZVBB && SIMDUTF_HAS_ZVBB_INTRINSICS
#define SIMDUTF_TARGET_ZVBB SIMDUTF_TARGET_REGION("arch=+v,+zvbb")
#endif

namespace simdutf {
namespace rvv {} // namespace rvv
} // namespace simdutf

/* begin file src/simdutf/rvv/implementation.h */
#ifndef SIMDUTF_RVV_IMPLEMENTATION_H
#define SIMDUTF_RVV_IMPLEMENTATION_H


namespace simdutf {
namespace rvv {

namespace {
using namespace simdutf;
} // namespace

class implementation final : public simdutf::implementation {
public:
  simdutf_really_inline implementation()
      : simdutf::implementation("rvv", "RISC-V Vector Extension",
                                internal::instruction_set::RVV),
        _supports_zvbb(internal::detect_supported_architectures() &
                       internal::instruction_set::ZVBB) {}
#if SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused int detect_encodings(const char *input,
                                           size_t length) const noexcept final;
#endif // SIMDUTF_FEATURE_DETECT_ENCODING
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused bool validate_utf8(const char *buf,
                                         size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
#if SIMDUTF_FEATURE_UTF8
  simdutf_warn_unused result
  validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8
#if SIMDUTF_FEATURE_ASCII
  simdutf_warn_unused bool validate_ascii(const char *buf,
                                          size_t len) const noexcept final;
  simdutf_warn_unused result
  validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_ASCII

#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
                                            size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
#if SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
                                            size_t len) const noexcept final;
  void to_well_formed_utf16be(const char16_t *input, size_t len,
                              char16_t *output) const noexcept final;
  void to_well_formed_utf16le(const char16_t *input, size_t len,
                              char16_t *output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused result validate_utf16le_with_errors(
      const char16_t *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused result validate_utf16be_with_errors(
      const char16_t *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused bool validate_utf32(const char32_t *buf,
                                          size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
#if SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused result validate_utf32_with_errors(
      const char32_t *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_latin1_to_utf8(
      const char *buf, size_t len, char *utf8_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_latin1_to_utf16le(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_latin1_to_utf16be(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_latin1_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
      const char *buf, size_t len, char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t convert_utf8_to_utf16le(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused size_t convert_utf8_to_utf16be(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t convert_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  convert_utf16le_to_latin1(const char16_t *buf, size_t len,
                            char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf16be_to_latin1(const char16_t *buf, size_t len,
                            char *latin1_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
      const char16_t *buf, size_t len,
      char *latin1_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
      const char16_t *buf, size_t len,
      char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len,
                                  char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len,
                                  char *latin1_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t convert_utf16le_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_utf16be_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  convert_utf32_to_latin1(const char32_t *buf, size_t len,
                          char *latin1_output) const noexcept final;
  simdutf_warn_unused result
  convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
                                      char *latin1_output) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_latin1(const char32_t *buf, size_t len,
                                char *latin1_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t convert_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  convert_utf32_to_utf16le(const char32_t *buf, size_t len,
                           char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf32_to_utf16be(const char32_t *buf, size_t len,
                           char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
      const char32_t *buf, size_t len,
      char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
      const char32_t *buf, size_t len,
      char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len,
                                 char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len,
                                 char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf16le_to_utf32(const char16_t *buf, size_t len,
                           char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf16be_to_utf32(const char16_t *buf, size_t len,
                           char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
      const char16_t *buf, size_t len,
      char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
      const char16_t *buf, size_t len,
      char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len,
                                 char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len,
                                 char32_t *utf32_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF16
  void change_endianness_utf16(const char16_t *buf, size_t length,
                               char16_t *output) const noexcept final;
  simdutf_warn_unused size_t count_utf16le(const char16_t *buf,
                                           size_t length) const noexcept;
  simdutf_warn_unused size_t count_utf16be(const char16_t *buf,
                                           size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF8
  simdutf_warn_unused size_t count_utf8(const char *buf,
                                        size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t
  utf8_length_from_utf16le(const char16_t *input, size_t length) const noexcept;
  simdutf_warn_unused size_t
  utf8_length_from_utf16be(const char16_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t utf32_length_from_utf16le(
      const char16_t *input, size_t length) const noexcept;
  simdutf_warn_unused size_t utf32_length_from_utf16be(
      const char16_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t
  utf16_length_from_utf8(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  utf8_length_from_utf32(const char32_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  utf16_length_from_utf32(const char32_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  utf32_length_from_utf8(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  latin1_length_from_utf8(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  utf8_length_from_latin1(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_BASE64
  simdutf_warn_unused result base64_to_binary(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept;
  simdutf_warn_unused result
  base64_to_binary(const char16_t *input, size_t length, char *output,
                   base64_options options,
                   last_chunk_handling_options last_chunk_options =
                       last_chunk_handling_options::loose) const noexcept;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept;
  size_t binary_to_base64(const char *input, size_t length, char *output,
                          base64_options options) const noexcept;
#endif // SIMDUTF_FEATURE_BASE64
private:
  const bool _supports_zvbb;

#if SIMDUTF_IS_ZVBB
  bool supports_zvbb() const { return true; }
#elif SIMDUTF_HAS_ZVBB_INTRINSICS
  bool supports_zvbb() const { return _supports_zvbb; }
#else
  bool supports_zvbb() const { return false; }
#endif
};

} // namespace rvv
} // namespace simdutf

#endif // SIMDUTF_RVV_IMPLEMENTATION_H
/* end file src/simdutf/rvv/implementation.h */
/* begin file src/simdutf/rvv/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "rvv"
// #define SIMDUTF_IMPLEMENTATION rvv

#if SIMDUTF_CAN_ALWAYS_RUN_RVV
// nothing needed.
#else
SIMDUTF_TARGET_RVV
#endif
/* end file src/simdutf/rvv/begin.h */
/* begin file src/simdutf/rvv/intrinsics.h */
#ifndef SIMDUTF_RVV_INTRINSICS_H
#define SIMDUTF_RVV_INTRINSICS_H


#include <riscv_vector.h>

#if __riscv_v_intrinsic >= 1000000 || __GNUC__ >= 14
#define simdutf_vrgather_u8m1x2(tbl, idx)                                      \
  __riscv_vcreate_v_u8m1_u8m2(                                                 \
      __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m2_u8m1(idx, 0),          \
                               __riscv_vsetvlmax_e8m1()),                      \
      __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m2_u8m1(idx, 1),          \
                               __riscv_vsetvlmax_e8m1()));

#define simdutf_vrgather_u8m1x4(tbl, idx)                                      \
  __riscv_vcreate_v_u8m1_u8m4(                                                 \
      __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 0),          \
                               __riscv_vsetvlmax_e8m1()),                      \
      __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 1),          \
                               __riscv_vsetvlmax_e8m1()),                      \
      __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 2),          \
                               __riscv_vsetvlmax_e8m1()),                      \
      __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 3),          \
                               __riscv_vsetvlmax_e8m1()));
#else
// This has worse codegen on gcc
#define simdutf_vrgather_u8m1x2(tbl, idx)                                      \
  __riscv_vset_v_u8m1_u8m2(                                                    \
      __riscv_vlmul_ext_v_u8m1_u8m2(__riscv_vrgather_vv_u8m1(                  \
          tbl, __riscv_vget_v_u8m2_u8m1(idx, 0), __riscv_vsetvlmax_e8m1())),   \
      1,                                                                       \
      __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m2_u8m1(idx, 1),          \
                               __riscv_vsetvlmax_e8m1()))

#define simdutf_vrgather_u8m1x4(tbl, idx)                                      \
  __riscv_vset_v_u8m1_u8m4(                                                    \
      __riscv_vset_v_u8m1_u8m4(                                                \
          __riscv_vset_v_u8m1_u8m4(                                            \
              __riscv_vlmul_ext_v_u8m1_u8m4(__riscv_vrgather_vv_u8m1(          \
                  tbl, __riscv_vget_v_u8m4_u8m1(idx, 0),                       \
                  __riscv_vsetvlmax_e8m1())),                                  \
              1,                                                               \
              __riscv_vrgather_vv_u8m1(tbl,                                    \
                                       __riscv_vget_v_u8m4_u8m1(idx, 1),       \
                                       __riscv_vsetvlmax_e8m1())),             \
          2,                                                                   \
          __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 2),      \
                                   __riscv_vsetvlmax_e8m1())),                 \
      3,                                                                       \
      __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 3),          \
                               __riscv_vsetvlmax_e8m1()))
#endif
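
// Illustrative note (not from the original source): simdutf_vrgather_u8m1x2
// performs one independent LMUL=1 vrgather per LMUL=1 half of the LMUL=2
// index vector, always against the same LMUL=1 table, and then reassembles
// the halves. Both branches above compute the same result; they differ only
// in whether vcreate or vlmul_ext+vset is used to build the wider register.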

/* Zvbb adds dedicated support for endianness swaps with vrev8, but if we can't
 * use that, we have to emulate it with the standard V extension.
 * Using LMUL=1 vrgathers could be faster than the srl+macc variant, but that
 * would increase register pressure, and vrgather performance varies a lot
 * across implementations. */
enum class simdutf_ByteFlip { NONE, V, ZVBB };

template <simdutf_ByteFlip method>
simdutf_really_inline static uint16_t simdutf_byteflip(uint16_t v) {
  if (method != simdutf_ByteFlip::NONE)
    return (uint16_t)((v * 1u) << 8 | (v * 1u) >> 8);
  return v;
}
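
// For example, simdutf_byteflip<simdutf_ByteFlip::V>(uint16_t(0x1234)) yields
// 0x3412, while the NONE instantiation is an identity. The `v * 1u`
// multiplications force promotion to unsigned so the shifts are well defined.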

#ifdef SIMDUTF_TARGET_ZVBB
SIMDUTF_UNTARGET_REGION
SIMDUTF_TARGET_ZVBB
#endif

template <simdutf_ByteFlip method>
simdutf_really_inline static vuint16m1_t simdutf_byteflip(vuint16m1_t v,
                                                          size_t vl) {
#if SIMDUTF_HAS_ZVBB_INTRINSICS
  if (method == simdutf_ByteFlip::ZVBB)
    return __riscv_vrev8_v_u16m1(v, vl);
#endif
  if (method == simdutf_ByteFlip::V)
    return __riscv_vmacc_vx_u16m1(__riscv_vsrl_vx_u16m1(v, 8, vl), 0x100, v,
                                  vl);
  return v;
}
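
// The srl+macc form computes (v >> 8) + (v * 0x100) with 16-bit lane
// wraparound, which is exactly a byte swap: for a lane holding 0xAABB the
// shift contributes 0x00AA, the multiply-accumulate contributes 0xBB00, and
// the sum is 0xBBAA. The same pattern is repeated below for LMUL 2, 4 and 8.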

template <simdutf_ByteFlip method>
simdutf_really_inline static vuint16m2_t simdutf_byteflip(vuint16m2_t v,
                                                          size_t vl) {
#if SIMDUTF_HAS_ZVBB_INTRINSICS
  if (method == simdutf_ByteFlip::ZVBB)
    return __riscv_vrev8_v_u16m2(v, vl);
#endif
  if (method == simdutf_ByteFlip::V)
    return __riscv_vmacc_vx_u16m2(__riscv_vsrl_vx_u16m2(v, 8, vl), 0x100, v,
                                  vl);
  return v;
}

template <simdutf_ByteFlip method>
simdutf_really_inline static vuint16m4_t simdutf_byteflip(vuint16m4_t v,
                                                          size_t vl) {
#if SIMDUTF_HAS_ZVBB_INTRINSICS
  if (method == simdutf_ByteFlip::ZVBB)
    return __riscv_vrev8_v_u16m4(v, vl);
#endif
  if (method == simdutf_ByteFlip::V)
    return __riscv_vmacc_vx_u16m4(__riscv_vsrl_vx_u16m4(v, 8, vl), 0x100, v,
                                  vl);
  return v;
}

template <simdutf_ByteFlip method>
simdutf_really_inline static vuint16m8_t simdutf_byteflip(vuint16m8_t v,
                                                          size_t vl) {
#if SIMDUTF_HAS_ZVBB_INTRINSICS
  if (method == simdutf_ByteFlip::ZVBB)
    return __riscv_vrev8_v_u16m8(v, vl);
#endif
  if (method == simdutf_ByteFlip::V)
    return __riscv_vmacc_vx_u16m8(__riscv_vsrl_vx_u16m8(v, 8, vl), 0x100, v,
                                  vl);
  return v;
}

#ifdef SIMDUTF_TARGET_ZVBB
SIMDUTF_UNTARGET_REGION
SIMDUTF_TARGET_RVV
#endif

#endif // SIMDUTF_RVV_INTRINSICS_H
/* end file src/simdutf/rvv/intrinsics.h */
/* begin file src/simdutf/rvv/end.h */
#if SIMDUTF_CAN_ALWAYS_RUN_RVV
// nothing needed.
#else
SIMDUTF_UNTARGET_REGION
#endif

/* end file src/simdutf/rvv/end.h */

#endif // SIMDUTF_IMPLEMENTATION_RVV

#endif // SIMDUTF_RVV_H
/* end file src/simdutf/rvv.h */
/* begin file src/simdutf/lsx.h */
#ifndef SIMDUTF_LSX_H
#define SIMDUTF_LSX_H

#ifdef SIMDUTF_FALLBACK_H
#error "lsx.h must be included before fallback.h"
#endif


#ifndef SIMDUTF_IMPLEMENTATION_LSX
#define SIMDUTF_IMPLEMENTATION_LSX (SIMDUTF_IS_LSX)
#endif
#if SIMDUTF_IMPLEMENTATION_LSX && SIMDUTF_IS_LSX
#define SIMDUTF_CAN_ALWAYS_RUN_LSX 1
#else
#define SIMDUTF_CAN_ALWAYS_RUN_LSX 0
#endif

#define SIMDUTF_CAN_ALWAYS_RUN_FALLBACK (SIMDUTF_IMPLEMENTATION_FALLBACK)

#if SIMDUTF_IMPLEMENTATION_LSX

namespace simdutf {
/**
 * Implementation for LoongArch SX.
 */
namespace lsx {} // namespace lsx
} // namespace simdutf

/* begin file src/simdutf/lsx/implementation.h */
#ifndef SIMDUTF_LSX_IMPLEMENTATION_H
#define SIMDUTF_LSX_IMPLEMENTATION_H


namespace simdutf {
namespace lsx {

namespace {
using namespace simdutf;
}

class implementation final : public simdutf::implementation {
public:
  simdutf_really_inline implementation()
      : simdutf::implementation("lsx", "LOONGARCH SX",
                                internal::instruction_set::LSX) {}
#if SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused int detect_encodings(const char *input,
                                           size_t length) const noexcept final;
#endif // SIMDUTF_FEATURE_DETECT_ENCODING
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused bool validate_utf8(const char *buf,
                                         size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
#if SIMDUTF_FEATURE_UTF8
  simdutf_warn_unused result
  validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8
#if SIMDUTF_FEATURE_ASCII
  simdutf_warn_unused bool validate_ascii(const char *buf,
                                          size_t len) const noexcept final;
  simdutf_warn_unused result
  validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_ASCII

#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
                                            size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
#if SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
                                            size_t len) const noexcept final;
  simdutf_warn_unused result validate_utf16le_with_errors(
      const char16_t *buf, size_t len) const noexcept final;
  simdutf_warn_unused result validate_utf16be_with_errors(
      const char16_t *buf, size_t len) const noexcept final;
  void to_well_formed_utf16be(const char16_t *input, size_t len,
                              char16_t *output) const noexcept final;
  void to_well_formed_utf16le(const char16_t *input, size_t len,
                              char16_t *output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused bool validate_utf32(const char32_t *buf,
                                          size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
#if SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused result validate_utf32_with_errors(
      const char32_t *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_latin1_to_utf8(
      const char *buf, size_t len, char *utf8_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_latin1_to_utf16le(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_latin1_to_utf16be(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_latin1_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
      const char *buf, size_t len, char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t convert_utf8_to_utf16le(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused size_t convert_utf8_to_utf16be(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t convert_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  convert_utf16le_to_latin1(const char16_t *buf, size_t len,
                            char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf16be_to_latin1(const char16_t *buf, size_t len,
                            char *latin1_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
      const char16_t *buf, size_t len,
      char *latin1_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
      const char16_t *buf, size_t len,
      char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len,
                                  char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len,
                                  char *latin1_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t convert_utf16le_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_utf16be_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  convert_utf32_to_latin1(const char32_t *buf, size_t len,
                          char *latin1_output) const noexcept final;
  simdutf_warn_unused result
  convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
                                      char *latin1_output) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_latin1(const char32_t *buf, size_t len,
                                char *latin1_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t convert_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  convert_utf32_to_utf16le(const char32_t *buf, size_t len,
                           char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf32_to_utf16be(const char32_t *buf, size_t len,
                           char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
      const char32_t *buf, size_t len,
      char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
      const char32_t *buf, size_t len,
      char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len,
                                 char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len,
                                 char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf16le_to_utf32(const char16_t *buf, size_t len,
                           char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf16be_to_utf32(const char16_t *buf, size_t len,
                           char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
      const char16_t *buf, size_t len,
      char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
      const char16_t *buf, size_t len,
      char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len,
                                 char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len,
                                 char32_t *utf32_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF16
  void change_endianness_utf16(const char16_t *buf, size_t length,
                               char16_t *output) const noexcept final;
  simdutf_warn_unused size_t count_utf16le(const char16_t *buf,
                                           size_t length) const noexcept;
  simdutf_warn_unused size_t count_utf16be(const char16_t *buf,
                                           size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF8
  simdutf_warn_unused size_t count_utf8(const char *buf,
                                        size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t
  utf8_length_from_utf16le(const char16_t *input, size_t length) const noexcept;
  simdutf_warn_unused size_t
  utf8_length_from_utf16be(const char16_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t utf32_length_from_utf16le(
      const char16_t *input, size_t length) const noexcept;
  simdutf_warn_unused size_t utf32_length_from_utf16be(
      const char16_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t
  utf16_length_from_utf8(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  utf8_length_from_utf32(const char32_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  utf16_length_from_utf32(const char32_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  utf32_length_from_utf8(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  latin1_length_from_utf8(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  utf8_length_from_latin1(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_BASE64
  simdutf_warn_unused result base64_to_binary(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept;
  simdutf_warn_unused result
  base64_to_binary(const char16_t *input, size_t length, char *output,
                   base64_options options,
                   last_chunk_handling_options last_chunk_options =
                       last_chunk_handling_options::loose) const noexcept;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept;
  size_t binary_to_base64(const char *input, size_t length, char *output,
                          base64_options options) const noexcept;
#endif // SIMDUTF_FEATURE_BASE64
};

} // namespace lsx
} // namespace simdutf

#endif // SIMDUTF_LSX_IMPLEMENTATION_H
/* end file src/simdutf/lsx/implementation.h */

/* begin file src/simdutf/lsx/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "lsx"
// #define SIMDUTF_IMPLEMENTATION lsx
#define SIMDUTF_SIMD_HAS_UNSIGNED_CMP 1
/* end file src/simdutf/lsx/begin.h */

// Declarations
/* begin file src/simdutf/lsx/intrinsics.h */
#ifndef SIMDUTF_LSX_INTRINSICS_H
#define SIMDUTF_LSX_INTRINSICS_H


// This should be the correct header whether
// you use visual studio or other compilers.
#include <lsxintrin.h>

/*
Encoding of argument for LoongArch64 xvldi instruction. See:
https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/misc/#__m256i-__lasx_xvldi-imm_n1024_1023-imm

1: imm[12:8]=0b10000: broadcast imm[7:0] as 32-bit elements to all lanes

2: imm[12:8]=0b10001: broadcast imm[7:0] << 8 as 32-bit elements to all lanes

3: imm[12:8]=0b10010: broadcast imm[7:0] << 16 as 32-bit elements to all lanes

4: imm[12:8]=0b10011: broadcast imm[7:0] << 24 as 32-bit elements to all lanes

5: imm[12:8]=0b10100: broadcast imm[7:0] as 16-bit elements to all lanes

6: imm[12:8]=0b10101: broadcast imm[7:0] << 8 as 16-bit elements to all lanes

7: imm[12:8]=0b10110: broadcast (imm[7:0] << 8) | 0xFF as 32-bit elements to all
lanes

8: imm[12:8]=0b10111: broadcast (imm[7:0] << 16) | 0xFFFF as 32-bit elements to
all lanes

9: imm[12:8]=0b11000: broadcast imm[7:0] as 8-bit elements to all lanes

10: imm[12:8]=0b11001: repeat each bit of imm[7:0] eight times, and broadcast
the result as 64-bit elements to all lanes
*/

namespace vldi {

template <uint16_t v> class const_u16 {
  constexpr static const uint8_t b0 = ((v >> 0 * 8) & 0xff);
  constexpr static const uint8_t b1 = ((v >> 1 * 8) & 0xff);

  constexpr static bool is_case5 = uint16_t(b0) == v;
  constexpr static bool is_case6 = (uint16_t(b1) << 8) == v;
  constexpr static bool is_case9 = (b0 == b1);
  constexpr static bool is_case10 =
      ((b0 == 0xff) || (b0 == 0x00)) && ((b1 == 0xff) || (b1 == 0x00));

public:
  constexpr static uint16_t operation = is_case5 ? 0b10100
                                        : is_case6 ? 0b10101
                                        : is_case9 ? 0b11000
                                        : is_case10 ? 0b11001
                                                    : 0xffff;

  constexpr static uint16_t byte =
      is_case5 ? b0
      : is_case6 ? b1
      : is_case9 ? b0
      : is_case10 ? ((b0 ? 0x55 : 0x00) | (b1 ? 0xaa : 0x00))
      : 0xffff;

  constexpr static int value = int((operation << 8) | byte) - 8192;
  constexpr static bool valid = operation != 0xffff;
};

template <uint32_t v> class const_u32 {
  constexpr static const uint8_t b0 = (v & 0xff);
  constexpr static const uint8_t b1 = ((v >> 8) & 0xff);
  constexpr static const uint8_t b2 = ((v >> 16) & 0xff);
  constexpr static const uint8_t b3 = ((v >> 24) & 0xff);

  constexpr static bool is_case1 = (uint32_t(b0) == v);
  constexpr static bool is_case2 = ((uint32_t(b1) << 8) == v);
  constexpr static bool is_case3 = ((uint32_t(b2) << 16) == v);
  constexpr static bool is_case4 = ((uint32_t(b3) << 24) == v);
  constexpr static bool is_case5 = (b0 == b2) && (b1 == 0) && (b3 == 0);
  constexpr static bool is_case6 = (b1 == b3) && (b0 == 0) && (b2 == 0);
  constexpr static bool is_case7 = (b3 == 0) && (b2 == 0) && (b0 == 0xff);
  constexpr static bool is_case8 = (b3 == 0) && (b1 == 0xff) && (b0 == 0xff);
  constexpr static bool is_case9 = (b0 == b1) && (b0 == b2) && (b0 == b3);
  constexpr static bool is_case10 =
      ((b0 == 0xff) || (b0 == 0x00)) && ((b1 == 0xff) || (b1 == 0x00)) &&
      ((b2 == 0xff) || (b2 == 0x00)) && ((b3 == 0xff) || (b3 == 0x00));

public:
  constexpr static uint16_t operation = is_case1 ? 0b10000
                                        : is_case2 ? 0b10001
                                        : is_case3 ? 0b10010
                                        : is_case4 ? 0b10011
                                        : is_case5 ? 0b10100
                                        : is_case6 ? 0b10101
                                        : is_case7 ? 0b10110
                                        : is_case8 ? 0b10111
                                        : is_case9 ? 0b11000
                                        : is_case10 ? 0b11001
                                                    : 0xffff;

  constexpr static uint16_t byte =
      is_case1 ? b0
      : is_case2 ? b1
      : is_case3 ? b2
      : is_case4 ? b3
      : is_case5 ? b0
      : is_case6 ? b1
      : is_case7 ? b1
      : is_case8 ? b2
      : is_case9 ? b0
      : is_case10 ? ((b0 ? 0x11 : 0x00) | (b1 ? 0x22 : 0x00) |
                     (b2 ? 0x44 : 0x00) | (b3 ? 0x88 : 0x00))
      : 0xffff;

  constexpr static int value = int((operation << 8) | byte) - 8192;
  constexpr static bool valid = operation != 0xffff;
};

template <uint64_t v> class const_u64 {
  constexpr static const uint8_t b0 = ((v >> 0 * 8) & 0xff);
  constexpr static const uint8_t b1 = ((v >> 1 * 8) & 0xff);
  constexpr static const uint8_t b2 = ((v >> 2 * 8) & 0xff);
  constexpr static const uint8_t b3 = ((v >> 3 * 8) & 0xff);
  constexpr static const uint8_t b4 = ((v >> 4 * 8) & 0xff);
  constexpr static const uint8_t b5 = ((v >> 5 * 8) & 0xff);
  constexpr static const uint8_t b6 = ((v >> 6 * 8) & 0xff);
  constexpr static const uint8_t b7 = ((v >> 7 * 8) & 0xff);

  constexpr static bool is_case10 =
      ((b0 == 0xff) || (b0 == 0x00)) && ((b1 == 0xff) || (b1 == 0x00)) &&
      ((b2 == 0xff) || (b2 == 0x00)) && ((b3 == 0xff) || (b3 == 0x00)) &&
      ((b4 == 0xff) || (b4 == 0x00)) && ((b5 == 0xff) || (b5 == 0x00)) &&
      ((b6 == 0xff) || (b6 == 0x00)) && ((b7 == 0xff) || (b7 == 0x00));

public:
  constexpr static bool is_32bit =
((v & 0xffffffff) == (v >> 32)) && const_u32<(v >> 32)>::value;
  constexpr static uint8_t op_32bit = const_u32<(v >> 32)>::operation;
  constexpr static uint8_t byte_32bit = const_u32<(v >> 32)>::byte;

  constexpr static uint16_t operation = is_32bit ? op_32bit
                                        : is_case10 ? 0b11001
                                                    : 0xffff;

  constexpr static uint16_t byte =
      is_32bit ? byte_32bit
      : is_case10
          ? ((b0 ? 0x01 : 0x00) | (b1 ? 0x02 : 0x00) | (b2 ? 0x04 : 0x00) |
             (b3 ? 0x08 : 0x00) | (b4 ? 0x10 : 0x00) | (b5 ? 0x20 : 0x00) |
             (b6 ? 0x40 : 0x00) | (b7 ? 0x80 : 0x00))
          : 0xffff;

  constexpr static int value = int((operation << 8) | byte) - 8192;
  constexpr static bool valid = operation != 0xffff;
};
} // namespace vldi
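
// Worked example (illustrative): vldi::const_u32<0x20202020> matches case 9
// (all four bytes equal), so operation == 0b11000, byte == 0x20 and
// value == ((0b11000 << 8) | 0x20) - 8192 == -2016; __lsx_vldi(-2016)
// therefore broadcasts the byte 0x20 across the whole vector.
// vldi::const_u16<0x2020> reduces to the same immediate via its case 9.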

// Uncomment when running under QEMU affected
// by bug https://gitlab.com/qemu-project/qemu/-/issues/2865
// Versions <= 9.2.2 are affected, likely anything newer is correct.
#ifndef QEMU_VLDI_BUG
// #define QEMU_VLDI_BUG 1
#endif

#ifdef QEMU_VLDI_BUG
#define lsx_splat_u16(v) __lsx_vreplgr2vr_h(v)
#define lsx_splat_u32(v) __lsx_vreplgr2vr_w(v)
#else
template <uint16_t x> constexpr __m128i lsx_splat_u16_aux() {
  constexpr bool is_imm10 = (int16_t(x) < 512) && (int16_t(x) > -512);
  constexpr uint16_t imm10 = is_imm10 ? x : 0;
  constexpr bool is_vldi = vldi::const_u16<x>::valid;
  constexpr int vldi_imm = is_vldi ? vldi::const_u16<x>::value : 0;

  return is_imm10 ? __lsx_vrepli_h(int16_t(imm10))
         : is_vldi ? __lsx_vldi(vldi_imm)
                   : __lsx_vreplgr2vr_h(x);
}

template <uint32_t x> constexpr __m128i lsx_splat_u32_aux() {
  constexpr bool is_imm10 = (int32_t(x) < 512) && (int32_t(x) > -512);
  constexpr uint32_t imm10 = is_imm10 ? x : 0;
  constexpr bool is_vldi = vldi::const_u32<x>::valid;
  constexpr int vldi_imm = is_vldi ? vldi::const_u32<x>::value : 0;

  return is_imm10 ? __lsx_vrepli_w(int32_t(imm10))
         : is_vldi ? __lsx_vldi(vldi_imm)
                   : __lsx_vreplgr2vr_w(x);
}

#define lsx_splat_u16(v) lsx_splat_u16_aux<(v)>()
#define lsx_splat_u32(v) lsx_splat_u32_aux<(v)>()
#endif // QEMU_VLDI_BUG
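
// Usage sketch (illustrative): lsx_splat_u16(42) fits the signed 10-bit
// immediate and resolves to __lsx_vrepli_h(42); lsx_splat_u16(0x2020) takes
// the vldi path and resolves to __lsx_vldi(-2016); anything else falls back
// to a general-register broadcast via __lsx_vreplgr2vr_h.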

#endif // SIMDUTF_LSX_INTRINSICS_H
/* end file src/simdutf/lsx/intrinsics.h */
/* begin file src/simdutf/lsx/bitmanipulation.h */
#ifndef SIMDUTF_LSX_BITMANIPULATION_H
#define SIMDUTF_LSX_BITMANIPULATION_H

#include <limits>

namespace simdutf {
namespace lsx {
namespace {

simdutf_really_inline int count_ones(uint64_t input_num) {
  return __lsx_vpickve2gr_w(__lsx_vpcnt_d(__lsx_vreplgr2vr_d(input_num)), 0);
}

#if SIMDUTF_NEED_TRAILING_ZEROES
// simdutf_really_inline int trailing_zeroes(uint64_t input_num) {
//   return __builtin_ctzll(input_num);
// }
#endif

} // unnamed namespace
} // namespace lsx
} // namespace simdutf

#endif // SIMDUTF_LSX_BITMANIPULATION_H
/* end file src/simdutf/lsx/bitmanipulation.h */
/* begin file src/simdutf/lsx/simd.h */
#ifndef SIMDUTF_LSX_SIMD_H
#define SIMDUTF_LSX_SIMD_H


namespace simdutf {
namespace lsx {
namespace {
namespace simd {

template <typename T> struct simd8;

//
// Base class of simd8<uint8_t> and simd8<bool>, both of which use __m128i
// internally.
//
template <typename T, typename Mask = simd8<bool>> struct base_u8 {
  __m128i value;
  static const int SIZE = sizeof(value);

  // Conversion from/to SIMD register
  simdutf_really_inline base_u8(const __m128i _value) : value(_value) {}
  simdutf_really_inline operator const __m128i &() const { return this->value; }
  simdutf_really_inline operator __m128i &() { return this->value; }

  // Bit operations
  simdutf_really_inline simd8<T> operator|(const simd8<T> other) const {
    return __lsx_vor_v(this->value, other);
  }
  simdutf_really_inline simd8<T> operator&(const simd8<T> other) const {
    return __lsx_vand_v(this->value, other);
  }
  simdutf_really_inline simd8<T> operator^(const simd8<T> other) const {
    return __lsx_vxor_v(this->value, other);
  }
  simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
  simdutf_really_inline simd8<T> &operator|=(const simd8<T> other) {
    auto this_cast = static_cast<simd8<T> *>(this);
    *this_cast = *this_cast | other;
    return *this_cast;
  }

  friend simdutf_really_inline Mask operator==(const simd8<T> lhs,
                                               const simd8<T> rhs) {
    return __lsx_vseq_b(lhs, rhs);
  }

  template <int N = 1>
  simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
    return __lsx_vor_v(__lsx_vbsll_v(this->value, N),
                       __lsx_vbsrl_v(prev_chunk.value, 16 - N));
  }
};

// SIMD byte mask type (returned by things like eq and gt)
template <> struct simd8<bool> : base_u8<bool> {
  typedef uint16_t bitmask_t;
  typedef uint32_t bitmask2_t;

  static simdutf_really_inline simd8<bool> splat(bool _value) {
    return __lsx_vreplgr2vr_b(uint8_t(-(!!_value)));
  }

  simdutf_really_inline simd8(const __m128i _value) : base_u8<bool>(_value) {}
  // False constructor
  simdutf_really_inline simd8() : simd8(__lsx_vldi(0)) {}
  // Splat constructor
  simdutf_really_inline simd8(bool _value) : simd8(splat(_value)) {}
  simdutf_really_inline void store(uint8_t dst[16]) const {
    return __lsx_vst(this->value, dst, 0);
  }

  simdutf_really_inline uint32_t to_bitmask() const {
    return __lsx_vpickve2gr_wu(__lsx_vmsknz_b(*this), 0);
  }
};

// Unsigned bytes
template <> struct simd8<uint8_t> : base_u8<uint8_t> {
  static simdutf_really_inline simd8<uint8_t> splat(uint8_t _value) {
    return __lsx_vreplgr2vr_b(_value);
  }
  static simdutf_really_inline simd8<uint8_t> zero() { return __lsx_vldi(0); }
  static simdutf_really_inline simd8<uint8_t> load(const uint8_t *values) {
    return __lsx_vld(values, 0);
  }
  simdutf_really_inline simd8(const __m128i _value)
      : base_u8<uint8_t>(_value) {}
  // Zero constructor
  simdutf_really_inline simd8() : simd8(zero()) {}
  // Array constructor
  simdutf_really_inline simd8(const uint8_t values[16]) : simd8(load(values)) {}
  // Splat constructor
  simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
  // Member-by-member initialization
  simdutf_really_inline
  simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
        uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
        uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
      : simd8((__m128i)v16u8{v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
                             v12, v13, v14, v15}) {}

  // Repeat 16 values as many times as necessary (usually for lookup tables)
  simdutf_really_inline static simd8<uint8_t>
  repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
            uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9,
            uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14,
            uint8_t v15) {
    return simd8<uint8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
                          v13, v14, v15);
  }

  // Store to array
  simdutf_really_inline void store(uint8_t dst[16]) const {
    return __lsx_vst(this->value, dst, 0);
  }

  // Order-specific operations
  simdutf_really_inline simd8<bool>
  operator>=(const simd8<uint8_t> other) const {
    return __lsx_vsle_bu(other, *this);
  }
  simdutf_really_inline simd8<bool>
  operator>(const simd8<uint8_t> other) const {
    return __lsx_vslt_bu(other, *this);
  }
  simdutf_really_inline simd8 &operator-=(const simd8<uint8_t> other) {
    value = __lsx_vsub_b(value, other.value);
    return *this;
  }
  // Same as >, but instead of guaranteeing all 1's == true, false = 0 and true
  // = nonzero. For ARM, returns all 1's.
  simdutf_really_inline simd8<uint8_t>
  gt_bits(const simd8<uint8_t> other) const {
    return simd8<uint8_t>(*this > other);
  }

  // Bit-specific operations
  simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const {
    return __lsx_vslt_bu(__lsx_vldi(0), __lsx_vand_v(this->value, bits));
  }
  simdutf_really_inline bool is_ascii() const {
    return __lsx_vpickve2gr_hu(__lsx_vmskgez_b(this->value), 0) == 0xFFFF;
  }

  simdutf_really_inline bool any_bits_set_anywhere() const {
    return __lsx_vpickve2gr_hu(__lsx_vmsknz_b(this->value), 0) > 0;
  }
  template <int N> simdutf_really_inline simd8<uint8_t> shr() const {
    return __lsx_vsrli_b(this->value, N);
  }
  template <int N> simdutf_really_inline simd8<uint8_t> shl() const {
    return __lsx_vslli_b(this->value, N);
  }

  // Perform a lookup assuming the value is between 0 and 16 (undefined behavior
  // for out of range values)
  template <typename L>
  simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
    return lookup_table.apply_lookup_16_to(*this);
  }

  template <typename L>
  simdutf_really_inline simd8<L>
  lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
            L replace5, L replace6, L replace7, L replace8, L replace9,
            L replace10, L replace11, L replace12, L replace13, L replace14,
            L replace15) const {
    return lookup_16(simd8<L>::repeat_16(
        replace0, replace1, replace2, replace3, replace4, replace5, replace6,
        replace7, replace8, replace9, replace10, replace11, replace12,
        replace13, replace14, replace15));
  }

  template <typename T>
  simdutf_really_inline simd8<uint8_t>
  apply_lookup_16_to(const simd8<T> original) const {
    __m128i original_tmp = __lsx_vand_v(original, __lsx_vldi(0x1f));
    return __lsx_vshuf_b(__lsx_vldi(0), *this, simd8<uint8_t>(original_tmp));
  }

  simdutf_really_inline uint64_t sum_bytes() const {
    const auto sum_u16 = __lsx_vhaddw_hu_bu(value, value);
    const auto sum_u32 = __lsx_vhaddw_wu_hu(sum_u16, sum_u16);
    const auto sum_u64 = __lsx_vhaddw_du_wu(sum_u32, sum_u32);

    return uint64_t(__lsx_vpickve2gr_du(sum_u64, 0)) +
           uint64_t(__lsx_vpickve2gr_du(sum_u64, 1));
  }
};
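
// Note: sum_bytes() widens pairwise three times (u8 -> u16 -> u32 -> u64 via
// the vhaddw intrinsics) and then adds the two remaining 64-bit halves in
// scalar code; e.g. a vector of sixteen 0xFF bytes sums to 16 * 255 = 4080.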
|
|
|
|
// Signed bytes
|
|
template <> struct simd8<int8_t> {
|
|
__m128i value;
|
|
|
|
static const int SIZE = sizeof(value);
|
|
|
|
static simdutf_really_inline simd8<int8_t> splat(int8_t _value) {
|
|
return __lsx_vreplgr2vr_b(_value);
|
|
}
|
|
static simdutf_really_inline simd8<int8_t> zero() { return __lsx_vldi(0); }
|
|
static simdutf_really_inline simd8<int8_t> load(const int8_t values[16]) {
|
|
return __lsx_vld(values, 0);
|
|
}
|
|
|
|
template <endianness big_endian>
|
|
simdutf_really_inline void store_ascii_as_utf16(char16_t *p) const {
|
|
__m128i zero = __lsx_vldi(0);
|
|
if (match_system(big_endian)) {
|
|
__lsx_vst(__lsx_vilvl_b(zero, (__m128i)this->value),
|
|
reinterpret_cast<uint16_t *>(p), 0);
|
|
__lsx_vst(__lsx_vilvh_b(zero, (__m128i)this->value),
|
|
reinterpret_cast<uint16_t *>(p + 8), 0);
|
|
} else {
|
|
__lsx_vst(__lsx_vilvl_b((__m128i)this->value, zero),
|
|
reinterpret_cast<uint16_t *>(p), 0);
|
|
__lsx_vst(__lsx_vilvh_b((__m128i)this->value, zero),
|
|
reinterpret_cast<uint16_t *>(p + 8), 0);
|
|
}
|
|
}
|
|
|
|
simdutf_really_inline void store_ascii_as_utf32(char32_t *p) const {
|
|
__m128i zero = __lsx_vldi(0);
|
|
__m128i in16low = __lsx_vilvl_b(zero, (__m128i)this->value);
|
|
__m128i in16high = __lsx_vilvh_b(zero, (__m128i)this->value);
|
|
__m128i in32_0 = __lsx_vilvl_h(zero, in16low);
|
|
__m128i in32_1 = __lsx_vilvh_h(zero, in16low);
|
|
__m128i in32_2 = __lsx_vilvl_h(zero, in16high);
|
|
__m128i in32_3 = __lsx_vilvh_h(zero, in16high);
|
|
__lsx_vst(in32_0, reinterpret_cast<uint32_t *>(p), 0);
|
|
__lsx_vst(in32_1, reinterpret_cast<uint32_t *>(p + 4), 0);
|
|
__lsx_vst(in32_2, reinterpret_cast<uint32_t *>(p + 8), 0);
|
|
__lsx_vst(in32_3, reinterpret_cast<uint32_t *>(p + 12), 0);
|
|
}
|
|
|
|
// In places where the table can be reused, which is most uses in simdutf, it
|
|
// is worth it to do 4 table lookups, as there is no direct zero extension
|
|
// from u8 to u32.
|
|
simdutf_really_inline void store_ascii_as_utf32_tbl(char32_t *p) const {
|
|
const simd8<uint8_t> tb1{0, 255, 255, 255, 1, 255, 255, 255,
|
|
2, 255, 255, 255, 3, 255, 255, 255};
|
|
const simd8<uint8_t> tb2{4, 255, 255, 255, 5, 255, 255, 255,
|
|
6, 255, 255, 255, 7, 255, 255, 255};
|
|
const simd8<uint8_t> tb3{8, 255, 255, 255, 9, 255, 255, 255,
|
|
10, 255, 255, 255, 11, 255, 255, 255};
|
|
const simd8<uint8_t> tb4{12, 255, 255, 255, 13, 255, 255, 255,
|
|
14, 255, 255, 255, 15, 255, 255, 255};
|
|
|
|
// encourage store pairing and interleaving
|
|
const auto shuf1 = this->apply_lookup_16_to(tb1);
|
|
const auto shuf2 = this->apply_lookup_16_to(tb2);
|
|
shuf1.store(reinterpret_cast<int8_t *>(p));
|
|
shuf2.store(reinterpret_cast<int8_t *>(p + 4));
|
|
|
|
const auto shuf3 = this->apply_lookup_16_to(tb3);
|
|
const auto shuf4 = this->apply_lookup_16_to(tb4);
|
|
shuf3.store(reinterpret_cast<int8_t *>(p + 8));
|
|
shuf4.store(reinterpret_cast<int8_t *>(p + 12));
|
|
}
|
|
// Conversion from/to SIMD register
|
|
simdutf_really_inline simd8(const __m128i _value) : value(_value) {}
|
|
|
|
// Zero constructor
|
|
simdutf_really_inline simd8() : simd8(zero()) {}
|
|
// Splat constructor
|
|
simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdutf_really_inline simd8(const int8_t *values) : simd8(load(values)) {}
|
|
|
|
// Store to array
|
|
simdutf_really_inline void store(int8_t dst[16]) const {
|
|
return __lsx_vst(value, dst, 0);
|
|
}
|
|
|
|
simdutf_really_inline operator simd8<uint8_t>() const {
|
|
return ((__m128i)this->value);
|
|
}
|
|
|
|
simdutf_really_inline simd8<int8_t>
|
|
operator|(const simd8<int8_t> other) const {
|
|
return __lsx_vor_v((__m128i)value, (__m128i)other.value);
|
|
}
|
|
|
|
simdutf_really_inline bool is_ascii() const {
|
|
return (__lsx_vpickve2gr_hu(__lsx_vmskgez_b((__m128i)this->value), 0) ==
|
|
0xffff);
|
|
}
|
|
|
|
// Order-sensitive comparisons
|
|
simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const {
|
|
return __lsx_vslt_b((__m128i)other.value, (__m128i)value);
|
|
}
|
|
simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const {
|
|
return __lsx_vslt_b((__m128i)value, (__m128i)other.value);
|
|
}
|
|
|
|
template <int N = 1>
|
|
simdutf_really_inline simd8<int8_t>
|
|
prev(const simd8<int8_t> prev_chunk) const {
|
|
return __lsx_vor_v(__lsx_vbsll_v(this->value, N),
|
|
__lsx_vbsrl_v(prev_chunk.value, 16 - N));
|
|
}
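  // Illustration only: with N == 1, bytes [p0..p15] in prev_chunk and
  // [c0..c15] in this chunk, prev<1>() yields [p15, c0, c1, ..., c14];
  // __lsx_vbsll_v shifts this chunk left by N bytes and __lsx_vbsrl_v
  // brings in the last N bytes of the previous chunk.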

  template <typename T>
  simdutf_really_inline simd8<int8_t>
  apply_lookup_16_to(const simd8<T> original) const {
    __m128i original_tmp = __lsx_vand_v(original, __lsx_vldi(0x1f));
    return __lsx_vshuf_b(__lsx_vldi(0), (__m128i)this->value,
                         simd8<uint8_t>(original_tmp));
  }
};

template <typename T> struct simd8x64 {
  static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
  static_assert(
      NUM_CHUNKS == 4,
      "LoongArch kernel should use four registers per 64-byte block.");
  simd8<T> chunks[NUM_CHUNKS];

  simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
  simd8x64<T> &
  operator=(const simd8<T> other) = delete; // no assignment allowed
  simd8x64() = delete;                      // no default constructor allowed

  simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1,
                                 const simd8<T> chunk2, const simd8<T> chunk3)
      : chunks{chunk0, chunk1, chunk2, chunk3} {}
  simdutf_really_inline simd8x64(const T *ptr)
      : chunks{simd8<T>::load(ptr),
               simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T)),
               simd8<T>::load(ptr + 2 * sizeof(simd8<T>) / sizeof(T)),
               simd8<T>::load(ptr + 3 * sizeof(simd8<T>) / sizeof(T))} {}

  simdutf_really_inline void store(T *ptr) const {
    this->chunks[0].store(ptr + sizeof(simd8<T>) * 0 / sizeof(T));
    this->chunks[1].store(ptr + sizeof(simd8<T>) * 1 / sizeof(T));
    this->chunks[2].store(ptr + sizeof(simd8<T>) * 2 / sizeof(T));
    this->chunks[3].store(ptr + sizeof(simd8<T>) * 3 / sizeof(T));
  }

  simdutf_really_inline simd8x64<T> &operator|=(const simd8x64<T> &other) {
    this->chunks[0] |= other.chunks[0];
    this->chunks[1] |= other.chunks[1];
    this->chunks[2] |= other.chunks[2];
    this->chunks[3] |= other.chunks[3];
    return *this;
  }

  simdutf_really_inline simd8<T> reduce_or() const {
    return (this->chunks[0] | this->chunks[1]) |
           (this->chunks[2] | this->chunks[3]);
  }

  simdutf_really_inline bool is_ascii() const { return reduce_or().is_ascii(); }

  template <endianness endian>
  simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
    this->chunks[0].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 0);
    this->chunks[1].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 1);
    this->chunks[2].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 2);
    this->chunks[3].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 3);
  }

  simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const {
    this->chunks[0].store_ascii_as_utf32_tbl(ptr + sizeof(simd8<T>) * 0);
    this->chunks[1].store_ascii_as_utf32_tbl(ptr + sizeof(simd8<T>) * 1);
    this->chunks[2].store_ascii_as_utf32_tbl(ptr + sizeof(simd8<T>) * 2);
    this->chunks[3].store_ascii_as_utf32_tbl(ptr + sizeof(simd8<T>) * 3);
  }

  simdutf_really_inline uint64_t to_bitmask() const {
    __m128i mask = __lsx_vbsll_v(__lsx_vmsknz_b(this->chunks[3]), 6);
    mask = __lsx_vor_v(mask, __lsx_vbsll_v(__lsx_vmsknz_b(this->chunks[2]), 4));
    mask = __lsx_vor_v(mask, __lsx_vbsll_v(__lsx_vmsknz_b(this->chunks[1]), 2));
    mask = __lsx_vor_v(mask, __lsx_vmsknz_b(this->chunks[0]));
    return __lsx_vpickve2gr_du(mask, 0);
  }
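  // Layout, for illustration only: __lsx_vmsknz_b packs one bit per byte into
  // the low 16 bits of the vector, and the byte shifts by 2, 4 and 6 bytes
  // place the masks of chunks 1, 2 and 3 at bit offsets 16, 32 and 48, so the
  // returned uint64_t carries one bit per input byte, chunk 0 in the low bits.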

  simdutf_really_inline uint64_t lt(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] < mask, this->chunks[1] < mask,
                          this->chunks[2] < mask, this->chunks[3] < mask)
        .to_bitmask();
  }
  simdutf_really_inline uint64_t gt(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] > mask, this->chunks[1] > mask,
                          this->chunks[2] > mask, this->chunks[3] > mask)
        .to_bitmask();
  }
  simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
    const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
    return simd8x64<bool>(simd8<uint8_t>(this->chunks[0].value) >= mask,
                          simd8<uint8_t>(this->chunks[1].value) >= mask,
                          simd8<uint8_t>(this->chunks[2].value) >= mask,
                          simd8<uint8_t>(this->chunks[3].value) >= mask)
        .to_bitmask();
  }
}; // struct simd8x64<T>

/* begin file src/simdutf/lsx/simd16-inl.h */
template <typename T> struct simd16;

template <typename T, typename Mask = simd16<bool>> struct base_u16 {
  __m128i value;
  static const size_t SIZE = sizeof(value);
  static const size_t ELEMENTS = sizeof(value) / sizeof(T);

  // Conversion from/to SIMD register
  simdutf_really_inline base_u16() = default;
  simdutf_really_inline base_u16(const __m128i _value) : value(_value) {}
  // Bit operations
  simdutf_really_inline simd16<T> operator|(const simd16<T> other) const {
    return __lsx_vor_v(this->value, other.value);
  }
  simdutf_really_inline simd16<T> operator&(const simd16<T> other) const {
    return __lsx_vand_v(this->value, other.value);
  }
  simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFu; }

  friend simdutf_really_inline Mask operator==(const simd16<T> lhs,
                                               const simd16<T> rhs) {
    return __lsx_vseq_h(lhs.value, rhs.value);
  }

  template <int N = 1>
  simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
    return __lsx_vor_v(__lsx_vbsll_v(*this, N * 2),
                       __lsx_vbsrl_v(prev_chunk, 16 - N * 2));
  }
};

template <typename T, typename Mask = simd16<bool>>
struct base16 : base_u16<T> {
  simdutf_really_inline base16() : base_u16<T>() {}
  simdutf_really_inline base16(const __m128i _value) : base_u16<T>(_value) {}
  template <typename Pointer>
  simdutf_really_inline base16(const Pointer *ptr)
      : base16(__lsx_vld(ptr, 0)) {}

  static const int SIZE = sizeof(base_u16<T>::value);

  template <int N = 1>
  simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
    return __lsx_vor_v(__lsx_vbsll_v(*this, N * 2),
                       __lsx_vbsrl_v(prev_chunk, 16 - N * 2));
  }
};

// SIMD byte mask type (returned by things like eq and gt)
template <> struct simd16<bool> : base16<bool> {
  static simdutf_really_inline simd16<bool> splat(bool _value) {
    return __lsx_vreplgr2vr_h(uint16_t(-(!!_value)));
  }

  simdutf_really_inline simd16() : base16() {}
  simdutf_really_inline simd16(const __m128i _value) : base16<bool>(_value) {}
};

template <typename T> struct base16_numeric : base16<T> {
  static simdutf_really_inline simd16<T> splat(T _value) {
    return __lsx_vreplgr2vr_h(_value);
  }
  static simdutf_really_inline simd16<T> zero() { return __lsx_vldi(0); }

  template <typename Pointer>
  static simdutf_really_inline simd16<T> load(const Pointer values) {
    return __lsx_vld(values, 0);
  }

  simdutf_really_inline base16_numeric(const __m128i _value)
      : base16<T>(_value) {}

  // Store to array
  simdutf_really_inline void store(T dst[8]) const {
    return __lsx_vst(this->value, dst, 0);
  }

  // Override to distinguish from bool version
  simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFu; }
};

// Unsigned code units
template <> struct simd16<uint16_t> : base16_numeric<uint16_t> {
  simdutf_really_inline simd16(const __m128i _value)
      : base16_numeric<uint16_t>((__m128i)_value) {}

  // Splat constructor
  simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}

  // Array constructor
  simdutf_really_inline simd16(const uint16_t *values) : simd16(load(values)) {}
  simdutf_really_inline simd16(const char16_t *values)
      : simd16(load(reinterpret_cast<const uint16_t *>(values))) {}

  // Copy constructor
  simdutf_really_inline simd16(const simd16<bool> mask) : simd16(mask.value) {}

  // Order-specific operations
  simdutf_really_inline simd16 &operator+=(const simd16 other) {
    value = __lsx_vadd_h(value, other.value);
    return *this;
  }

  template <unsigned N>
  static simdutf_really_inline simd8<uint8_t>
  pack_shifted_right(const simd16<uint16_t> &v0, const simd16<uint16_t> &v1) {
    return __lsx_vssrlni_bu_h(v1.value, v0.value, N);
  }

  // Pack two vectors of uint16_t code units into a single uint8_t vector,
  // using unsigned saturation.
  static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t> &v0,
                                                   const simd16<uint16_t> &v1) {
    return pack_shifted_right<0>(v0, v1);
  }
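  // Worked example, for illustration only: packing code units 0x0041 and
  // 0x0142 yields bytes 0x41 and 0xFF; values above 0xFF saturate to 0xFF
  // instead of being truncated, which keeps non-ASCII lanes detectable after
  // the narrowing.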

  // Change the endianness
  simdutf_really_inline simd16<uint16_t> swap_bytes() const {
    return __lsx_vshuf4i_b(this->value, 0b10110001);
  }
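  // Illustration only: the immediate 0b10110001 selects bytes (1, 0, 3, 2)
  // within every 32-bit group, i.e. it swaps the two bytes of each 16-bit
  // code unit, so 0x1234 becomes 0x3412 (a UTF-16 byte-order change).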

  simdutf_really_inline uint64_t sum() const {
    const auto sum_u32 = __lsx_vhaddw_wu_hu(value, value);
    const auto sum_u64 = __lsx_vhaddw_du_wu(sum_u32, sum_u32);

    return uint64_t(__lsx_vpickve2gr_du(sum_u64, 0)) +
           uint64_t(__lsx_vpickve2gr_du(sum_u64, 1));
  }
};

template <typename T> struct simd16x32 {
  static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
  static_assert(
      NUM_CHUNKS == 4,
      "LOONGARCH kernel should use four registers per 64-byte block.");
  simd16<T> chunks[NUM_CHUNKS];

  simd16x32(const simd16x32<T> &o) = delete; // no copy allowed
  simd16x32<T> &
  operator=(const simd16<T> other) = delete; // no assignment allowed
  simd16x32() = delete;                      // no default constructor allowed

  simdutf_really_inline
  simd16x32(const simd16<T> chunk0, const simd16<T> chunk1,
            const simd16<T> chunk2, const simd16<T> chunk3)
      : chunks{chunk0, chunk1, chunk2, chunk3} {}
  simdutf_really_inline simd16x32(const T *ptr)
      : chunks{simd16<T>::load(ptr),
               simd16<T>::load(ptr + sizeof(simd16<T>) / sizeof(T)),
               simd16<T>::load(ptr + 2 * sizeof(simd16<T>) / sizeof(T)),
               simd16<T>::load(ptr + 3 * sizeof(simd16<T>) / sizeof(T))} {}

  simdutf_really_inline void store(T *ptr) const {
    this->chunks[0].store(ptr + sizeof(simd16<T>) * 0 / sizeof(T));
    this->chunks[1].store(ptr + sizeof(simd16<T>) * 1 / sizeof(T));
    this->chunks[2].store(ptr + sizeof(simd16<T>) * 2 / sizeof(T));
    this->chunks[3].store(ptr + sizeof(simd16<T>) * 3 / sizeof(T));
  }

  simdutf_really_inline void swap_bytes() {
    this->chunks[0] = this->chunks[0].swap_bytes();
    this->chunks[1] = this->chunks[1].swap_bytes();
    this->chunks[2] = this->chunks[2].swap_bytes();
    this->chunks[3] = this->chunks[3].swap_bytes();
  }
}; // struct simd16x32<T>

simdutf_really_inline simd16<uint16_t> operator^(const simd16<uint16_t> a,
                                                 uint16_t b) {
  const auto bv = __lsx_vreplgr2vr_h(b);
  return __lsx_vxor_v(a.value, bv);
}

simdutf_really_inline simd16<uint16_t> min(const simd16<uint16_t> a,
                                           const simd16<uint16_t> b) {
  return __lsx_vmin_hu(a.value, b.value);
}
/* end file src/simdutf/lsx/simd16-inl.h */
/* begin file src/simdutf/lsx/simd32-inl.h */
template <typename T> struct simd32;

template <> struct simd32<uint32_t> {
  __m128i value;
  static const int SIZE = sizeof(value);
  static const int ELEMENTS = SIZE / sizeof(uint32_t);

  // constructors
  simdutf_really_inline simd32(__m128i v) : value(v) {}

  template <typename Ptr>
  simdutf_really_inline simd32(Ptr *ptr) : value(__lsx_vld(ptr, 0)) {}

  // in-place operators
  simdutf_really_inline simd32 &operator-=(const simd32 other) {
    value = __lsx_vsub_w(value, other.value);
    return *this;
  }

  // members
  simdutf_really_inline uint64_t sum() const {
    return uint64_t(__lsx_vpickve2gr_wu(value, 0)) +
           uint64_t(__lsx_vpickve2gr_wu(value, 1)) +
           uint64_t(__lsx_vpickve2gr_wu(value, 2)) +
           uint64_t(__lsx_vpickve2gr_wu(value, 3));
  }

  // static members
  static simdutf_really_inline simd32<uint32_t> splat(uint32_t x) {
    return __lsx_vreplgr2vr_w(x);
  }

  static simdutf_really_inline simd32<uint32_t> zero() {
    return __lsx_vrepli_w(0);
  }
};

// ------------------------------------------------------------

template <> struct simd32<bool> {
  __m128i value;
  static const int SIZE = sizeof(value);

  // constructors
  simdutf_really_inline simd32(__m128i v) : value(v) {}
};

// ------------------------------------------------------------

simdutf_really_inline simd32<uint32_t> operator&(const simd32<uint32_t> a,
                                                 const simd32<uint32_t> b) {
  return __lsx_vand_v(a.value, b.value);
}

simdutf_really_inline simd32<bool> operator<(const simd32<uint32_t> a,
                                             const simd32<uint32_t> b) {
  return __lsx_vslt_wu(a.value, b.value);
}

simdutf_really_inline simd32<bool> operator>(const simd32<uint32_t> a,
                                             const simd32<uint32_t> b) {
  return __lsx_vslt_wu(b.value, a.value);
}

// ------------------------------------------------------------

simdutf_really_inline simd32<uint32_t> as_vector_u32(const simd32<bool> v) {
  return v.value;
}
/* end file src/simdutf/lsx/simd32-inl.h */
/* begin file src/simdutf/lsx/simd64-inl.h */
template <typename T> struct simd64;

template <> struct simd64<uint64_t> {
  __m128i value;
  static const int SIZE = sizeof(value);
  static const int ELEMENTS = SIZE / sizeof(uint64_t);

  // constructors
  simdutf_really_inline simd64(__m128i v) : value(v) {}

  template <typename Ptr>
  simdutf_really_inline simd64(Ptr *ptr) : value(__lsx_vld(ptr, 0)) {}

  // in-place operators
  simdutf_really_inline simd64 &operator+=(const simd64 other) {
    value = __lsx_vadd_d(value, other.value);
    return *this;
  }

  // members
  simdutf_really_inline uint64_t sum() const {
    return uint64_t(__lsx_vpickve2gr_du(value, 0)) +
           uint64_t(__lsx_vpickve2gr_du(value, 1));
  }

  // static members
  static simdutf_really_inline simd64<uint64_t> zero() {
    return __lsx_vrepli_d(0);
  }
};

// ------------------------------------------------------------

template <> struct simd64<bool> {
  __m128i value;
  static const int SIZE = sizeof(value);

  // constructors
  simdutf_really_inline simd64(__m128i v) : value(v) {}
};

// ------------------------------------------------------------

simd64<uint64_t> sum_8bytes(const simd8<uint8_t> v) {
  const auto sum_u16 = __lsx_vhaddw_hu_bu(v, v);
  const auto sum_u32 = __lsx_vhaddw_wu_hu(sum_u16, sum_u16);
  const auto sum_u64 = __lsx_vhaddw_du_wu(sum_u32, sum_u32);

  return simd64<uint64_t>(sum_u64);
}
/* end file src/simdutf/lsx/simd64-inl.h */

} // namespace simd
} // unnamed namespace
} // namespace lsx
} // namespace simdutf

#endif // SIMDUTF_LSX_SIMD_H
/* end file src/simdutf/lsx/simd.h */

/* begin file src/simdutf/lsx/end.h */
#undef SIMDUTF_SIMD_HAS_UNSIGNED_CMP
/* end file src/simdutf/lsx/end.h */

#endif // SIMDUTF_IMPLEMENTATION_LSX

#endif // SIMDUTF_LSX_H
/* end file src/simdutf/lsx.h */
/* begin file src/simdutf/lasx.h */
#ifndef SIMDUTF_LASX_H
#define SIMDUTF_LASX_H

#ifdef SIMDUTF_FALLBACK_H
#error "lasx.h must be included before fallback.h"
#endif


#ifndef SIMDUTF_IMPLEMENTATION_LASX
#define SIMDUTF_IMPLEMENTATION_LASX (SIMDUTF_IS_LASX)
#endif
#if SIMDUTF_IMPLEMENTATION_LASX && SIMDUTF_IS_LASX
#define SIMDUTF_CAN_ALWAYS_RUN_LASX 1
#else
#define SIMDUTF_CAN_ALWAYS_RUN_LASX 0
#endif

#define SIMDUTF_CAN_ALWAYS_RUN_FALLBACK (SIMDUTF_IMPLEMENTATION_FALLBACK)

#if SIMDUTF_IMPLEMENTATION_LASX

namespace simdutf {
/**
 * Implementation for LoongArch ASX.
 */
namespace lasx {} // namespace lasx
} // namespace simdutf

/* begin file src/simdutf/lasx/implementation.h */
#ifndef SIMDUTF_LASX_IMPLEMENTATION_H
#define SIMDUTF_LASX_IMPLEMENTATION_H


namespace simdutf {
namespace lasx {

namespace {
using namespace simdutf;
}

class implementation final : public simdutf::implementation {
public:
  simdutf_really_inline implementation()
      : simdutf::implementation("lasx", "LOONGARCH ASX",
                                internal::instruction_set::LSX |
                                    internal::instruction_set::LASX) {}
#if SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused int detect_encodings(const char *input,
                                           size_t length) const noexcept final;
#endif // SIMDUTF_FEATURE_DETECT_ENCODING
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused bool validate_utf8(const char *buf,
                                         size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
#if SIMDUTF_FEATURE_UTF8
  simdutf_warn_unused result
  validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8
#if SIMDUTF_FEATURE_ASCII
  simdutf_warn_unused bool validate_ascii(const char *buf,
                                          size_t len) const noexcept final;
  simdutf_warn_unused result
  validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_ASCII

#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
                                            size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
#if SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
                                            size_t len) const noexcept final;
  simdutf_warn_unused result validate_utf16le_with_errors(
      const char16_t *buf, size_t len) const noexcept final;
  simdutf_warn_unused result validate_utf16be_with_errors(
      const char16_t *buf, size_t len) const noexcept final;
  void to_well_formed_utf16be(const char16_t *input, size_t len,
                              char16_t *output) const noexcept final;
  void to_well_formed_utf16le(const char16_t *input, size_t len,
                              char16_t *output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused bool validate_utf32(const char32_t *buf,
                                          size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
#if SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused result validate_utf32_with_errors(
      const char32_t *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_latin1_to_utf8(
      const char *buf, size_t len, char *utf8_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_latin1_to_utf16le(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_latin1_to_utf16be(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_latin1_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
      const char *buf, size_t len, char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t convert_utf8_to_utf16le(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused size_t convert_utf8_to_utf16be(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t convert_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  convert_utf16le_to_latin1(const char16_t *buf, size_t len,
                            char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf16be_to_latin1(const char16_t *buf, size_t len,
                            char *latin1_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
      const char16_t *buf, size_t len,
      char *latin1_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
      const char16_t *buf, size_t len,
      char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len,
                                  char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len,
                                  char *latin1_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t convert_utf16le_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_utf16be_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  convert_utf32_to_latin1(const char32_t *buf, size_t len,
                          char *latin1_output) const noexcept final;
  simdutf_warn_unused result
  convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
                                      char *latin1_output) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_latin1(const char32_t *buf, size_t len,
                                char *latin1_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t convert_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  convert_utf32_to_utf16le(const char32_t *buf, size_t len,
                           char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf32_to_utf16be(const char32_t *buf, size_t len,
                           char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
      const char32_t *buf, size_t len,
      char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
      const char32_t *buf, size_t len,
      char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len,
                                 char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len,
                                 char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf16le_to_utf32(const char16_t *buf, size_t len,
                           char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf16be_to_utf32(const char16_t *buf, size_t len,
                           char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
      const char16_t *buf, size_t len,
      char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
      const char16_t *buf, size_t len,
      char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len,
                                 char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len,
                                 char32_t *utf32_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF16
  void change_endianness_utf16(const char16_t *buf, size_t length,
                               char16_t *output) const noexcept final;
  simdutf_warn_unused size_t count_utf16le(const char16_t *buf,
                                           size_t length) const noexcept;
  simdutf_warn_unused size_t count_utf16be(const char16_t *buf,
                                           size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF8
  simdutf_warn_unused size_t count_utf8(const char *buf,
                                        size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t
  utf8_length_from_utf16le(const char16_t *input, size_t length) const noexcept;
  simdutf_warn_unused size_t
  utf8_length_from_utf16be(const char16_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t utf32_length_from_utf16le(
      const char16_t *input, size_t length) const noexcept;
  simdutf_warn_unused size_t utf32_length_from_utf16be(
      const char16_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t
  utf16_length_from_utf8(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  utf8_length_from_utf32(const char32_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  utf16_length_from_utf32(const char32_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  utf32_length_from_utf8(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  latin1_length_from_utf8(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  utf8_length_from_latin1(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_BASE64
  simdutf_warn_unused result base64_to_binary(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept;
  simdutf_warn_unused result
  base64_to_binary(const char16_t *input, size_t length, char *output,
                   base64_options options,
                   last_chunk_handling_options last_chunk_options =
                       last_chunk_handling_options::loose) const noexcept;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept;
  size_t binary_to_base64(const char *input, size_t length, char *output,
                          base64_options options) const noexcept;
#endif // SIMDUTF_FEATURE_BASE64
};

} // namespace lasx
} // namespace simdutf

#endif // SIMDUTF_LASX_IMPLEMENTATION_H
/* end file src/simdutf/lasx/implementation.h */

/* begin file src/simdutf/lasx/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "lasx"
// #define SIMDUTF_IMPLEMENTATION lasx
#define SIMDUTF_SIMD_HAS_UNSIGNED_CMP 1
/* end file src/simdutf/lasx/begin.h */

// Declarations
/* begin file src/simdutf/lasx/intrinsics.h */
#ifndef SIMDUTF_LASX_INTRINSICS_H
#define SIMDUTF_LASX_INTRINSICS_H


// This should be the correct header whether
// you use visual studio or other compilers.
#include <lsxintrin.h>
#include <lasxintrin.h>

#if defined(__loongarch_asx)
#ifdef __clang__
#define VREGS_PREFIX "$vr"
#define XREGS_PREFIX "$xr"
#else // GCC
#define VREGS_PREFIX "$f"
#define XREGS_PREFIX "$f"
#endif
#define __ALL_REGS                                                             \
  "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,"   \
  "27,28,29,30,31"
// Convert __m128i to __m256i
static inline __m256i ____m256i(__m128i in) {
  __m256i out = __lasx_xvldi(0);
  __asm__ volatile(".irp i," __ALL_REGS "\n\t"
                   " .ifc %[out], " XREGS_PREFIX "\\i \n\t"
                   " .irp j," __ALL_REGS "\n\t"
                   " .ifc %[in], " VREGS_PREFIX "\\j \n\t"
                   " xvpermi.q $xr\\i, $xr\\j, 0x0 \n\t"
                   " .endif \n\t"
                   " .endr \n\t"
                   " .endif \n\t"
                   ".endr \n\t"
                   : [out] "+f"(out)
                   : [in] "f"(in));
  return out;
}
// Convert two __m128i to __m256i
static inline __m256i lasx_set_q(__m128i inhi, __m128i inlo) {
  __m256i out;
  __asm__ volatile(".irp i," __ALL_REGS "\n\t"
                   " .ifc %[hi], " VREGS_PREFIX "\\i \n\t"
                   " .irp j," __ALL_REGS "\n\t"
                   " .ifc %[lo], " VREGS_PREFIX "\\j \n\t"
                   " xvpermi.q $xr\\i, $xr\\j, 0x20 \n\t"
                   " .endif \n\t"
                   " .endr \n\t"
                   " .endif \n\t"
                   ".endr \n\t"
                   ".ifnc %[out], %[hi] \n\t"
                   ".irp i," __ALL_REGS "\n\t"
                   " .ifc %[out], " XREGS_PREFIX "\\i \n\t"
                   " .irp j," __ALL_REGS "\n\t"
                   " .ifc %[hi], " VREGS_PREFIX "\\j \n\t"
                   " xvori.b $xr\\i, $xr\\j, 0 \n\t"
                   " .endif \n\t"
                   " .endr \n\t"
                   " .endif \n\t"
                   ".endr \n\t"
                   ".endif \n\t"
                   : [out] "=f"(out), [hi] "+f"(inhi)
                   : [lo] "f"(inlo));
  return out;
}
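// Usage sketch, for illustration only:
//   __m256i v = lasx_set_q(hi, lo);
// places `lo` in the low 128 bits of `v` and `hi` in the high 128 bits. The
// .irp/.ifc loops above match whichever registers the compiler assigned so
// that a single xvpermi.q instruction can be emitted without pinning the
// operands to fixed registers.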
// Convert __m256i low part to __m128i
static inline __m128i lasx_extracti128_lo(__m256i in) {
  __m128i out;
  __asm__ volatile(".ifnc %[out], %[in] \n\t"
                   ".irp i," __ALL_REGS "\n\t"
                   " .ifc %[out], " VREGS_PREFIX "\\i \n\t"
                   " .irp j," __ALL_REGS "\n\t"
                   " .ifc %[in], " XREGS_PREFIX "\\j \n\t"
                   " vori.b $vr\\i, $vr\\j, 0 \n\t"
                   " .endif \n\t"
                   " .endr \n\t"
                   " .endif \n\t"
                   ".endr \n\t"
                   ".endif \n\t"
                   : [out] "=f"(out)
                   : [in] "f"(in));
  return out;
}
// Convert __m256i high part to __m128i
static inline __m128i lasx_extracti128_hi(__m256i in) {
  __m128i out;
  __asm__ volatile(".irp i," __ALL_REGS "\n\t"
                   " .ifc %[out], " VREGS_PREFIX "\\i \n\t"
                   " .irp j," __ALL_REGS "\n\t"
                   " .ifc %[in], " XREGS_PREFIX "\\j \n\t"
                   " xvpermi.q $xr\\i, $xr\\j, 0x11 \n\t"
                   " .endif \n\t"
                   " .endr \n\t"
                   " .endif \n\t"
                   ".endr \n\t"
                   : [out] "=f"(out)
                   : [in] "f"(in));
  return out;
}
#endif

/*
Encoding of argument for LoongArch64 xvldi instruction. See:
https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/misc/#__m256i-__lasx_xvldi-imm_n1024_1023-imm

1: imm[12:8]=0b10000: broadcast imm[7:0] as 32-bit elements to all lanes

2: imm[12:8]=0b10001: broadcast imm[7:0] << 8 as 32-bit elements to all lanes

3: imm[12:8]=0b10010: broadcast imm[7:0] << 16 as 32-bit elements to all lanes

4: imm[12:8]=0b10011: broadcast imm[7:0] << 24 as 32-bit elements to all lanes

5: imm[12:8]=0b10100: broadcast imm[7:0] as 16-bit elements to all lanes

6: imm[12:8]=0b10101: broadcast imm[7:0] << 8 as 16-bit elements to all lanes

7: imm[12:8]=0b10110: broadcast (imm[7:0] << 8) | 0xFF as 32-bit elements to all
lanes

8: imm[12:8]=0b10111: broadcast (imm[7:0] << 16) | 0xFFFF as 32-bit elements to
all lanes

9: imm[12:8]=0b11000: broadcast imm[7:0] as 8-bit elements to all lanes

10: imm[12:8]=0b11001: repeat each bit of imm[7:0] eight times, and broadcast
the result as 64-bit elements to all lanes
*/

namespace lasx_vldi {

template <uint16_t v> class const_u16 {
  constexpr static const uint8_t b0 = ((v >> 0 * 8) & 0xff);
  constexpr static const uint8_t b1 = ((v >> 1 * 8) & 0xff);

  constexpr static bool is_case5 = uint16_t(b0) == v;
  constexpr static bool is_case6 = (uint16_t(b1) << 8) == v;
  constexpr static bool is_case9 = (b0 == b1);
  constexpr static bool is_case10 =
      ((b0 == 0xff) || (b0 == 0x00)) && ((b1 == 0xff) || (b1 == 0x00));

public:
  constexpr static uint16_t operation = is_case5    ? 0b10100
                                        : is_case6  ? 0b10101
                                        : is_case9  ? 0b11000
                                        : is_case10 ? 0b11001
                                                    : 0xffff;

  constexpr static uint16_t byte =
      is_case5    ? b0
      : is_case6  ? b1
      : is_case9  ? b0
      : is_case10 ? ((b0 ? 0x55 : 0x00) | (b1 ? 0xaa : 0x00))
                  : 0xffff;

  constexpr static int value = int((operation << 8) | byte) - 8192;
  constexpr static bool valid = operation != 0xffff;
};

template <uint32_t v> class const_u32 {
  constexpr static const uint8_t b0 = (v & 0xff);
  constexpr static const uint8_t b1 = ((v >> 8) & 0xff);
  constexpr static const uint8_t b2 = ((v >> 16) & 0xff);
  constexpr static const uint8_t b3 = ((v >> 24) & 0xff);

  constexpr static bool is_case1 = (uint32_t(b0) == v);
  constexpr static bool is_case2 = ((uint32_t(b1) << 8) == v);
  constexpr static bool is_case3 = ((uint32_t(b2) << 16) == v);
  constexpr static bool is_case4 = ((uint32_t(b3) << 24) == v);
  constexpr static bool is_case5 = (b0 == b2) && (b1 == 0) && (b3 == 0);
  constexpr static bool is_case6 = (b1 == b3) && (b0 == 0) && (b2 == 0);
  constexpr static bool is_case7 = (b3 == 0) && (b2 == 0) && (b0 == 0xff);
  constexpr static bool is_case8 = (b3 == 0) && (b1 == 0xff) && (b0 == 0xff);
  constexpr static bool is_case9 = (b0 == b1) && (b0 == b2) && (b0 == b3);
  constexpr static bool is_case10 =
      ((b0 == 0xff) || (b0 == 0x00)) && ((b1 == 0xff) || (b1 == 0x00)) &&
      ((b2 == 0xff) || (b2 == 0x00)) && ((b3 == 0xff) || (b3 == 0x00));

public:
  constexpr static uint16_t operation = is_case1    ? 0b10000
                                        : is_case2  ? 0b10001
                                        : is_case3  ? 0b10010
                                        : is_case4  ? 0b10011
                                        : is_case5  ? 0b10100
                                        : is_case6  ? 0b10101
                                        : is_case7  ? 0b10110
                                        : is_case8  ? 0b10111
                                        : is_case9  ? 0b11000
                                        : is_case10 ? 0b11001
                                                    : 0xffff;

  constexpr static uint16_t byte =
      is_case1    ? b0
      : is_case2  ? b1
      : is_case3  ? b2
      : is_case4  ? b3
      : is_case5  ? b0
      : is_case6  ? b1
      : is_case7  ? b1
      : is_case8  ? b2
      : is_case9  ? b0
      : is_case10 ? ((b0 ? 0x11 : 0x00) | (b1 ? 0x22 : 0x00) |
                     (b2 ? 0x44 : 0x00) | (b3 ? 0x88 : 0x00))
                  : 0xffff;

  constexpr static int value = int((operation << 8) | byte) - 8192;
  constexpr static bool valid = operation != 0xffff;
};

template <uint64_t v> class const_u64 {
  constexpr static const uint8_t b0 = ((v >> 0 * 8) & 0xff);
  constexpr static const uint8_t b1 = ((v >> 1 * 8) & 0xff);
  constexpr static const uint8_t b2 = ((v >> 2 * 8) & 0xff);
  constexpr static const uint8_t b3 = ((v >> 3 * 8) & 0xff);
  constexpr static const uint8_t b4 = ((v >> 4 * 8) & 0xff);
  constexpr static const uint8_t b5 = ((v >> 5 * 8) & 0xff);
  constexpr static const uint8_t b6 = ((v >> 6 * 8) & 0xff);
  constexpr static const uint8_t b7 = ((v >> 7 * 8) & 0xff);

  constexpr static bool is_case10 =
      ((b0 == 0xff) || (b0 == 0x00)) && ((b1 == 0xff) || (b1 == 0x00)) &&
      ((b2 == 0xff) || (b2 == 0x00)) && ((b3 == 0xff) || (b3 == 0x00)) &&
      ((b4 == 0xff) || (b4 == 0x00)) && ((b5 == 0xff) || (b5 == 0x00)) &&
      ((b6 == 0xff) || (b6 == 0x00)) && ((b7 == 0xff) || (b7 == 0x00));

public:
  constexpr static bool is_32bit =
      ((v & 0xffffffff) == (v >> 32)) && const_u32<(v >> 32)>::valid;
  constexpr static uint8_t op_32bit = const_u32<(v >> 32)>::operation;
  constexpr static uint8_t byte_32bit = const_u32<(v >> 32)>::byte;

  constexpr static uint16_t operation = is_32bit    ? op_32bit
                                        : is_case10 ? 0b11001
                                                    : 0xffff;

  constexpr static uint16_t byte =
      is_32bit ? byte_32bit
      : is_case10
          ? ((b0 ? 0x01 : 0x00) | (b1 ? 0x02 : 0x00) | (b2 ? 0x04 : 0x00) |
             (b3 ? 0x08 : 0x00) | (b4 ? 0x10 : 0x00) | (b5 ? 0x20 : 0x00) |
             (b6 ? 0x40 : 0x00) | (b7 ? 0x80 : 0x00))
          : 0xffff;

  constexpr static int value = int((operation << 8) | byte) - 8192;
  constexpr static bool valid = operation != 0xffff;
};

} // namespace lasx_vldi
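
// Worked examples, for illustration only (not exercised by the library):
//   const_u32<0x2f2f2f2f>: every byte is equal, so case 9 applies with
//                          operation == 0b11000 and byte == 0x2f;
//   const_u32<0x0000ffff>: matches case 7, (0xff << 8) | 0xFF, with
//                          operation == 0b10110 and byte == 0xff;
//   const_u32<0x12345678>: no case applies, valid == false, so callers
//                          must fall back to a general-purpose register splat.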

// Uncomment when running under QEMU affected
// by bug https://gitlab.com/qemu-project/qemu/-/issues/2865
// Versions <= 9.2.2 are affected, likely anything newer is correct.
#ifndef QEMU_VLDI_BUG
// #define QEMU_VLDI_BUG 1
#endif

#ifdef QEMU_VLDI_BUG
#define lasx_splat_u16(v) __lasx_xvreplgr2vr_h(v)
#define lasx_splat_u32(v) __lasx_xvreplgr2vr_w(v)
#else
template <uint16_t x> constexpr __m256i lasx_splat_u16_aux() {
  constexpr bool is_imm10 = (int16_t(x) < 512) && (int16_t(x) > -512);
  constexpr uint16_t imm10 = is_imm10 ? x : 0;
  constexpr bool is_vldi = lasx_vldi::const_u16<x>::valid;
  constexpr int vldi_imm = is_vldi ? lasx_vldi::const_u16<x>::value : 0;

  return is_imm10  ? __lasx_xvrepli_h(int16_t(imm10))
         : is_vldi ? __lasx_xvldi(vldi_imm)
                   : __lasx_xvreplgr2vr_h(x);
}

template <uint32_t x> constexpr __m256i lasx_splat_u32_aux() {
  constexpr bool is_imm10 = (int32_t(x) < 512) && (int32_t(x) > -512);
  constexpr uint32_t imm10 = is_imm10 ? x : 0;
  constexpr bool is_vldi = lasx_vldi::const_u32<x>::valid;
  constexpr int vldi_imm = is_vldi ? lasx_vldi::const_u32<x>::value : 0;

  return is_imm10  ? __lasx_xvrepli_w(int32_t(imm10))
         : is_vldi ? __lasx_xvldi(vldi_imm)
                   : __lasx_xvreplgr2vr_w(x);
}

#define lasx_splat_u16(v) lasx_splat_u16_aux<(v)>()
#define lasx_splat_u32(v) lasx_splat_u32_aux<(v)>()
#endif // QEMU_VLDI_BUG
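
// Usage sketch, for illustration only, with the templated path above:
// lasx_splat_u32(0x80808080) can be encoded as a single xvldi (case 9,
// byte 0x80), lasx_splat_u32(511) fits the 10-bit immediate of
// __lasx_xvrepli_w, and any other constant falls back to
// __lasx_xvreplgr2vr_w through a general-purpose register.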
|
|
|
|
#endif // SIMDUTF_LASX_INTRINSICS_H
|
|
/* end file src/simdutf/lasx/intrinsics.h */
|
|
/* begin file src/simdutf/lasx/bitmanipulation.h */
|
|
#ifndef SIMDUTF_LASX_BITMANIPULATION_H
|
|
#define SIMDUTF_LASX_BITMANIPULATION_H
|
|
|
|
#include <limits>
|
|
|
|
namespace simdutf {
|
|
namespace lasx {
|
|
namespace {
|
|
|
|
simdutf_really_inline int count_ones(uint64_t input_num) {
|
|
return __lsx_vpickve2gr_w(__lsx_vpcnt_d(__lsx_vreplgr2vr_d(input_num)), 0);
|
|
}
|
|
|
|
#if SIMDUTF_NEED_TRAILING_ZEROES
|
|
// simdutf_really_inline int trailing_zeroes(uint64_t input_num) {
|
|
// return __builtin_ctzll(input_num);
|
|
// }
|
|
#endif
|
|
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdutf
|
|
|
|
#endif // SIMDUTF_LASX_BITMANIPULATION_H
|
|
/* end file src/simdutf/lasx/bitmanipulation.h */
|
|
/* begin file src/simdutf/lasx/simd.h */
|
|
#ifndef SIMDUTF_LASX_SIMD_H
|
|
#define SIMDUTF_LASX_SIMD_H
|
|
|
|
|
|
namespace simdutf {
|
|
namespace lasx {
|
|
namespace {
|
|
namespace simd {
|
|
|
|
__attribute__((aligned(32))) static const uint8_t prev_shuf_table[32][32] = {
|
|
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
|
|
{0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
|
|
31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
|
|
{0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
|
|
30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13},
|
|
{0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
|
|
29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
|
|
{0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
|
|
28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
|
|
{0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
|
|
27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
|
|
{0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
|
|
26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
|
|
{0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8,
|
|
25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8},
|
|
{0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7,
|
|
24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7},
|
|
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6,
|
|
23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6},
|
|
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5,
|
|
22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5},
|
|
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4,
|
|
21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4},
|
|
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3,
|
|
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3},
|
|
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2,
|
|
19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2},
|
|
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
|
|
18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1},
|
|
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0},
|
|
{15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
|
|
15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
|
{14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
|
|
14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
|
{13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
|
|
13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
|
{12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
|
|
12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
|
{11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
|
|
11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
|
{10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
|
|
10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
|
{9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
|
|
9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
|
{8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
|
|
8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0},
|
|
{7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
|
|
7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0},
|
|
{6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
|
|
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0},
|
|
{5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
|
|
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0},
|
|
{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
|
|
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0},
|
|
{3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
|
|
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0},
|
|
{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
|
|
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0},
|
|
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
|
|
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0},
|
|
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
|
|
};
|
|
|
|
__attribute__((aligned(32))) static const uint8_t bitsel_mask_table[32][32] = {
|
|
{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
|
|
{0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
|
|
{0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
|
|
{0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
|
|
{0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
|
|
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
|
|
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
|
|
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
|
|
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
|
|
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
|
|
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
|
|
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
|
|
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
|
|
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
|
|
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
|
|
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
|
|
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
|
|
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
|
|
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
|
|
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
|
|
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
|
|
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
|
|
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
|
|
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
|
|
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
|
|
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
|
|
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
|
|
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0},
|
|
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0},
|
|
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0},
|
|
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0},
|
|
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
|
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0}};
|
|
|
|
// Forward-declared so they can be used by splat and friends.
|
|
template <typename Child> struct base {
|
|
__m256i value;
|
|
|
|
// Zero constructor
|
|
simdutf_really_inline base() : value{__m256i()} {}
|
|
|
|
// Conversion from SIMD register
|
|
simdutf_really_inline base(const __m256i _value) : value(_value) {}
|
|
// Conversion to SIMD register
|
|
simdutf_really_inline operator const __m256i &() const { return this->value; }
|
|
simdutf_really_inline operator __m256i &() { return this->value; }
|
|
template <endianness big_endian>
|
|
simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
|
|
if (big_endian) {
|
|
__m256i zero = __lasx_xvldi(0);
|
|
__m256i in8 = __lasx_xvpermi_d(this->value, 0b11011000);
|
|
__m256i inlow = __lasx_xvilvl_b(in8, zero);
|
|
__m256i inhigh = __lasx_xvilvh_b(in8, zero);
|
|
__lasx_xvst(inlow, reinterpret_cast<uint16_t *>(ptr), 0);
|
|
__lasx_xvst(inhigh, reinterpret_cast<uint16_t *>(ptr), 32);
|
|
} else {
|
|
__m256i inlow = __lasx_vext2xv_hu_bu(this->value);
|
|
__m256i inhigh = __lasx_vext2xv_hu_bu(
|
|
__lasx_xvpermi_q(this->value, this->value, 0b00000001));
|
|
__lasx_xvst(inlow, reinterpret_cast<__m256i *>(ptr), 0);
|
|
__lasx_xvst(inhigh, reinterpret_cast<__m256i *>(ptr), 32);
|
|
}
|
|
}
|
|
simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const {
|
|
__m256i in32_0 = __lasx_vext2xv_wu_bu(this->value);
|
|
__lasx_xvst(in32_0, reinterpret_cast<uint32_t *>(ptr), 0);
|
|
|
|
__m256i in8_1 = __lasx_xvpermi_d(this->value, 0b00000001);
|
|
__m256i in32_1 = __lasx_vext2xv_wu_bu(in8_1);
|
|
__lasx_xvst(in32_1, reinterpret_cast<uint32_t *>(ptr), 32);
|
|
|
|
__m256i in8_2 = __lasx_xvpermi_d(this->value, 0b00000010);
|
|
__m256i in32_2 = __lasx_vext2xv_wu_bu(in8_2);
|
|
__lasx_xvst(in32_2, reinterpret_cast<uint32_t *>(ptr), 64);
|
|
|
|
__m256i in8_3 = __lasx_xvpermi_d(this->value, 0b00000011);
|
|
__m256i in32_3 = __lasx_vext2xv_wu_bu(in8_3);
|
|
__lasx_xvst(in32_3, reinterpret_cast<uint32_t *>(ptr), 96);
|
|
}
|
|
// Bit operations
|
|
simdutf_really_inline Child operator|(const Child other) const {
|
|
return __lasx_xvor_v(this->value, other);
|
|
}
|
|
simdutf_really_inline Child operator&(const Child other) const {
|
|
return __lasx_xvand_v(this->value, other);
|
|
}
|
|
simdutf_really_inline Child operator^(const Child other) const {
|
|
return __lasx_xvxor_v(this->value, other);
|
|
}
|
|
simdutf_really_inline Child &operator|=(const Child other) {
|
|
auto this_cast = static_cast<Child *>(this);
|
|
*this_cast = *this_cast | other;
|
|
return *this_cast;
|
|
}
|
|
};
|
|
|
|
template <typename T> struct simd8;

template <typename T, typename Mask = simd8<bool>>
struct base8 : base<simd8<T>> {
  simdutf_really_inline base8() : base<simd8<T>>() {}
  simdutf_really_inline base8(const __m256i _value) : base<simd8<T>>(_value) {}
  friend simdutf_really_inline Mask operator==(const simd8<T> lhs,
                                               const simd8<T> rhs) {
    return __lasx_xvseq_b(lhs, rhs);
  }

  static const int SIZE = sizeof(base<T>::value);

  template <unsigned N = 1>
  simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
    static_assert(N <= 16, "unsupported shift value");

    if (!N)
      return this->value;

    __m256i zero = __lasx_xvldi(0);
    __m256i result, shuf;
    if (N < 16) {
      shuf = __lasx_xvld(prev_shuf_table[N], 0);

      result = __lasx_xvshuf_b(
          __lasx_xvpermi_q(this->value, this->value, 0b00000001), this->value,
          shuf);
      __m256i srl_prev = __lasx_xvbsrl_v(
          __lasx_xvpermi_q(zero, prev_chunk.value, 0b00110001), (16 - N));
      __m256i mask = __lasx_xvld(bitsel_mask_table[N], 0);
      result = __lasx_xvbitsel_v(result, srl_prev, mask);

      return result;
    } else if (N == 16) {
      return __lasx_xvpermi_q(this->value, prev_chunk.value, 0b00100001);
    }
  }
};

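// Note (added commentary): prev<N>() yields, for each byte, the byte that
// sits N positions earlier in the stream, pulling the last N bytes of
// prev_chunk in at the front. LASX has no single cross-lane byte shift, so
// the N < 16 path rebuilds one from a lane permute (xvpermi_q), a byte
// shuffle against the precomputed prev_shuf_table, and a bit-select that
// patches in the tail of the previous chunk via bitsel_mask_table. UTF-8
// validation uses prev<1>/prev<2>/prev<3> to inspect up to three
// predecessor bytes in parallel.
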
// SIMD byte mask type (returned by things like eq and gt)
template <> struct simd8<bool> : base8<bool> {
  static simdutf_really_inline simd8<bool> splat(bool _value) {
    return __lasx_xvreplgr2vr_b(uint8_t(-(!!_value)));
  }

  simdutf_really_inline simd8() : base8() {}
  simdutf_really_inline simd8(const __m256i _value) : base8<bool>(_value) {}
  // Splat constructor
  simdutf_really_inline simd8(bool _value) : base8<bool>(splat(_value)) {}

  simdutf_really_inline uint32_t to_bitmask() const {
    __m256i mask = __lasx_xvmsknz_b(this->value);
    uint32_t mask0 = __lasx_xvpickve2gr_wu(mask, 0);
    uint32_t mask1 = __lasx_xvpickve2gr_wu(mask, 4);
    return (mask0 | (mask1 << 16));
  }
  simdutf_really_inline bool any() const {
    if (__lasx_xbz_b(this->value))
      return false;
    return true;
  }
  simdutf_really_inline simd8<bool> operator~() const { return *this ^ true; }
};

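// Note (added commentary): to_bitmask() compresses the 32 byte lanes into a
// 32-bit mask, one bit per byte. __lasx_xvmsknz_b produces a 16-bit
// "byte is nonzero" mask per 128-bit half, stored in the first 32-bit word
// of that half (words 0 and 4 of the 256-bit register), so the two halves
// are extracted and stitched together as mask0 | (mask1 << 16). Bit i of the
// result corresponds to byte i of the vector.
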
template <typename T> struct base8_numeric : base8<T> {
  static simdutf_really_inline simd8<T> splat(T _value) {
    return __lasx_xvreplgr2vr_b(_value);
  }
  static simdutf_really_inline simd8<T> zero() { return __lasx_xvldi(0); }
  static simdutf_really_inline simd8<T> load(const T values[32]) {
    return __lasx_xvld(reinterpret_cast<const __m256i *>(values), 0);
  }
  // Repeat 16 values as many times as necessary (usually for lookup tables)
  static simdutf_really_inline simd8<T> repeat_16(T v0, T v1, T v2, T v3, T v4,
                                                  T v5, T v6, T v7, T v8, T v9,
                                                  T v10, T v11, T v12, T v13,
                                                  T v14, T v15) {
    return simd8<T>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
                    v14, v15, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
                    v12, v13, v14, v15);
  }

  simdutf_really_inline base8_numeric() : base8<T>() {}
  simdutf_really_inline base8_numeric(const __m256i _value)
      : base8<T>(_value) {}

  // Store to array
  simdutf_really_inline void store(T dst[32]) const {
    return __lasx_xvst(this->value, reinterpret_cast<__m256i *>(dst), 0);
  }

  // Override to distinguish from bool version
  simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }

  // Perform a lookup assuming the value is between 0 and 16 (undefined behavior
  // for out of range values)
  template <typename L>
  simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
    __m256i origin = __lasx_xvand_v(this->value, __lasx_xvldi(0x1f));
    return __lasx_xvshuf_b(__lasx_xvldi(0), lookup_table, origin);
  }

  template <typename L>
  simdutf_really_inline simd8<L>
  lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
            L replace5, L replace6, L replace7, L replace8, L replace9,
            L replace10, L replace11, L replace12, L replace13, L replace14,
            L replace15) const {
    return lookup_16(simd8<L>::repeat_16(
        replace0, replace1, replace2, replace3, replace4, replace5, replace6,
        replace7, replace8, replace9, replace10, replace11, replace12,
        replace13, replace14, replace15));
  }
};

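// Note (added commentary): lookup_16 is the classic SIMD "pshufb" table
// lookup. repeat_16 duplicates a 16-entry table into both 128-bit lanes, and
// __lasx_xvshuf_b then selects lookup_table[index] independently for each of
// the 32 bytes, where the index is masked to stay in range. The UTF-8
// validator uses this to classify 32 byte values against small range tables
// in a single instruction.
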
// Signed bytes
template <> struct simd8<int8_t> : base8_numeric<int8_t> {
  simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
  simdutf_really_inline simd8(const __m256i _value)
      : base8_numeric<int8_t>(_value) {}

  // Splat constructor
  simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
  // Array constructor
  simdutf_really_inline simd8(const int8_t values[32]) : simd8(load(values)) {}
  simdutf_really_inline operator simd8<uint8_t>() const;
  simdutf_really_inline bool is_ascii() const {
    __m256i ascii_mask = __lasx_xvslti_b(this->value, 0);
    if (__lasx_xbnz_v(ascii_mask))
      return false;
    return true;
  }
  // Order-sensitive comparisons
  simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const {
    return __lasx_xvslt_b(other, this->value);
  }
  simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const {
    return __lasx_xvslt_b(this->value, other);
  }
};

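// Note (added commentary): is_ascii() relies on ASCII being exactly the
// bytes whose top bit is clear. __lasx_xvslti_b(v, 0) sets a lane to
// all-ones when the signed byte is negative (top bit set), and
// __lasx_xbnz_v tests "any bit set anywhere", so the whole 32-byte block is
// ASCII iff the comparison mask is entirely zero.
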
// Unsigned bytes
template <> struct simd8<uint8_t> : base8_numeric<uint8_t> {
  simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
  simdutf_really_inline simd8(const __m256i _value)
      : base8_numeric<uint8_t>(_value) {}
  // Splat constructor
  simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
  // Array constructor
  simdutf_really_inline simd8(const uint8_t values[32]) : simd8(load(values)) {}
  // Member-by-member initialization
  simdutf_really_inline
  simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
        uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
        uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15,
        uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20,
        uint8_t v21, uint8_t v22, uint8_t v23, uint8_t v24, uint8_t v25,
        uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30,
        uint8_t v31)
      : simd8((__m256i)v32u8{v0, v1, v2, v3, v4, v5, v6, v7,
                             v8, v9, v10, v11, v12, v13, v14, v15,
                             v16, v17, v18, v19, v20, v21, v22, v23,
                             v24, v25, v26, v27, v28, v29, v30, v31}) {}

  // Saturated math
  simdutf_really_inline simd8<uint8_t>
  saturating_sub(const simd8<uint8_t> other) const {
    return __lasx_xvssub_bu(this->value, other);
  }

  // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
  simdutf_really_inline simd8<uint8_t>
  gt_bits(const simd8<uint8_t> other) const {
    return this->saturating_sub(other);
  }
  simdutf_really_inline simd8<bool>
  operator>=(const simd8<uint8_t> other) const {
    return __lasx_xvsle_bu(other, *this);
  }
  simdutf_really_inline simd8 &operator-=(const simd8<uint8_t> other) {
    value = __lasx_xvsub_b(value, other.value);
    return *this;
  }

  // Bit-specific operations
  simdutf_really_inline bool is_ascii() const {
    __m256i ascii_mask = __lasx_xvslti_b(this->value, 0);
    if (__lasx_xbnz_v(ascii_mask))
      return false;
    return true;
  }
  simdutf_really_inline bool any_bits_set_anywhere() const {
    if (__lasx_xbnz_v(this->value))
      return true;
    return false;
  }
  template <int N> simdutf_really_inline simd8<uint8_t> shr() const {
    return __lasx_xvsrli_b(this->value, N);
  }
  template <int N> simdutf_really_inline simd8<uint8_t> shl() const {
    return __lasx_xvslli_b(this->value, N);
  }

  simdutf_really_inline uint64_t sum_bytes() const {
    const auto sum_u16 = __lasx_xvhaddw_hu_bu(value, value);
    const auto sum_u32 = __lasx_xvhaddw_wu_hu(sum_u16, sum_u16);
    const auto sum_u64 = __lasx_xvhaddw_du_wu(sum_u32, sum_u32);

    return uint64_t(__lasx_xvpickve2gr_du(sum_u64, 0)) +
           uint64_t(__lasx_xvpickve2gr_du(sum_u64, 1)) +
           uint64_t(__lasx_xvpickve2gr_du(sum_u64, 2)) +
           uint64_t(__lasx_xvpickve2gr_du(sum_u64, 3));
  }
};
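
// Note (added commentary): gt_bits uses unsigned saturating subtraction as a
// cheap comparison: max(a - b, 0) is nonzero exactly when a > b, e.g.
// 0xF5 saturating_sub 0xF0 = 0x05 (truthy) while 0xF0 saturating_sub 0xF5
// = 0x00. Callers that only need "is any lane greater?" can feed the result
// to any_bits_set_anywhere() without materializing a full -1/0 mask.
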
simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const {
  return this->value;
}

template <typename T> struct simd8x64 {
  static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
  static_assert(NUM_CHUNKS == 2,
                "LASX kernel should use two registers per 64-byte block.");
  simd8<T> chunks[NUM_CHUNKS];

  simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
  simd8x64<T> &
  operator=(const simd8<T> other) = delete; // no assignment allowed
  simd8x64() = delete;                      // no default constructor allowed

  simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1)
      : chunks{chunk0, chunk1} {}
  simdutf_really_inline simd8x64(const T *ptr)
      : chunks{simd8<T>::load(ptr),
               simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T))} {}

  simdutf_really_inline void store(T *ptr) const {
    this->chunks[0].store(ptr + sizeof(simd8<T>) * 0 / sizeof(T));
    this->chunks[1].store(ptr + sizeof(simd8<T>) * 1 / sizeof(T));
  }

  simdutf_really_inline uint64_t to_bitmask() const {
    uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
    uint64_t r_hi = this->chunks[1].to_bitmask();
    return r_lo | (r_hi << 32);
  }

  simdutf_really_inline simd8x64<T> &operator|=(const simd8x64<T> &other) {
    this->chunks[0] |= other.chunks[0];
    this->chunks[1] |= other.chunks[1];
    return *this;
  }

  simdutf_really_inline simd8<T> reduce_or() const {
    return this->chunks[0] | this->chunks[1];
  }

  simdutf_really_inline bool is_ascii() const {
    return this->reduce_or().is_ascii();
  }

  template <endianness endian>
  simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
    this->chunks[0].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 0);
    this->chunks[1].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 1);
  }

  simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const {
    this->chunks[0].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 0);
    this->chunks[1].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 1);
  }

  simdutf_really_inline uint64_t lt(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] < mask, this->chunks[1] < mask)
        .to_bitmask();
  }

  simdutf_really_inline uint64_t gt(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] > mask, this->chunks[1] > mask)
        .to_bitmask();
  }
  simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
    const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
    return simd8x64<bool>((simd8<uint8_t>(__m256i(this->chunks[0])) >= mask),
                          (simd8<uint8_t>(__m256i(this->chunks[1])) >= mask))
        .to_bitmask();
  }
}; // struct simd8x64<T>

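// Note (added commentary): simd8x64 is the 64-byte working unit shared by
// the simdutf kernels; on LASX it is two 256-bit registers. Predicates such
// as lt()/gt() return a single uint64_t in which bit i describes byte i of
// the block, so the scalar driver can scan results with ordinary bit tricks.
// For example, on signed input, in.lt(-64) sets one bit per UTF-8
// continuation byte (0x80-0xBF), since those read as -128..-65 in two's
// complement.
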
/* begin file src/simdutf/lasx/simd16-inl.h */
template <typename T> struct simd16;

template <typename T, typename Mask = simd16<bool>>
struct base16 : base<simd16<T>> {
  using bitmask_type = uint32_t;

  simdutf_really_inline base16() : base<simd16<T>>() {}
  simdutf_really_inline base16(const __m256i _value)
      : base<simd16<T>>(_value) {}
  template <typename Pointer>
  simdutf_really_inline base16(const Pointer *ptr)
      : base16(__lasx_xvld(reinterpret_cast<const __m256i *>(ptr), 0)) {}

  /// the size of vector in bytes
  static const int SIZE = sizeof(base<simd16<T>>::value);

  /// the number of elements of type T a vector can hold
  static const int ELEMENTS = SIZE / sizeof(T);
};

// SIMD byte mask type (returned by things like eq and gt)
template <> struct simd16<bool> : base16<bool> {
  static simdutf_really_inline simd16<bool> splat(bool _value) {
    return __lasx_xvreplgr2vr_h(uint8_t(-(!!_value)));
  }

  simdutf_really_inline simd16() : base16() {}
  simdutf_really_inline simd16(const __m256i _value) : base16<bool>(_value) {}
  // Splat constructor
  simdutf_really_inline simd16(bool _value) : base16<bool>(splat(_value)) {}

  simdutf_really_inline bitmask_type to_bitmask() const {
    __m256i mask = __lasx_xvmsknz_b(this->value);
    bitmask_type mask0 = __lasx_xvpickve2gr_wu(mask, 0);
    bitmask_type mask1 = __lasx_xvpickve2gr_wu(mask, 4);
    return (mask0 | (mask1 << 16));
  }
  simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; }
};

template <typename T> struct base16_numeric : base16<T> {
  static simdutf_really_inline simd16<T> splat(T _value) {
    return __lasx_xvreplgr2vr_h((uint16_t)_value);
  }
  static simdutf_really_inline simd16<T> zero() { return __lasx_xvldi(0); }
  template <typename Pointer>
  static simdutf_really_inline simd16<T> load(const Pointer values) {
    return __lasx_xvld(values, 0);
  }

  simdutf_really_inline base16_numeric() : base16<T>() {}
  simdutf_really_inline base16_numeric(const __m256i _value)
      : base16<T>(_value) {}

  // Store to array
  simdutf_really_inline void store(T dst[8]) const {
    return __lasx_xvst(this->value, reinterpret_cast<__m256i *>(dst), 0);
  }

  // Override to distinguish from bool version
  simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFFFu; }
};

// Unsigned code units
template <> struct simd16<uint16_t> : base16_numeric<uint16_t> {
  simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
  simdutf_really_inline simd16(const __m256i _value)
      : base16_numeric<uint16_t>(_value) {}

  // Splat constructor
  simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}

  // Array constructor
  simdutf_really_inline simd16(const uint16_t *values) : simd16(load(values)) {}
  simdutf_really_inline simd16(const char16_t *values)
      : simd16(load(reinterpret_cast<const uint16_t *>(values))) {}

  // Order-specific operations
  simdutf_really_inline simd16 &operator+=(const simd16 other) {
    value = __lasx_xvadd_h(value, other.value);
    return *this;
  }

  // Change the endianness
  simdutf_really_inline simd16<uint16_t> swap_bytes() const {
    return __lasx_xvshuf4i_b(this->value, 0b10110001);
  }

  template <unsigned N>
  static simdutf_really_inline simd8<uint8_t>
  pack_shifted_right(const simd16<uint16_t> &v0, const simd16<uint16_t> &v1) {
    return __lasx_xvpermi_d(__lasx_xvssrlni_bu_h(v1.value, v0.value, N),
                            0b11011000);
  }

  // Pack with the unsigned saturation of two uint16_t code units into single
  // uint8_t vector
  static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t> &v0,
                                                   const simd16<uint16_t> &v1) {

    return pack_shifted_right<0>(v0, v1);
  }

  simdutf_really_inline uint64_t sum() const {
    const auto sum_u32 = __lasx_xvhaddw_wu_hu(value, value);
    const auto sum_u64 = __lasx_xvhaddw_du_wu(sum_u32, sum_u32);

    return uint64_t(__lasx_xvpickve2gr_du(sum_u64, 0)) +
           uint64_t(__lasx_xvpickve2gr_du(sum_u64, 1)) +
           uint64_t(__lasx_xvpickve2gr_du(sum_u64, 2)) +
           uint64_t(__lasx_xvpickve2gr_du(sum_u64, 3));
  }
};

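// Note (added commentary): pack() narrows 32 uint16_t code units to 32 bytes
// with unsigned saturation. The narrowing step works within each 128-bit
// lane, which leaves the 64-bit groups of the result in the order
// v0-low, v1-low, v0-high, v1-high; the following cross-lane permute with
// selector 0b11011000 (quadword order 0, 2, 1, 3) restores source order so
// the output bytes match the input code-unit order.
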
template <typename T> struct simd16x32 {
  static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
  static_assert(NUM_CHUNKS == 2,
                "LASX kernel should use two registers per 64-byte block.");
  simd16<T> chunks[NUM_CHUNKS];

  simd16x32(const simd16x32<T> &o) = delete; // no copy allowed
  simd16x32<T> &
  operator=(const simd16<T> other) = delete; // no assignment allowed
  simd16x32() = delete;                      // no default constructor allowed

  simdutf_really_inline simd16x32(const simd16<T> chunk0,
                                  const simd16<T> chunk1)
      : chunks{chunk0, chunk1} {}
  simdutf_really_inline simd16x32(const T *ptr)
      : chunks{simd16<T>::load(ptr),
               simd16<T>::load(ptr + sizeof(simd16<T>) / sizeof(T))} {}

  simdutf_really_inline void store(T *ptr) const {
    this->chunks[0].store(ptr + sizeof(simd16<T>) * 0 / sizeof(T));
    this->chunks[1].store(ptr + sizeof(simd16<T>) * 1 / sizeof(T));
  }

  simdutf_really_inline void swap_bytes() {
    this->chunks[0] = this->chunks[0].swap_bytes();
    this->chunks[1] = this->chunks[1].swap_bytes();
  }
}; // struct simd16x32<T>

simdutf_really_inline simd16<uint16_t> min(const simd16<uint16_t> a,
                                           const simd16<uint16_t> b) {
  return __lasx_xvmin_hu(a.value, b.value);
}

simdutf_really_inline simd16<bool> operator==(const simd16<uint16_t> a,
                                              uint16_t b) {
  const auto bv = __lasx_xvreplgr2vr_h(b);
  return __lasx_xvseq_h(a.value, bv);
}
/* end file src/simdutf/lasx/simd16-inl.h */
/* begin file src/simdutf/lasx/simd32-inl.h */
template <typename T> struct simd32;

template <> struct simd32<uint32_t> {
  __m256i value;
  static const int SIZE = sizeof(value);
  static const int ELEMENTS = SIZE / sizeof(uint32_t);

  // constructors
  simdutf_really_inline simd32(__m256i v) : value(v) {}

  template <typename Ptr>
  simdutf_really_inline simd32(Ptr *ptr) : value(__lasx_xvld(ptr, 0)) {}

  // in-place operators
  simdutf_really_inline simd32 &operator-=(const simd32 other) {
    value = __lasx_xvsub_w(value, other.value);
    return *this;
  }

  // members
  simdutf_really_inline uint64_t sum() const {
    // split the odd- and even-indexed 32-bit lanes, add them as 64-bit
    // values, then reduce the four 64-bit lanes to a scalar
    const auto odd = __lasx_xvsrli_d(value, 32);
    const auto even = __lasx_xvand_v(value, __lasx_xvreplgr2vr_d(0xffffffff));

    const auto sum64 = __lasx_xvadd_d(odd, even);

    return uint64_t(__lasx_xvpickve2gr_du(sum64, 0)) +
           uint64_t(__lasx_xvpickve2gr_du(sum64, 1)) +
           uint64_t(__lasx_xvpickve2gr_du(sum64, 2)) +
           uint64_t(__lasx_xvpickve2gr_du(sum64, 3));
  }

  // static members
  static simdutf_really_inline simd32<uint32_t> splat(uint32_t x) {
    return __lasx_xvreplgr2vr_w(x);
  }

  static simdutf_really_inline simd32<uint32_t> zero() {
    return __lasx_xvrepli_w(0);
  }
};

// ------------------------------------------------------------

template <> struct simd32<bool> {
  __m256i value;
  static const int SIZE = sizeof(value);

  // constructors
  simdutf_really_inline simd32(__m256i v) : value(v) {}
};

// ------------------------------------------------------------

simdutf_really_inline simd32<uint32_t> operator&(const simd32<uint32_t> a,
                                                 const simd32<uint32_t> b) {
  return __lasx_xvand_v(a.value, b.value);
}

simdutf_really_inline simd32<bool> operator<(const simd32<uint32_t> a,
                                             const simd32<uint32_t> b) {
  return __lasx_xvslt_wu(a.value, b.value);
}

simdutf_really_inline simd32<bool> operator>(const simd32<uint32_t> a,
                                             const simd32<uint32_t> b) {
  return __lasx_xvslt_wu(b.value, a.value);
}

// ------------------------------------------------------------

simdutf_really_inline simd32<uint32_t> as_vector_u32(const simd32<bool> v) {
  return v.value;
}
/* end file src/simdutf/lasx/simd32-inl.h */
/* begin file src/simdutf/lasx/simd64-inl.h */
template <typename T> struct simd64;

template <> struct simd64<uint64_t> {
  __m256i value;
  static const int SIZE = sizeof(value);
  static const int ELEMENTS = SIZE / sizeof(uint64_t);

  // constructors
  simdutf_really_inline simd64(__m256i v) : value(v) {}

  template <typename Ptr>
  simdutf_really_inline simd64(Ptr *ptr) : value(__lasx_xvld(ptr, 0)) {}

  // in-place operators
  simdutf_really_inline simd64 &operator+=(const simd64 other) {
    value = __lasx_xvadd_d(value, other.value);
    return *this;
  }

  // members
  simdutf_really_inline uint64_t sum() const {
    return uint64_t(__lasx_xvpickve2gr_du(value, 0)) +
           uint64_t(__lasx_xvpickve2gr_du(value, 1)) +
           uint64_t(__lasx_xvpickve2gr_du(value, 2)) +
           uint64_t(__lasx_xvpickve2gr_du(value, 3));
  }

  // static members
  static simdutf_really_inline simd64<uint64_t> zero() {
    return __lasx_xvrepli_d(0);
  }
};

// ------------------------------------------------------------

template <> struct simd64<bool> {
  __m256i value;
  static const int SIZE = sizeof(value);

  // constructors
  simdutf_really_inline simd64(__m256i v) : value(v) {}
};

// ------------------------------------------------------------

simd64<uint64_t> sum_8bytes(const simd8<uint8_t> v) {
  const auto sum_u16 = __lasx_xvhaddw_hu_bu(v, v);
  const auto sum_u32 = __lasx_xvhaddw_wu_hu(sum_u16, sum_u16);
  const auto sum_u64 = __lasx_xvhaddw_du_wu(sum_u32, sum_u32);

  return simd64<uint64_t>(sum_u64);
}
/* end file src/simdutf/lasx/simd64-inl.h */

} // namespace simd
} // unnamed namespace
} // namespace lasx
} // namespace simdutf

#endif // SIMDUTF_LASX_SIMD_H
/* end file src/simdutf/lasx/simd.h */

/* begin file src/simdutf/lasx/end.h */
#undef SIMDUTF_SIMD_HAS_UNSIGNED_CMP
/* end file src/simdutf/lasx/end.h */

#endif // SIMDUTF_IMPLEMENTATION_LASX

#endif // SIMDUTF_LASX_H
/* end file src/simdutf/lasx.h */
/* begin file src/simdutf/fallback.h */
#ifndef SIMDUTF_FALLBACK_H
#define SIMDUTF_FALLBACK_H

// Note that fallback.h is always imported last.

// Default Fallback to on unless a builtin implementation has already been
// selected.
#ifndef SIMDUTF_IMPLEMENTATION_FALLBACK
  #if SIMDUTF_CAN_ALWAYS_RUN_ARM64 || SIMDUTF_CAN_ALWAYS_RUN_ICELAKE ||       \
      SIMDUTF_CAN_ALWAYS_RUN_HASWELL || SIMDUTF_CAN_ALWAYS_RUN_WESTMERE ||    \
      SIMDUTF_CAN_ALWAYS_RUN_PPC64 || SIMDUTF_CAN_ALWAYS_RUN_RVV ||           \
      SIMDUTF_CAN_ALWAYS_RUN_LSX || SIMDUTF_CAN_ALWAYS_RUN_LASX
    #define SIMDUTF_IMPLEMENTATION_FALLBACK 0
  #else
    #define SIMDUTF_IMPLEMENTATION_FALLBACK 1
  #endif
#endif

#define SIMDUTF_CAN_ALWAYS_RUN_FALLBACK (SIMDUTF_IMPLEMENTATION_FALLBACK)

#if SIMDUTF_IMPLEMENTATION_FALLBACK

namespace simdutf {
/**
 * Fallback implementation (runs on any machine).
 */
namespace fallback {} // namespace fallback
} // namespace simdutf

/* begin file src/simdutf/fallback/implementation.h */
#ifndef SIMDUTF_FALLBACK_IMPLEMENTATION_H
#define SIMDUTF_FALLBACK_IMPLEMENTATION_H

namespace simdutf {
namespace fallback {

namespace {
using namespace simdutf;
}

class implementation final : public simdutf::implementation {
public:
  simdutf_really_inline implementation()
      : simdutf::implementation("fallback", "Generic fallback implementation",
                                0) {}

#if SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused int detect_encodings(const char *input,
                                           size_t length) const noexcept final;
#endif // SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused bool validate_utf8(const char *buf,
                                         size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF8
  simdutf_warn_unused result
  validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_ASCII
  simdutf_warn_unused bool validate_ascii(const char *buf,
                                          size_t len) const noexcept final;
  simdutf_warn_unused result
  validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_ASCII

#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
                                            size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
                                            size_t len) const noexcept final;
  simdutf_warn_unused result validate_utf16le_with_errors(
      const char16_t *buf, size_t len) const noexcept final;
  simdutf_warn_unused result validate_utf16be_with_errors(
      const char16_t *buf, size_t len) const noexcept final;
  void to_well_formed_utf16be(const char16_t *input, size_t len,
                              char16_t *output) const noexcept final;
  void to_well_formed_utf16le(const char16_t *input, size_t len,
                              char16_t *output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused bool validate_utf32(const char32_t *buf,
                                          size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
#if SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused result validate_utf32_with_errors(
      const char32_t *buf, size_t len) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_latin1_to_utf8(
      const char *buf, size_t len, char *utf8_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_latin1_to_utf16le(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_latin1_to_utf16be(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_latin1_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
      const char *buf, size_t len, char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t convert_utf8_to_utf16le(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused size_t convert_utf8_to_utf16be(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t convert_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  convert_utf16le_to_latin1(const char16_t *buf, size_t len,
                            char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf16be_to_latin1(const char16_t *buf, size_t len,
                            char *latin1_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
      const char16_t *buf, size_t len,
      char *latin1_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
      const char16_t *buf, size_t len,
      char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len,
                                  char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len,
                                  char *latin1_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t convert_utf16le_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_utf16be_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t convert_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  convert_utf32_to_latin1(const char32_t *buf, size_t len,
                          char *latin1_output) const noexcept final;
  simdutf_warn_unused result
  convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
                                      char *latin1_output) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_latin1(const char32_t *buf, size_t len,
                                char *latin1_output) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  convert_utf32_to_utf16le(const char32_t *buf, size_t len,
                           char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf32_to_utf16be(const char32_t *buf, size_t len,
                           char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
      const char32_t *buf, size_t len,
      char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
      const char32_t *buf, size_t len,
      char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len,
                                 char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len,
                                 char16_t *utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf16le_to_utf32(const char16_t *buf, size_t len,
                           char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf16be_to_utf32(const char16_t *buf, size_t len,
                           char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
      const char16_t *buf, size_t len,
      char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
      const char16_t *buf, size_t len,
      char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len,
                                 char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len,
                                 char32_t *utf32_buffer) const noexcept final;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16
  void change_endianness_utf16(const char16_t *buf, size_t length,
                               char16_t *output) const noexcept final;
  simdutf_warn_unused size_t count_utf16le(const char16_t *buf,
                                           size_t length) const noexcept;
  simdutf_warn_unused size_t count_utf16be(const char16_t *buf,
                                           size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8
  simdutf_warn_unused size_t count_utf8(const char *buf,
                                        size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t
  utf8_length_from_utf16le(const char16_t *input, size_t length) const noexcept;
  simdutf_warn_unused size_t
  utf8_length_from_utf16be(const char16_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t utf32_length_from_utf16le(
      const char16_t *input, size_t length) const noexcept;
  simdutf_warn_unused size_t utf32_length_from_utf16be(
      const char16_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t
  utf16_length_from_utf8(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  utf8_length_from_utf32(const char32_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  utf16_length_from_utf32(const char32_t *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  utf32_length_from_utf8(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  latin1_length_from_utf8(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  utf8_length_from_latin1(const char *input, size_t length) const noexcept;
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_BASE64
  simdutf_warn_unused result base64_to_binary(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept;
  simdutf_warn_unused result
  base64_to_binary(const char16_t *input, size_t length, char *output,
                   base64_options options,
                   last_chunk_handling_options last_chunk_options =
                       last_chunk_handling_options::loose) const noexcept;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept;
  size_t binary_to_base64(const char *input, size_t length, char *output,
                          base64_options options) const noexcept;
#endif // SIMDUTF_FEATURE_BASE64
};
} // namespace fallback
} // namespace simdutf

#endif // SIMDUTF_FALLBACK_IMPLEMENTATION_H
/* end file src/simdutf/fallback/implementation.h */

/* begin file src/simdutf/fallback/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "fallback"
// #define SIMDUTF_IMPLEMENTATION fallback
/* end file src/simdutf/fallback/begin.h */

// Declarations
/* begin file src/simdutf/fallback/bitmanipulation.h */
#ifndef SIMDUTF_FALLBACK_BITMANIPULATION_H
#define SIMDUTF_FALLBACK_BITMANIPULATION_H

#include <limits>

namespace simdutf {
namespace fallback {
namespace {} // unnamed namespace
} // namespace fallback
} // namespace simdutf

#endif // SIMDUTF_FALLBACK_BITMANIPULATION_H
/* end file src/simdutf/fallback/bitmanipulation.h */

/* begin file src/simdutf/fallback/end.h */
/* end file src/simdutf/fallback/end.h */

#endif // SIMDUTF_IMPLEMENTATION_FALLBACK
#endif // SIMDUTF_FALLBACK_H
/* end file src/simdutf/fallback.h */

// The scalar routines should be included once.
/* begin file src/scalar/swap_bytes.h */
#ifndef SIMDUTF_SWAP_BYTES_H
#define SIMDUTF_SWAP_BYTES_H

namespace simdutf {
namespace scalar {

inline simdutf_warn_unused uint16_t u16_swap_bytes(const uint16_t word) {
  return uint16_t((word >> 8) | (word << 8));
}

inline simdutf_warn_unused uint32_t u32_swap_bytes(const uint32_t word) {
  return ((word >> 24) & 0xff) |      // move byte 3 to byte 0
         ((word << 8) & 0xff0000) |   // move byte 1 to byte 2
         ((word >> 8) & 0xff00) |     // move byte 2 to byte 1
         ((word << 24) & 0xff000000); // byte 0 to byte 3
}

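// Worked example (added commentary): u32_swap_bytes(0x11223344) == 0x44332211.
// Each masked shift moves exactly one byte: 0x11223344 >> 24 leaves 0x11 in
// byte 0, (0x11223344 << 8) & 0xff0000 places 0x33 in byte 2, and so on, so
// the four terms OR together into the byte-reversed word.
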
} // namespace scalar
} // namespace simdutf

#endif
/* end file src/scalar/swap_bytes.h */
#if SIMDUTF_FEATURE_ASCII
/* begin file src/scalar/ascii.h */
#ifndef SIMDUTF_ASCII_H
#define SIMDUTF_ASCII_H

namespace simdutf {
namespace scalar {
namespace {
namespace ascii {
#if SIMDUTF_IMPLEMENTATION_FALLBACK
// Only used by the fallback kernel.
inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept {
  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
  uint64_t pos = 0;
  // process in blocks of 16 bytes when possible
  for (; pos + 16 <= len; pos += 16) {
    uint64_t v1;
    std::memcpy(&v1, data + pos, sizeof(uint64_t));
    uint64_t v2;
    std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
    uint64_t v{v1 | v2};
    if ((v & 0x8080808080808080) != 0) {
      return false;
    }
  }
  // process the tail byte-by-byte
  for (; pos < len; pos++) {
    if (data[pos] >= 0b10000000) {
      return false;
    }
  }
  return true;
}
#endif

inline simdutf_warn_unused result validate_with_errors(const char *buf,
                                                       size_t len) noexcept {
  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
  size_t pos = 0;
  // process in blocks of 16 bytes when possible
  for (; pos + 16 <= len; pos += 16) {
    uint64_t v1;
    std::memcpy(&v1, data + pos, sizeof(uint64_t));
    uint64_t v2;
    std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
    uint64_t v{v1 | v2};
    if ((v & 0x8080808080808080) != 0) {
      for (; pos < len; pos++) {
        if (data[pos] >= 0b10000000) {
          return result(error_code::TOO_LARGE, pos);
        }
      }
    }
  }
  // process the tail byte-by-byte
  for (; pos < len; pos++) {
    if (data[pos] >= 0b10000000) {
      return result(error_code::TOO_LARGE, pos);
    }
  }
  return result(error_code::SUCCESS, pos);
}

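// Note (added commentary): both routines exploit the fact that a buffer is
// ASCII iff no byte has its top bit set. ORing two 8-byte words and testing
// against the mask 0x8080808080808080 checks 16 bytes per iteration with two
// loads and two ALU operations; only when the test fires (or in the tail)
// does the code fall back to a byte-by-byte scan to locate the offender.
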
} // namespace ascii
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif
/* end file src/scalar/ascii.h */
#endif // SIMDUTF_FEATURE_ASCII
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
/* begin file src/scalar/utf8.h */
#ifndef SIMDUTF_UTF8_H
#define SIMDUTF_UTF8_H

namespace simdutf {
namespace scalar {
namespace {
namespace utf8 {
#if SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_RVV
// only used by the fallback kernel.
// credit: based on code from Google Fuchsia (Apache Licensed)
inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept {
  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
  uint64_t pos = 0;
  uint32_t code_point = 0;
  while (pos < len) {
    // check if the next 16 bytes are ascii.
    uint64_t next_pos = pos + 16;
    if (next_pos <=
        len) { // if it is safe to read 16 more bytes, check that they are ascii
      uint64_t v1;
      std::memcpy(&v1, data + pos, sizeof(uint64_t));
      uint64_t v2;
      std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
      uint64_t v{v1 | v2};
      if ((v & 0x8080808080808080) == 0) {
        pos = next_pos;
        continue;
      }
    }
    unsigned char byte = data[pos];

    while (byte < 0b10000000) {
      if (++pos == len) {
        return true;
      }
      byte = data[pos];
    }

    if ((byte & 0b11100000) == 0b11000000) {
      next_pos = pos + 2;
      if (next_pos > len) {
        return false;
      }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return false;
      }
      // range check
      code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
      if ((code_point < 0x80) || (0x7ff < code_point)) {
        return false;
      }
    } else if ((byte & 0b11110000) == 0b11100000) {
      next_pos = pos + 3;
      if (next_pos > len) {
        return false;
      }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return false;
      }
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
        return false;
      }
      // range check
      code_point = (byte & 0b00001111) << 12 |
                   (data[pos + 1] & 0b00111111) << 6 |
                   (data[pos + 2] & 0b00111111);
      if ((code_point < 0x800) || (0xffff < code_point) ||
          (0xd7ff < code_point && code_point < 0xe000)) {
        return false;
      }
    } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
      next_pos = pos + 4;
      if (next_pos > len) {
        return false;
      }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return false;
      }
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
        return false;
      }
      if ((data[pos + 3] & 0b11000000) != 0b10000000) {
        return false;
      }
      // range check
      code_point =
          (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
      if (code_point <= 0xffff || 0x10ffff < code_point) {
        return false;
      }
    } else {
      // we may have a continuation
      return false;
    }
    pos = next_pos;
  }
  return true;
}
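
// Worked example (added commentary): the validator dispatches on the leading
// byte's high bits. 'é' (U+00E9) encodes as 0xC3 0xA9: 0xC3 matches the
// two-byte pattern 110xxxxx, 0xA9 matches the continuation pattern 10xxxxxx,
// and the reassembled value (0b00011 << 6) | 0b101001 = 0xE9 passes the
// 0x80..0x7FF range check. Overlong forms such as 0xC0 0xA9 fail that same
// range check because they decode below 0x80.
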
#endif

inline simdutf_warn_unused result validate_with_errors(const char *buf,
                                                       size_t len) noexcept {
  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
  size_t pos = 0;
  uint32_t code_point = 0;
  while (pos < len) {
    // check if the next 16 bytes are ascii.
    size_t next_pos = pos + 16;
    if (next_pos <=
        len) { // if it is safe to read 16 more bytes, check that they are ascii
      uint64_t v1;
      std::memcpy(&v1, data + pos, sizeof(uint64_t));
      uint64_t v2;
      std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
      uint64_t v{v1 | v2};
      if ((v & 0x8080808080808080) == 0) {
        pos = next_pos;
        continue;
      }
    }
    unsigned char byte = data[pos];

    while (byte < 0b10000000) {
      if (++pos == len) {
        return result(error_code::SUCCESS, len);
      }
      byte = data[pos];
    }

    if ((byte & 0b11100000) == 0b11000000) {
      next_pos = pos + 2;
      if (next_pos > len) {
        return result(error_code::TOO_SHORT, pos);
      }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return result(error_code::TOO_SHORT, pos);
      }
      // range check
      code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
      if ((code_point < 0x80) || (0x7ff < code_point)) {
        return result(error_code::OVERLONG, pos);
      }
    } else if ((byte & 0b11110000) == 0b11100000) {
      next_pos = pos + 3;
      if (next_pos > len) {
        return result(error_code::TOO_SHORT, pos);
      }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return result(error_code::TOO_SHORT, pos);
      }
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
        return result(error_code::TOO_SHORT, pos);
      }
      // range check
      code_point = (byte & 0b00001111) << 12 |
                   (data[pos + 1] & 0b00111111) << 6 |
                   (data[pos + 2] & 0b00111111);
      if ((code_point < 0x800) || (0xffff < code_point)) {
        return result(error_code::OVERLONG, pos);
      }
      if (0xd7ff < code_point && code_point < 0xe000) {
        return result(error_code::SURROGATE, pos);
      }
    } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
      next_pos = pos + 4;
      if (next_pos > len) {
        return result(error_code::TOO_SHORT, pos);
      }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return result(error_code::TOO_SHORT, pos);
      }
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
        return result(error_code::TOO_SHORT, pos);
      }
      if ((data[pos + 3] & 0b11000000) != 0b10000000) {
        return result(error_code::TOO_SHORT, pos);
      }
      // range check
      code_point =
          (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
      if (code_point <= 0xffff) {
        return result(error_code::OVERLONG, pos);
      }
      if (0x10ffff < code_point) {
        return result(error_code::TOO_LARGE, pos);
      }
    } else {
      // we either have too many continuation bytes or an invalid leading byte
      if ((byte & 0b11000000) == 0b10000000) {
        return result(error_code::TOO_LONG, pos);
      } else {
        return result(error_code::HEADER_BITS, pos);
      }
    }
    pos = next_pos;
  }
  return result(error_code::SUCCESS, len);
}

// Finds the previous leading byte starting backward from buf and validates
// with errors from there. Used to pinpoint the location of an error when an
// invalid chunk is detected. We assume that the stream starts with a leading
// byte, and to check that it is the case, we ask that you pass a pointer to
// the start of the stream (start).
inline simdutf_warn_unused result rewind_and_validate_with_errors(
    const char *start, const char *buf, size_t len) noexcept {
  // First check that we start with a leading byte
  if ((*start & 0b11000000) == 0b10000000) {
    return result(error_code::TOO_LONG, 0);
  }
  size_t extra_len{0};
  // A leading byte cannot be further than 4 bytes away
  for (int i = 0; i < 5; i++) {
    unsigned char byte = *buf;
    if ((byte & 0b11000000) != 0b10000000) {
      break;
    } else {
      buf--;
      extra_len++;
    }
  }

  result res = validate_with_errors(buf, len + extra_len);
  res.count -= extra_len;
  return res;
}

inline size_t count_code_points(const char *buf, size_t len) {
  const int8_t *p = reinterpret_cast<const int8_t *>(buf);
  size_t counter{0};
  for (size_t i = 0; i < len; i++) {
    // -65 is 0b10111111; anything larger in two's complement should start a
    // new code point. (E.g. the continuation byte 0x90 reads as -112 <= -65,
    // while the lead byte 0xC3 reads as -61 > -65.)
    if (p[i] > -65) {
      counter++;
    }
  }
  return counter;
}

inline size_t utf16_length_from_utf8(const char *buf, size_t len) {
  const int8_t *p = reinterpret_cast<const int8_t *>(buf);
  size_t counter{0};
  for (size_t i = 0; i < len; i++) {
    if (p[i] > -65) {
      counter++;
    }
    if (uint8_t(p[i]) >= 240) {
      counter++;
    }
  }
  return counter;
}

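// Note (added commentary): every non-continuation byte starts a code point
// and contributes one UTF-16 code unit; a lead byte of 0xF0 or above (240)
// starts a four-byte sequence, i.e. a code point beyond U+FFFF, which needs
// a surrogate pair, hence the second increment. Example: U+1F600 is encoded
// as F0 9F 98 80 and is counted once as a lead byte and once for being
// >= 240, giving the two UTF-16 code units of its surrogate pair.
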
simdutf_warn_unused inline size_t trim_partial_utf8(const char *input,
                                                    size_t length) {
  if (length < 3) {
    switch (length) {
    case 2:
      if (uint8_t(input[length - 1]) >= 0xc0) {
        return length - 1;
      } // 2-, 3- and 4-byte characters with only 1 byte left
      if (uint8_t(input[length - 2]) >= 0xe0) {
        return length - 2;
      } // 3- and 4-byte characters with only 2 bytes left
      return length;
    case 1:
      if (uint8_t(input[length - 1]) >= 0xc0) {
        return length - 1;
      } // 2-, 3- and 4-byte characters with only 1 byte left
      return length;
    case 0:
      return length;
    }
  }
  if (uint8_t(input[length - 1]) >= 0xc0) {
    return length - 1;
  } // 2-, 3- and 4-byte characters with only 1 byte left
  if (uint8_t(input[length - 2]) >= 0xe0) {
    return length - 2;
  } // 3- and 4-byte characters with only 2 bytes left
  if (uint8_t(input[length - 3]) >= 0xf0) {
    return length - 3;
  } // 4-byte characters with only 3 bytes left
  return length;
}

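// Worked example (added commentary): trim_partial_utf8 shortens the buffer
// so it never ends mid-sequence. Given "abc" followed by the first two bytes
// of a three-byte character (e.g. E2 82 of U+20AC), input[length - 2] is
// 0xE2 >= 0xE0, so the function returns length - 2 and the caller can retry
// those two bytes once more input arrives.
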
} // namespace utf8
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif
/* end file src/scalar/utf8.h */
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING ||               \
    (SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1)
/* begin file src/scalar/utf16.h */
#ifndef SIMDUTF_UTF16_H
#define SIMDUTF_UTF16_H

namespace simdutf {
namespace scalar {
namespace {
namespace utf16 {

template <endianness big_endian>
inline simdutf_warn_unused bool validate(const char16_t *data,
                                         size_t len) noexcept {
  uint64_t pos = 0;
  while (pos < len) {
    char16_t word =
        !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
    if ((word & 0xF800) == 0xD800) {
      if (pos + 1 >= len) {
        return false;
      }
      char16_t diff = char16_t(word - 0xD800);
      if (diff > 0x3FF) {
        return false;
      }
      char16_t next_word = !match_system(big_endian)
                               ? u16_swap_bytes(data[pos + 1])
                               : data[pos + 1];
      char16_t diff2 = char16_t(next_word - 0xDC00);
      if (diff2 > 0x3FF) {
        return false;
      }
      pos += 2;
    } else {
      pos++;
    }
  }
  return true;
}

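// Note (added commentary): (word & 0xF800) == 0xD800 detects any surrogate
// with one mask, since surrogates occupy 0xD800-0xDFFF. The follow-up test
// word - 0xD800 <= 0x3FF narrows it to a high surrogate (0xD800-0xDBFF), and
// next_word - 0xDC00 <= 0x3FF demands a low surrogate (0xDC00-0xDFFF), so a
// lone or reversed surrogate fails. For instance, U+1F600 is the valid pair
// 0xD83D 0xDE00.
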
template <endianness big_endian>
inline simdutf_warn_unused result validate_with_errors(const char16_t *data,
                                                       size_t len) noexcept {
  size_t pos = 0;
  while (pos < len) {
    char16_t word =
        !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
    if ((word & 0xF800) == 0xD800) {
      if (pos + 1 >= len) {
        return result(error_code::SURROGATE, pos);
      }
      char16_t diff = char16_t(word - 0xD800);
      if (diff > 0x3FF) {
        return result(error_code::SURROGATE, pos);
      }
      char16_t next_word = !match_system(big_endian)
                               ? u16_swap_bytes(data[pos + 1])
                               : data[pos + 1];
      char16_t diff2 = uint16_t(next_word - 0xDC00);
      if (diff2 > 0x3FF) {
        return result(error_code::SURROGATE, pos);
      }
      pos += 2;
    } else {
      pos++;
    }
  }
  return result(error_code::SUCCESS, pos);
}

template <endianness big_endian>
inline size_t count_code_points(const char16_t *p, size_t len) {
  // We are not BOM aware.
  size_t counter{0};
  for (size_t i = 0; i < len; i++) {
    char16_t word = !match_system(big_endian) ? u16_swap_bytes(p[i]) : p[i];
    counter += ((word & 0xFC00) != 0xDC00);
  }
  return counter;
}

template <endianness big_endian>
inline size_t utf8_length_from_utf16(const char16_t *p, size_t len) {
  // We are not BOM aware.
  size_t counter{0};
  for (size_t i = 0; i < len; i++) {
    char16_t word = !match_system(big_endian) ? u16_swap_bytes(p[i]) : p[i];
    counter++; // ASCII
    counter += static_cast<size_t>(
        word >
        0x7F); // non-ASCII is at least 2 bytes, surrogates are 2*2 == 4 bytes
    counter += static_cast<size_t>((word > 0x7FF && word <= 0xD7FF) ||
                                   (word >= 0xE000)); // three-byte
  }
  return counter;
}

template <endianness big_endian>
inline size_t utf32_length_from_utf16(const char16_t *p, size_t len) {
  // We are not BOM aware.
  size_t counter{0};
  for (size_t i = 0; i < len; i++) {
    char16_t word = !match_system(big_endian) ? u16_swap_bytes(p[i]) : p[i];
    counter += ((word & 0xFC00) != 0xDC00);
  }
  return counter;
}

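// Note (added commentary): the branchless accounting in
// utf8_length_from_utf16 adds 1 for every code unit, 1 more when it is
// non-ASCII, and 1 more when it falls in the three-byte range (0x800-0xD7FF
// or 0xE000-0xFFFF). A surrogate pair therefore contributes 2 + 2 = 4 bytes
// across its two code units, matching the four-byte UTF-8 encoding of
// supplementary code points.
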
simdutf_really_inline void
change_endianness_utf16(const char16_t *input, size_t size, char16_t *output) {
  for (size_t i = 0; i < size; i++) {
    *output++ = char16_t(input[i] >> 8 | input[i] << 8);
  }
}

template <endianness big_endian>
simdutf_warn_unused inline size_t trim_partial_utf16(const char16_t *input,
                                                     size_t length) {
  if (length <= 1) {
    return length;
  }
  uint16_t last_word = uint16_t(input[length - 1]);
  last_word = !match_system(big_endian) ? u16_swap_bytes(last_word) : last_word;
  length -= ((last_word & 0xFC00) == 0xD800);
  return length;
}

template <endianness big_endian> bool is_high_surrogate(char16_t c) {
  c = !match_system(big_endian) ? u16_swap_bytes(c) : c;
  return (0xd800 <= c && c <= 0xdbff);
}

template <endianness big_endian> bool is_low_surrogate(char16_t c) {
  c = !match_system(big_endian) ? u16_swap_bytes(c) : c;
  return (0xdc00 <= c && c <= 0xdfff);
}

// variable templates are a C++14 extension
template <endianness big_endian> char16_t replacement() {
  return !match_system(big_endian) ? scalar::u16_swap_bytes(0xfffd) : 0xfffd;
}

template <endianness big_endian>
void to_well_formed_utf16(const char16_t *input, size_t len, char16_t *output) {
  const char16_t replacement = utf16::replacement<big_endian>();
  bool high_surrogate_prev = false, high_surrogate, low_surrogate;
  size_t i = 0;
  for (; i < len; i++) {
    char16_t c = input[i];
    high_surrogate = is_high_surrogate<big_endian>(c);
    low_surrogate = is_low_surrogate<big_endian>(c);
    if (high_surrogate_prev && !low_surrogate) {
      output[i - 1] = replacement;
    }

    if (!high_surrogate_prev && low_surrogate) {
      output[i] = replacement;
    } else {
      output[i] = input[i];
    }
    high_surrogate_prev = high_surrogate;
  }

  /* string may not end with high surrogate */
  if (high_surrogate_prev) {
    output[i - 1] = replacement;
  }
}

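// Note (added commentary): to_well_formed_utf16 replaces every unpaired
// surrogate with U+FFFD while keeping valid pairs intact: a high surrogate
// not followed by a low one is patched retroactively (output[i - 1]), a low
// surrogate without a preceding high one is replaced on the spot, and a
// trailing high surrogate is fixed after the loop. E.g. the sequence 0xD800
// 0x0041 becomes 0xFFFD 0x0041.
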
} // namespace utf16
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif
/* end file src/scalar/utf16.h */
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING ||
       // (SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1)
#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
/* begin file src/scalar/utf32.h */
#ifndef SIMDUTF_UTF32_H
#define SIMDUTF_UTF32_H

namespace simdutf {
namespace scalar {
namespace {
namespace utf32 {

inline simdutf_warn_unused bool validate(const char32_t *buf,
                                         size_t len) noexcept {
  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
  uint64_t pos = 0;
  for (; pos < len; pos++) {
    uint32_t word = data[pos];
    if (word > 0x10FFFF || (word >= 0xD800 && word <= 0xDFFF)) {
      return false;
    }
  }
  return true;
}

inline simdutf_warn_unused result validate_with_errors(const char32_t *buf,
                                                       size_t len) noexcept {
  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
  size_t pos = 0;
  for (; pos < len; pos++) {
    uint32_t word = data[pos];
    if (word > 0x10FFFF) {
      return result(error_code::TOO_LARGE, pos);
    }
    if (word >= 0xD800 && word <= 0xDFFF) {
      return result(error_code::SURROGATE, pos);
    }
  }
  return result(error_code::SUCCESS, pos);
}

inline size_t utf8_length_from_utf32(const char32_t *buf, size_t len) {
|
|
// We are not BOM aware.
|
|
const uint32_t *p = reinterpret_cast<const uint32_t *>(buf);
|
|
size_t counter{0};
|
|
for (size_t i = 0; i < len; i++) {
|
|
// credit: @ttsugriy for the vectorizable approach
|
|
counter++; // ASCII
|
|
counter += static_cast<size_t>(p[i] > 0x7F); // two-byte
|
|
counter += static_cast<size_t>(p[i] > 0x7FF); // three-byte
|
|
counter += static_cast<size_t>(p[i] > 0xFFFF); // four-bytes
|
|
}
|
|
return counter;
|
|
}
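
// Worked example (illustrative): the running total adds one extra byte per
// threshold crossed, so U+0041 counts 1 byte, U+00E9 counts 2 (it exceeds
// 0x7F), U+20AC counts 3 (it exceeds 0x7FF) and U+1F600 counts 4 (it exceeds
// 0xFFFF); the string {U+0041, U+00E9, U+1F600} therefore requires
// 1 + 2 + 4 = 7 UTF-8 bytes.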

inline size_t utf16_length_from_utf32(const char32_t *buf, size_t len) {
  // We are not BOM aware.
  const uint32_t *p = reinterpret_cast<const uint32_t *>(buf);
  size_t counter{0};
  for (size_t i = 0; i < len; i++) {
    counter++;                                     // non-surrogate word
    counter += static_cast<size_t>(p[i] > 0xFFFF); // surrogate pair
  }
  return counter;
}

} // namespace utf32
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif
/* end file src/scalar/utf32.h */
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
#if SIMDUTF_FEATURE_LATIN1
/* begin file src/scalar/latin1.h */
#ifndef SIMDUTF_LATIN1_H
#define SIMDUTF_LATIN1_H

namespace simdutf {
namespace scalar {
namespace {
namespace latin1 {

simdutf_really_inline size_t utf8_length_from_latin1(const char *buf,
                                                     size_t len) {
  const uint8_t *c = reinterpret_cast<const uint8_t *>(buf);
  size_t answer = 0;
  for (size_t i = 0; i < len; i++) {
    if ((c[i] >> 7)) {
      answer++;
    }
  }
  return answer + len;
}
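
// Worked example (illustrative): every Latin-1 byte below 0x80 maps to a
// single UTF-8 byte, while bytes in [0x80, 0xFF] need two. For the two-byte
// Latin-1 input {0x61, 0xE9} ("a" and "é"), one byte has its high bit set,
// so the function returns 1 + 2 = 3 (UTF-8: 0x61 0xC3 0xA9).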

} // namespace latin1
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif
/* end file src/scalar/latin1.h */
#endif // SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_BASE64
/* begin file src/scalar/base64.h */
#ifndef SIMDUTF_BASE64_H
#define SIMDUTF_BASE64_H

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>

namespace simdutf {
namespace scalar {
namespace {
namespace base64 {

// This function is not expected to be fast. Do not use in long loops.
template <class char_type> bool is_ascii_white_space(char_type c) {
  return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f';
}

template <class char_type> bool is_ascii_white_space_or_padding(char_type c) {
  return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' ||
         c == '=';
}

template <class char_type> bool is_eight_byte(char_type c) {
  if (sizeof(char_type) == 1) {
    return true;
  }
  return uint8_t(c) == c;
}

// Returns a full_result. The destination buffer must be large enough.
// This function assumes that the padding (=) has been removed.
template <class char_type>
full_result
base64_tail_decode(char *dst, const char_type *src, size_t length,
                   size_t padded_characters, // number of padding characters
                                             // '=', typically 0, 1, 2.
                   base64_options options,
                   last_chunk_handling_options last_chunk_options) {
  // This looks like 5 branches, but we expect the compiler to resolve this to
  // a single branch:
  const uint8_t *to_base64 = (options & base64_url)
                                 ? tables::base64::to_base64_url_value
                                 : tables::base64::to_base64_value;
  const uint32_t *d0 = (options & base64_url)
                           ? tables::base64::base64_url::d0
                           : tables::base64::base64_default::d0;
  const uint32_t *d1 = (options & base64_url)
                           ? tables::base64::base64_url::d1
                           : tables::base64::base64_default::d1;
  const uint32_t *d2 = (options & base64_url)
                           ? tables::base64::base64_url::d2
                           : tables::base64::base64_default::d2;
  const uint32_t *d3 = (options & base64_url)
                           ? tables::base64::base64_url::d3
                           : tables::base64::base64_default::d3;

  const char_type *srcend = src + length;
  const char_type *srcinit = src;
  const char *dstinit = dst;
  const bool ignore_garbage =
      (options == base64_options::base64_url_accept_garbage) ||
      (options == base64_options::base64_default_accept_garbage);

  uint32_t x;
  size_t idx;
  uint8_t buffer[4];
  while (true) {
    while (src + 4 <= srcend && is_eight_byte(src[0]) &&
           is_eight_byte(src[1]) && is_eight_byte(src[2]) &&
           is_eight_byte(src[3]) &&
           (x = d0[uint8_t(src[0])] | d1[uint8_t(src[1])] |
                d2[uint8_t(src[2])] | d3[uint8_t(src[3])]) < 0x01FFFFFF) {
      if (match_system(endianness::BIG)) {
        x = scalar::u32_swap_bytes(x);
      }
      std::memcpy(dst, &x, 3); // optimization opportunity: copy 4 bytes
      dst += 3;
      src += 4;
    }
    idx = 0;
    // we need at least four characters.
#ifdef __clang__
    // If possible, we read four characters at a time. (It is an optimization.)
    if (ignore_garbage && src + 4 <= srcend) {
      char_type c0 = src[0];
      char_type c1 = src[1];
      char_type c2 = src[2];
      char_type c3 = src[3];
      uint8_t code0 = to_base64[uint8_t(c0)];
      uint8_t code1 = to_base64[uint8_t(c1)];
      uint8_t code2 = to_base64[uint8_t(c2)];
      uint8_t code3 = to_base64[uint8_t(c3)];
      buffer[idx] = code0;
      idx += (is_eight_byte(c0) && code0 <= 63);
      buffer[idx] = code1;
      idx += (is_eight_byte(c1) && code1 <= 63);
      buffer[idx] = code2;
      idx += (is_eight_byte(c2) && code2 <= 63);
      buffer[idx] = code3;
      idx += (is_eight_byte(c3) && code3 <= 63);
      src += 4;
    }
#endif
    while ((idx < 4) && (src < srcend)) {
      char_type c = *src;
      uint8_t code = to_base64[uint8_t(c)];
      buffer[idx] = uint8_t(code);
      if (is_eight_byte(c) && code <= 63) {
        idx++;
      } else if (!ignore_garbage &&
                 (code > 64 || !scalar::base64::is_eight_byte(c))) {
        return {INVALID_BASE64_CHARACTER, size_t(src - srcinit),
                size_t(dst - dstinit)};
      } else {
        // We have a space or a newline or garbage. We ignore it.
      }
      src++;
    }
    if (idx != 4) {
      if (!ignore_garbage &&
          last_chunk_options == last_chunk_handling_options::strict &&
          (idx != 1) && ((idx + padded_characters) & 3) != 0) {
        // The partial chunk was at src - idx
        return {BASE64_INPUT_REMAINDER, size_t(src - srcinit),
                size_t(dst - dstinit)};
      } else if (!ignore_garbage &&
                 last_chunk_options ==
                     last_chunk_handling_options::stop_before_partial &&
                 (idx != 1) && ((idx + padded_characters) & 3) != 0) {
        // Rewind src to before partial chunk
        src -= idx;
        return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)};
      } else {
        if (idx == 2) {
          uint32_t triple =
              (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6);
          if (!ignore_garbage &&
              (last_chunk_options == last_chunk_handling_options::strict) &&
              (triple & 0xffff)) {
            return {BASE64_EXTRA_BITS, size_t(src - srcinit),
                    size_t(dst - dstinit)};
          }
          if (match_system(endianness::BIG)) {
            triple <<= 8;
            std::memcpy(dst, &triple, 1);
          } else {
            triple = scalar::u32_swap_bytes(triple);
            triple >>= 8;
            std::memcpy(dst, &triple, 1);
          }
          dst += 1;
        } else if (idx == 3) {
          uint32_t triple = (uint32_t(buffer[0]) << 3 * 6) +
                            (uint32_t(buffer[1]) << 2 * 6) +
                            (uint32_t(buffer[2]) << 1 * 6);
          if (!ignore_garbage &&
              (last_chunk_options == last_chunk_handling_options::strict) &&
              (triple & 0xff)) {
            return {BASE64_EXTRA_BITS, size_t(src - srcinit),
                    size_t(dst - dstinit)};
          }
          if (match_system(endianness::BIG)) {
            triple <<= 8;
            std::memcpy(dst, &triple, 2);
          } else {
            triple = scalar::u32_swap_bytes(triple);
            triple >>= 8;
            std::memcpy(dst, &triple, 2);
          }
          dst += 2;
        } else if (!ignore_garbage && idx == 1) {
          return {BASE64_INPUT_REMAINDER, size_t(src - srcinit),
                  size_t(dst - dstinit)};
        }
        return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)};
      }
    }

    uint32_t triple =
        (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6) +
        (uint32_t(buffer[2]) << 1 * 6) + (uint32_t(buffer[3]) << 0 * 6);
    if (match_system(endianness::BIG)) {
      triple <<= 8;
      std::memcpy(dst, &triple, 3);
    } else {
      triple = scalar::u32_swap_bytes(triple);
      triple >>= 8;
      std::memcpy(dst, &triple, 3);
    }
    dst += 3;
  }
}
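
// Worked example (illustrative): decoding the chunk "TWFu". The table
// lookups give the 6-bit codes T=19, W=22, F=5, u=46, so
//   triple = (19 << 18) | (22 << 12) | (5 << 6) | 46 = 0x4D616E,
// whose three bytes are 0x4D 0x61 0x6E, i.e. "Man". The fused lookup
// x = d0[c0] | d1[c1] | d2[c2] | d3[c3] in the fast loop yields the same
// three bytes already laid out for the memcpy (after the conditional byte
// swap); any value of at least 0x01FFFFFF signals that one of the four
// characters was not a valid base64 character.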

// like base64_tail_decode, but it will not write past the end of the output
// buffer. The outlen parameter is modified to reflect the number of bytes
// written. This function assumes that the padding (=) has been removed.
template <class char_type>
result base64_tail_decode_safe(
    char *dst, size_t &outlen, const char_type *&srcr, size_t length,
    size_t padded_characters, // number of padding characters '=', typically 0,
                              // 1, 2.
    base64_options options, last_chunk_handling_options last_chunk_options) {
  const char_type *src = srcr;
  if (length == 0) {
    outlen = 0;
    return {SUCCESS, 0};
  }
  // This looks like 5 branches, but we expect the compiler to resolve this to
  // a single branch:
  const uint8_t *to_base64 = (options & base64_url)
                                 ? tables::base64::to_base64_url_value
                                 : tables::base64::to_base64_value;
  const uint32_t *d0 = (options & base64_url)
                           ? tables::base64::base64_url::d0
                           : tables::base64::base64_default::d0;
  const uint32_t *d1 = (options & base64_url)
                           ? tables::base64::base64_url::d1
                           : tables::base64::base64_default::d1;
  const uint32_t *d2 = (options & base64_url)
                           ? tables::base64::base64_url::d2
                           : tables::base64::base64_default::d2;
  const uint32_t *d3 = (options & base64_url)
                           ? tables::base64::base64_url::d3
                           : tables::base64::base64_default::d3;
  const bool ignore_garbage =
      (options == base64_options::base64_url_accept_garbage) ||
      (options == base64_options::base64_default_accept_garbage);

  const char_type *srcend = src + length;
  const char_type *srcinit = src;
  const char *dstinit = dst;
  const char *dstend = dst + outlen;

  uint32_t x;
  size_t idx;
  uint8_t buffer[4];
  while (true) {
    while (src + 4 <= srcend && is_eight_byte(src[0]) &&
           is_eight_byte(src[1]) && is_eight_byte(src[2]) &&
           is_eight_byte(src[3]) &&
           (x = d0[uint8_t(src[0])] | d1[uint8_t(src[1])] |
                d2[uint8_t(src[2])] | d3[uint8_t(src[3])]) < 0x01FFFFFF) {
      if (dstend - dst < 3) {
        outlen = size_t(dst - dstinit);
        srcr = src;
        return {OUTPUT_BUFFER_TOO_SMALL, size_t(src - srcinit)};
      }
      if (match_system(endianness::BIG)) {
        x = scalar::u32_swap_bytes(x);
      }
      std::memcpy(dst, &x, 3); // optimization opportunity: copy 4 bytes
      dst += 3;
      src += 4;
    }
    idx = 0;
    const char_type *srccur = src;
    // We need at least four characters.
#ifdef __clang__
    // If possible, we read four characters at a time. (It is an optimization.)
    if (ignore_garbage && src + 4 <= srcend) {
      char_type c0 = src[0];
      char_type c1 = src[1];
      char_type c2 = src[2];
      char_type c3 = src[3];
      uint8_t code0 = to_base64[uint8_t(c0)];
      uint8_t code1 = to_base64[uint8_t(c1)];
      uint8_t code2 = to_base64[uint8_t(c2)];
      uint8_t code3 = to_base64[uint8_t(c3)];
      buffer[idx] = code0;
      idx += (is_eight_byte(c0) && code0 <= 63);
      buffer[idx] = code1;
      idx += (is_eight_byte(c1) && code1 <= 63);
      buffer[idx] = code2;
      idx += (is_eight_byte(c2) && code2 <= 63);
      buffer[idx] = code3;
      idx += (is_eight_byte(c3) && code3 <= 63);
      src += 4;
    }
#endif
    while (idx < 4 && src < srcend) {
      char_type c = *src;
      uint8_t code = to_base64[uint8_t(c)];

      buffer[idx] = uint8_t(code);
      if (is_eight_byte(c) && code <= 63) {
        idx++;
      } else if (!ignore_garbage &&
                 (code > 64 || !scalar::base64::is_eight_byte(c))) {
        outlen = size_t(dst - dstinit);
        srcr = src;
        return {INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
      } else {
        // We have a space or a newline or garbage. We ignore it.
      }
      src++;
    }
    if (idx != 4) {
      if (!ignore_garbage &&
          last_chunk_options == last_chunk_handling_options::strict &&
          ((idx + padded_characters) & 3) != 0) {
        outlen = size_t(dst - dstinit);
        srcr = src;
        return {BASE64_INPUT_REMAINDER, size_t(src - srcinit)};
      } else if (!ignore_garbage &&
                 last_chunk_options ==
                     last_chunk_handling_options::stop_before_partial &&
                 ((idx + padded_characters) & 3) != 0) {
        // Rewind src to before partial chunk
        srcr = srccur;
        outlen = size_t(dst - dstinit);
        return {SUCCESS, size_t(dst - dstinit)};
      } else { // loose mode
        if (idx == 0) {
          // No data left; return success
          outlen = size_t(dst - dstinit);
          srcr = src;
          return {SUCCESS, size_t(dst - dstinit)};
        } else if (!ignore_garbage && idx == 1) {
          // Error: Incomplete chunk of length 1 is invalid in loose mode
          outlen = size_t(dst - dstinit);
          srcr = src;
          return {BASE64_INPUT_REMAINDER, size_t(src - srcinit)};
        } else if (idx == 2 || idx == 3) {
          // Check if there's enough space in the destination buffer
          size_t required_space = (idx == 2) ? 1 : 2;
          if (size_t(dstend - dst) < required_space) {
            outlen = size_t(dst - dstinit);
            srcr = src;
            return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit)};
          }
          uint32_t triple = 0;
          if (idx == 2) {
            triple = (uint32_t(buffer[0]) << 18) + (uint32_t(buffer[1]) << 12);
            if (!ignore_garbage &&
                (last_chunk_options == last_chunk_handling_options::strict) &&
                (triple & 0xffff)) {
              srcr = src;
              return {BASE64_EXTRA_BITS, size_t(src - srcinit)};
            }
            // Extract the first byte
            triple >>= 16;
            dst[0] = static_cast<char>(triple & 0xFF);
            dst += 1;
          } else if (idx == 3) {
            triple = (uint32_t(buffer[0]) << 18) + (uint32_t(buffer[1]) << 12) +
                     (uint32_t(buffer[2]) << 6);
            if (!ignore_garbage &&
                (last_chunk_options == last_chunk_handling_options::strict) &&
                (triple & 0xff)) {
              srcr = src;
              return {BASE64_EXTRA_BITS, size_t(src - srcinit)};
            }
            // Extract the first two bytes
            triple >>= 8;
            dst[0] = static_cast<char>((triple >> 8) & 0xFF);
            dst[1] = static_cast<char>(triple & 0xFF);
            dst += 2;
          }
          outlen = size_t(dst - dstinit);
          srcr = src;
          return {SUCCESS, size_t(dst - dstinit)};
        }
      }
    }

    if (dstend - dst < 3) {
      outlen = size_t(dst - dstinit);
      srcr = src;
      return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit)};
    }
    uint32_t triple = (uint32_t(buffer[0]) << 18) +
                      (uint32_t(buffer[1]) << 12) + (uint32_t(buffer[2]) << 6) +
                      (uint32_t(buffer[3]));
    if (match_system(endianness::BIG)) {
      triple <<= 8;
      std::memcpy(dst, &triple, 3);
    } else {
      triple = scalar::u32_swap_bytes(triple);
      triple >>= 8;
      std::memcpy(dst, &triple, 3);
    }
    dst += 3;
  }
}

// Returns the number of bytes written. The destination buffer must be large
// enough. It will add padding (=) if needed.
size_t tail_encode_base64(char *dst, const char *src, size_t srclen,
                          base64_options options) {
  // By default, we use padding if we are not using the URL variant.
  // This is checked with ((options & base64_url) == 0) which returns true if
  // we are not using the URL variant. However, we also allow 'inversion' of
  // the convention with the base64_reverse_padding option. If the
  // base64_reverse_padding option is set, we use padding if we are using the
  // URL variant, and we omit it if we are not using the URL variant. This is
  // checked with
  // ((options & base64_reverse_padding) == base64_reverse_padding).
  bool use_padding =
      ((options & base64_url) == 0) ^
      ((options & base64_reverse_padding) == base64_reverse_padding);
  // This looks like 3 branches, but we expect the compiler to resolve this to
  // a single branch:
  const char *e0 = (options & base64_url) ? tables::base64::base64_url::e0
                                          : tables::base64::base64_default::e0;
  const char *e1 = (options & base64_url) ? tables::base64::base64_url::e1
                                          : tables::base64::base64_default::e1;
  const char *e2 = (options & base64_url) ? tables::base64::base64_url::e2
                                          : tables::base64::base64_default::e2;
  char *out = dst;
  size_t i = 0;
  uint8_t t1, t2, t3;
  for (; i + 2 < srclen; i += 3) {
    t1 = uint8_t(src[i]);
    t2 = uint8_t(src[i + 1]);
    t3 = uint8_t(src[i + 2]);
    *out++ = e0[t1];
    *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
    *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)];
    *out++ = e2[t3];
  }
  switch (srclen - i) {
  case 0:
    break;
  case 1:
    t1 = uint8_t(src[i]);
    *out++ = e0[t1];
    *out++ = e1[(t1 & 0x03) << 4];
    if (use_padding) {
      *out++ = '=';
      *out++ = '=';
    }
    break;
  default: /* case 2 */
    t1 = uint8_t(src[i]);
    t2 = uint8_t(src[i + 1]);
    *out++ = e0[t1];
    *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
    *out++ = e2[(t2 & 0x0F) << 2];
    if (use_padding) {
      *out++ = '=';
    }
  }
  return (size_t)(out - dst);
}
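
// Worked example (illustrative): encoding the three bytes 0x4D 0x61 0x6E
// ("Man"). Here e0[b] == alphabet[b >> 2] and e2[b] == alphabet[b & 0x3F],
// so the four lookups select the 6-bit groups 19, 22, 5 and 46 of the 24-bit
// value 0x4D616E, producing "TWFu". With a one-byte tail such as 0x4D, the
// case 1 branch emits "TQ" followed by "==" when padding is in use.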

template <class char_type>
simdutf_warn_unused size_t maximal_binary_length_from_base64(
    const char_type *input, size_t length) noexcept {
  // We follow https://infra.spec.whatwg.org/#forgiving-base64-decode
  size_t padding = 0;
  if (length > 0) {
    if (input[length - 1] == '=') {
      padding++;
      if (length > 1 && input[length - 2] == '=') {
        padding++;
      }
    }
  }
  size_t actual_length = length - padding;
  if (actual_length % 4 <= 1) {
    return actual_length / 4 * 3;
  }
  // if we have a valid input, then the remainder must be 2 or 3, adding one
  // or two extra bytes.
  return actual_length / 4 * 3 + (actual_length % 4) - 1;
}
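
// Worked example (illustrative): "TWFu" has length 4 and no padding, giving
// 4 / 4 * 3 = 3 bytes, while "TWE=" has one padding character, so
// actual_length = 3 and the result is 0 * 3 + (3 - 1) = 2 bytes ("Ma").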

simdutf_warn_unused size_t
base64_length_from_binary(size_t length, base64_options options) noexcept {
  // By default, we use padding if we are not using the URL variant.
  // This is checked with ((options & base64_url) == 0) which returns true if
  // we are not using the URL variant. However, we also allow 'inversion' of
  // the convention with the base64_reverse_padding option. If the
  // base64_reverse_padding option is set, we use padding if we are using the
  // URL variant, and we omit it if we are not using the URL variant. This is
  // checked with
  // ((options & base64_reverse_padding) == base64_reverse_padding).
  bool use_padding =
      ((options & base64_url) == 0) ^
      ((options & base64_reverse_padding) == base64_reverse_padding);
  if (!use_padding) {
    return length / 3 * 4 + ((length % 3) ? (length % 3) + 1 : 0);
  }
  return (length + 2) / 3 *
         4; // We use padding to make the length a multiple of 4.
}
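
// Worked example (illustrative): 5 input bytes yield (5 + 2) / 3 * 4 = 8
// characters with padding, and 5 / 3 * 4 + (5 % 3) + 1 = 4 + 3 = 7
// characters without padding.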

} // namespace base64
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif
/* end file src/scalar/base64.h */
#endif // SIMDUTF_FEATURE_BASE64

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
/* begin file src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */
#ifndef SIMDUTF_VALID_UTF32_TO_UTF8_H
#define SIMDUTF_VALID_UTF32_TO_UTF8_H

namespace simdutf {
namespace scalar {
namespace {
namespace utf32_to_utf8 {

#if SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_PPC64
// only used by the fallback and POWER kernel
inline size_t convert_valid(const char32_t *buf, size_t len,
                            char *utf8_output) {
  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
  size_t pos = 0;
  char *start{utf8_output};
  while (pos < len) {
    // try to convert the next block of 2 ASCII characters
    if (pos + 2 <=
        len) { // if it is safe to read 8 more bytes, check that they are ascii
      uint64_t v;
      ::memcpy(&v, data + pos, sizeof(uint64_t));
      if ((v & 0xFFFFFF80FFFFFF80) == 0) {
        *utf8_output++ = char(buf[pos]);
        *utf8_output++ = char(buf[pos + 1]);
        pos += 2;
        continue;
      }
    }
    uint32_t word = data[pos];
    if ((word & 0xFFFFFF80) == 0) {
      // will generate one UTF-8 byte
      *utf8_output++ = char(word);
      pos++;
    } else if ((word & 0xFFFFF800) == 0) {
      // will generate two UTF-8 bytes
      // we have 0b110XXXXX 0b10XXXXXX
      *utf8_output++ = char((word >> 6) | 0b11000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
      pos++;
    } else if ((word & 0xFFFF0000) == 0) {
      // will generate three UTF-8 bytes
      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
      *utf8_output++ = char((word >> 12) | 0b11100000);
      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
      pos++;
    } else {
      // will generate four UTF-8 bytes
      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
      *utf8_output++ = char((word >> 18) | 0b11110000);
      *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
      pos++;
    }
  }
  return utf8_output - start;
}
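
// Worked example of the four-byte branch (illustrative): for U+1F600
// (word == 0x1F600), the emitted bytes are (0x1F600 >> 18) | 0xF0 == 0xF0,
// ((0x1F600 >> 12) & 0x3F) | 0x80 == 0x9F, ((0x1F600 >> 6) & 0x3F) | 0x80
// == 0x98 and (0x1F600 & 0x3F) | 0x80 == 0x80, i.e. the UTF-8 sequence
// F0 9F 98 80.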
#endif // SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_PPC64

} // namespace utf32_to_utf8
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif
/* end file src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */
/* begin file src/scalar/utf32_to_utf8/utf32_to_utf8.h */
#ifndef SIMDUTF_UTF32_TO_UTF8_H
#define SIMDUTF_UTF32_TO_UTF8_H

namespace simdutf {
namespace scalar {
namespace {
namespace utf32_to_utf8 {

inline size_t convert(const char32_t *buf, size_t len, char *utf8_output) {
  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
  size_t pos = 0;
  char *start{utf8_output};
  while (pos < len) {
    // try to convert the next block of 2 ASCII characters
    if (pos + 2 <=
        len) { // if it is safe to read 8 more bytes, check that they are ascii
      uint64_t v;
      ::memcpy(&v, data + pos, sizeof(uint64_t));
      if ((v & 0xFFFFFF80FFFFFF80) == 0) {
        *utf8_output++ = char(buf[pos]);
        *utf8_output++ = char(buf[pos + 1]);
        pos += 2;
        continue;
      }
    }
    uint32_t word = data[pos];
    if ((word & 0xFFFFFF80) == 0) {
      // will generate one UTF-8 byte
      *utf8_output++ = char(word);
      pos++;
    } else if ((word & 0xFFFFF800) == 0) {
      // will generate two UTF-8 bytes
      // we have 0b110XXXXX 0b10XXXXXX
      *utf8_output++ = char((word >> 6) | 0b11000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
      pos++;
    } else if ((word & 0xFFFF0000) == 0) {
      // will generate three UTF-8 bytes
      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
      if (word >= 0xD800 && word <= 0xDFFF) {
        return 0;
      }
      *utf8_output++ = char((word >> 12) | 0b11100000);
      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
      pos++;
    } else {
      // will generate four UTF-8 bytes
      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
      if (word > 0x10FFFF) {
        return 0;
      }
      *utf8_output++ = char((word >> 18) | 0b11110000);
      *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
      pos++;
    }
  }
  return utf8_output - start;
}

inline result convert_with_errors(const char32_t *buf, size_t len,
                                  char *utf8_output) {
  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
  size_t pos = 0;
  char *start{utf8_output};
  while (pos < len) {
    // try to convert the next block of 2 ASCII characters
    if (pos + 2 <=
        len) { // if it is safe to read 8 more bytes, check that they are ascii
      uint64_t v;
      ::memcpy(&v, data + pos, sizeof(uint64_t));
      if ((v & 0xFFFFFF80FFFFFF80) == 0) {
        *utf8_output++ = char(buf[pos]);
        *utf8_output++ = char(buf[pos + 1]);
        pos += 2;
        continue;
      }
    }
    uint32_t word = data[pos];
    if ((word & 0xFFFFFF80) == 0) {
      // will generate one UTF-8 byte
      *utf8_output++ = char(word);
      pos++;
    } else if ((word & 0xFFFFF800) == 0) {
      // will generate two UTF-8 bytes
      // we have 0b110XXXXX 0b10XXXXXX
      *utf8_output++ = char((word >> 6) | 0b11000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
      pos++;
    } else if ((word & 0xFFFF0000) == 0) {
      // will generate three UTF-8 bytes
      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
      if (word >= 0xD800 && word <= 0xDFFF) {
        return result(error_code::SURROGATE, pos);
      }
      *utf8_output++ = char((word >> 12) | 0b11100000);
      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
      pos++;
    } else {
      // will generate four UTF-8 bytes
      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
      if (word > 0x10FFFF) {
        return result(error_code::TOO_LARGE, pos);
      }
      *utf8_output++ = char((word >> 18) | 0b11110000);
      *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
      pos++;
    }
  }
  return result(error_code::SUCCESS, utf8_output - start);
}

} // namespace utf32_to_utf8
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif
/* end file src/scalar/utf32_to_utf8/utf32_to_utf8.h */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
/* begin file src/scalar/utf32_to_utf16/valid_utf32_to_utf16.h */
#ifndef SIMDUTF_VALID_UTF32_TO_UTF16_H
#define SIMDUTF_VALID_UTF32_TO_UTF16_H

namespace simdutf {
namespace scalar {
namespace {
namespace utf32_to_utf16 {

template <endianness big_endian>
inline size_t convert_valid(const char32_t *buf, size_t len,
                            char16_t *utf16_output) {
  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
  size_t pos = 0;
  char16_t *start{utf16_output};
  while (pos < len) {
    uint32_t word = data[pos];
    if ((word & 0xFFFF0000) == 0) {
      // will not generate a surrogate pair
      *utf16_output++ = !match_system(big_endian)
                            ? char16_t(u16_swap_bytes(uint16_t(word)))
                            : char16_t(word);
      pos++;
    } else {
      // will generate a surrogate pair
      word -= 0x10000;
      uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
      uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
      if (!match_system(big_endian)) {
        high_surrogate = u16_swap_bytes(high_surrogate);
        low_surrogate = u16_swap_bytes(low_surrogate);
      }
      *utf16_output++ = char16_t(high_surrogate);
      *utf16_output++ = char16_t(low_surrogate);
      pos++;
    }
  }
  return utf16_output - start;
}
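
// Worked example of the surrogate-pair branch (illustrative): for U+1F600,
// word -= 0x10000 leaves 0xF600, so high_surrogate = 0xD800 + (0xF600 >> 10)
// = 0xD83D and low_surrogate = 0xDC00 + (0xF600 & 0x3FF) = 0xDE00, matching
// the UTF-16 encoding D83D DE00.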

} // namespace utf32_to_utf16
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif
/* end file src/scalar/utf32_to_utf16/valid_utf32_to_utf16.h */
/* begin file src/scalar/utf32_to_utf16/utf32_to_utf16.h */
#ifndef SIMDUTF_UTF32_TO_UTF16_H
#define SIMDUTF_UTF32_TO_UTF16_H

namespace simdutf {
namespace scalar {
namespace {
namespace utf32_to_utf16 {

template <endianness big_endian>
inline size_t convert(const char32_t *buf, size_t len, char16_t *utf16_output) {
  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
  size_t pos = 0;
  char16_t *start{utf16_output};
  while (pos < len) {
    uint32_t word = data[pos];
    if ((word & 0xFFFF0000) == 0) {
      if (word >= 0xD800 && word <= 0xDFFF) {
        return 0;
      }
      // will not generate a surrogate pair
      *utf16_output++ = !match_system(big_endian)
                            ? char16_t(u16_swap_bytes(uint16_t(word)))
                            : char16_t(word);
    } else {
      // will generate a surrogate pair
      if (word > 0x10FFFF) {
        return 0;
      }
      word -= 0x10000;
      uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
      uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
      if (!match_system(big_endian)) {
        high_surrogate = u16_swap_bytes(high_surrogate);
        low_surrogate = u16_swap_bytes(low_surrogate);
      }
      *utf16_output++ = char16_t(high_surrogate);
      *utf16_output++ = char16_t(low_surrogate);
    }
    pos++;
  }
  return utf16_output - start;
}

template <endianness big_endian>
inline result convert_with_errors(const char32_t *buf, size_t len,
                                  char16_t *utf16_output) {
  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
  size_t pos = 0;
  char16_t *start{utf16_output};
  while (pos < len) {
    uint32_t word = data[pos];
    if ((word & 0xFFFF0000) == 0) {
      if (word >= 0xD800 && word <= 0xDFFF) {
        return result(error_code::SURROGATE, pos);
      }
      // will not generate a surrogate pair
      *utf16_output++ = !match_system(big_endian)
                            ? char16_t(u16_swap_bytes(uint16_t(word)))
                            : char16_t(word);
    } else {
      // will generate a surrogate pair
      if (word > 0x10FFFF) {
        return result(error_code::TOO_LARGE, pos);
      }
      word -= 0x10000;
      uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
      uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
      if (!match_system(big_endian)) {
        high_surrogate = u16_swap_bytes(high_surrogate);
        low_surrogate = u16_swap_bytes(low_surrogate);
      }
      *utf16_output++ = char16_t(high_surrogate);
      *utf16_output++ = char16_t(low_surrogate);
    }
    pos++;
  }
  return result(error_code::SUCCESS, utf16_output - start);
}

} // namespace utf32_to_utf16
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif
/* end file src/scalar/utf32_to_utf16/utf32_to_utf16.h */
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
/* begin file src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */
#ifndef SIMDUTF_VALID_UTF16_TO_UTF8_H
#define SIMDUTF_VALID_UTF16_TO_UTF8_H

namespace simdutf {
namespace scalar {
namespace {
namespace utf16_to_utf8 {

template <endianness big_endian>
inline size_t convert_valid(const char16_t *buf, size_t len,
                            char *utf8_output) {
  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
  size_t pos = 0;
  char *start{utf8_output};
  while (pos < len) {
    // try to convert the next block of 4 ASCII characters
    if (pos + 4 <=
        len) { // if it is safe to read 8 more bytes, check that they are ascii
      uint64_t v;
      ::memcpy(&v, data + pos, sizeof(uint64_t));
      if (!match_system(big_endian)) {
        v = (v >> 8) | (v << (64 - 8));
      }
      if ((v & 0xFF80FF80FF80FF80) == 0) {
        size_t final_pos = pos + 4;
        while (pos < final_pos) {
          *utf8_output++ = !match_system(big_endian)
                               ? char(u16_swap_bytes(buf[pos]))
                               : char(buf[pos]);
          pos++;
        }
        continue;
      }
    }

    uint16_t word =
        !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
    if ((word & 0xFF80) == 0) {
      // will generate one UTF-8 byte
      *utf8_output++ = char(word);
      pos++;
    } else if ((word & 0xF800) == 0) {
      // will generate two UTF-8 bytes
      // we have 0b110XXXXX 0b10XXXXXX
      *utf8_output++ = char((word >> 6) | 0b11000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
      pos++;
    } else if ((word & 0xF800) != 0xD800) {
      // will generate three UTF-8 bytes
      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
      *utf8_output++ = char((word >> 12) | 0b11100000);
      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
      pos++;
    } else {
      // must be a surrogate pair
      uint16_t diff = uint16_t(word - 0xD800);
      if (pos + 1 >= len) {
        return 0;
      } // minimal bound checking
      uint16_t next_word = !match_system(big_endian)
                               ? u16_swap_bytes(data[pos + 1])
                               : data[pos + 1];
      uint16_t diff2 = uint16_t(next_word - 0xDC00);
      uint32_t value = (diff << 10) + diff2 + 0x10000;
      // will generate four UTF-8 bytes
      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
      *utf8_output++ = char((value >> 18) | 0b11110000);
      *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
      *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
      *utf8_output++ = char((value & 0b111111) | 0b10000000);
      pos += 2;
    }
  }
  return utf8_output - start;
}
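
// Note on the ASCII fast path above (illustrative): the 64-bit mask
// 0xFF80FF80FF80FF80 is zero exactly when each of the four 16-bit lanes holds
// a value below 0x80. When the input endianness does not match the system,
// every lane has its bytes swapped; rotating the whole word right by 8 bits,
// (v >> 8) | (v << 56), moves each high byte into a position the mask tests
// for zero, so the same check works. For example, the big-endian bytes
// 00 41 00 42 00 43 00 44 ("ABCD") pass the test after rotation.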

} // namespace utf16_to_utf8
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif
/* end file src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */
/* begin file src/scalar/utf16_to_utf8/utf16_to_utf8.h */
#ifndef SIMDUTF_UTF16_TO_UTF8_H
#define SIMDUTF_UTF16_TO_UTF8_H

namespace simdutf {
namespace scalar {
namespace {
namespace utf16_to_utf8 {

template <endianness big_endian>
inline size_t convert(const char16_t *buf, size_t len, char *utf8_output) {
  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
  size_t pos = 0;
  char *start{utf8_output};
  while (pos < len) {
    // try to convert the next block of 8 bytes
    if (pos + 4 <=
        len) { // if it is safe to read 8 more bytes, check that they are ascii
      uint64_t v;
      ::memcpy(&v, data + pos, sizeof(uint64_t));
      if (!match_system(big_endian)) {
        v = (v >> 8) | (v << (64 - 8));
      }
      if ((v & 0xFF80FF80FF80FF80) == 0) {
        size_t final_pos = pos + 4;
        while (pos < final_pos) {
          *utf8_output++ = !match_system(big_endian)
                               ? char(u16_swap_bytes(buf[pos]))
                               : char(buf[pos]);
          pos++;
        }
        continue;
      }
    }
    uint16_t word =
        !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
    if ((word & 0xFF80) == 0) {
      // will generate one UTF-8 byte
      *utf8_output++ = char(word);
      pos++;
    } else if ((word & 0xF800) == 0) {
      // will generate two UTF-8 bytes
      // we have 0b110XXXXX 0b10XXXXXX
      *utf8_output++ = char((word >> 6) | 0b11000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
      pos++;
    } else if ((word & 0xF800) != 0xD800) {
      // will generate three UTF-8 bytes
      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
      *utf8_output++ = char((word >> 12) | 0b11100000);
      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
      pos++;
    } else {
      // must be a surrogate pair
      if (pos + 1 >= len) {
        return 0;
      }
      uint16_t diff = uint16_t(word - 0xD800);
      if (diff > 0x3FF) {
        return 0;
      }
      uint16_t next_word = !match_system(big_endian)
                               ? u16_swap_bytes(data[pos + 1])
                               : data[pos + 1];
      uint16_t diff2 = uint16_t(next_word - 0xDC00);
      if (diff2 > 0x3FF) {
        return 0;
      }
      uint32_t value = (diff << 10) + diff2 + 0x10000;
      // will generate four UTF-8 bytes
      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
      *utf8_output++ = char((value >> 18) | 0b11110000);
      *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
      *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
      *utf8_output++ = char((value & 0b111111) | 0b10000000);
      pos += 2;
    }
  }
  return utf8_output - start;
}

template <endianness big_endian>
inline result convert_with_errors(const char16_t *buf, size_t len,
                                  char *utf8_output) {
  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
  size_t pos = 0;
  char *start{utf8_output};
  while (pos < len) {
    // try to convert the next block of 8 bytes
    if (pos + 4 <=
        len) { // if it is safe to read 8 more bytes, check that they are ascii
      uint64_t v;
      ::memcpy(&v, data + pos, sizeof(uint64_t));
      if (!match_system(big_endian))
        v = (v >> 8) | (v << (64 - 8));
      if ((v & 0xFF80FF80FF80FF80) == 0) {
        size_t final_pos = pos + 4;
        while (pos < final_pos) {
          *utf8_output++ = !match_system(big_endian)
                               ? char(u16_swap_bytes(buf[pos]))
                               : char(buf[pos]);
          pos++;
        }
        continue;
      }
    }
    uint16_t word =
        !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
    if ((word & 0xFF80) == 0) {
      // will generate one UTF-8 byte
      *utf8_output++ = char(word);
      pos++;
    } else if ((word & 0xF800) == 0) {
      // will generate two UTF-8 bytes
      // we have 0b110XXXXX 0b10XXXXXX
      *utf8_output++ = char((word >> 6) | 0b11000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
      pos++;
    } else if ((word & 0xF800) != 0xD800) {
      // will generate three UTF-8 bytes
      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
      *utf8_output++ = char((word >> 12) | 0b11100000);
      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
      pos++;
    } else {
      // must be a surrogate pair
      if (pos + 1 >= len) {
        return result(error_code::SURROGATE, pos);
      }
      uint16_t diff = uint16_t(word - 0xD800);
      if (diff > 0x3FF) {
        return result(error_code::SURROGATE, pos);
      }
      uint16_t next_word = !match_system(big_endian)
                               ? u16_swap_bytes(data[pos + 1])
                               : data[pos + 1];
      uint16_t diff2 = uint16_t(next_word - 0xDC00);
      if (diff2 > 0x3FF) {
        return result(error_code::SURROGATE, pos);
      }
      uint32_t value = (diff << 10) + diff2 + 0x10000;
      // will generate four UTF-8 bytes
      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
      *utf8_output++ = char((value >> 18) | 0b11110000);
      *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
      *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
      *utf8_output++ = char((value & 0b111111) | 0b10000000);
      pos += 2;
    }
  }
  return result(error_code::SUCCESS, utf8_output - start);
}

} // namespace utf16_to_utf8
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif
/* end file src/scalar/utf16_to_utf8/utf16_to_utf8.h */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
/* begin file src/scalar/utf16_to_utf32/valid_utf16_to_utf32.h */
#ifndef SIMDUTF_VALID_UTF16_TO_UTF32_H
#define SIMDUTF_VALID_UTF16_TO_UTF32_H

namespace simdutf {
namespace scalar {
namespace {
namespace utf16_to_utf32 {

template <endianness big_endian>
inline size_t convert_valid(const char16_t *buf, size_t len,
                            char32_t *utf32_output) {
  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
  size_t pos = 0;
  char32_t *start{utf32_output};
  while (pos < len) {
    uint16_t word =
        !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
    if ((word & 0xF800) != 0xD800) {
      // No surrogate pair, extend 16-bit word to 32-bit word
      *utf32_output++ = char32_t(word);
      pos++;
    } else {
      // must be a surrogate pair
      uint16_t diff = uint16_t(word - 0xD800);
      if (pos + 1 >= len) {
        return 0;
      } // minimal bound checking
      uint16_t next_word = !match_system(big_endian)
                               ? u16_swap_bytes(data[pos + 1])
                               : data[pos + 1];
      uint16_t diff2 = uint16_t(next_word - 0xDC00);
      uint32_t value = (diff << 10) + diff2 + 0x10000;
      *utf32_output++ = char32_t(value);
      pos += 2;
    }
  }
  return utf32_output - start;
}
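
// Worked example (illustrative): the pair D83D DE00 decodes as
// diff = 0xD83D - 0xD800 = 0x3D and diff2 = 0xDE00 - 0xDC00 = 0x200, so
// value = (0x3D << 10) + 0x200 + 0x10000 = 0x1F600, i.e. U+1F600.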

} // namespace utf16_to_utf32
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif
/* end file src/scalar/utf16_to_utf32/valid_utf16_to_utf32.h */
/* begin file src/scalar/utf16_to_utf32/utf16_to_utf32.h */
#ifndef SIMDUTF_UTF16_TO_UTF32_H
#define SIMDUTF_UTF16_TO_UTF32_H

namespace simdutf {
namespace scalar {
namespace {
namespace utf16_to_utf32 {

template <endianness big_endian>
inline size_t convert(const char16_t *buf, size_t len, char32_t *utf32_output) {
  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
  size_t pos = 0;
  char32_t *start{utf32_output};
  while (pos < len) {
    uint16_t word =
        !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
    if ((word & 0xF800) != 0xD800) {
      // No surrogate pair, extend 16-bit word to 32-bit word
      *utf32_output++ = char32_t(word);
      pos++;
    } else {
      // must be a surrogate pair
      uint16_t diff = uint16_t(word - 0xD800);
      if (diff > 0x3FF) {
        return 0;
      }
      if (pos + 1 >= len) {
        return 0;
      } // minimal bound checking
      uint16_t next_word = !match_system(big_endian)
                               ? u16_swap_bytes(data[pos + 1])
                               : data[pos + 1];
      uint16_t diff2 = uint16_t(next_word - 0xDC00);
      if (diff2 > 0x3FF) {
        return 0;
      }
      uint32_t value = (diff << 10) + diff2 + 0x10000;
      *utf32_output++ = char32_t(value);
      pos += 2;
    }
  }
  return utf32_output - start;
}

template <endianness big_endian>
inline result convert_with_errors(const char16_t *buf, size_t len,
                                  char32_t *utf32_output) {
  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
  size_t pos = 0;
  char32_t *start{utf32_output};
  while (pos < len) {
    uint16_t word =
        !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
    if ((word & 0xF800) != 0xD800) {
      // No surrogate pair, extend 16-bit word to 32-bit word
      *utf32_output++ = char32_t(word);
      pos++;
    } else {
      // must be a surrogate pair
      uint16_t diff = uint16_t(word - 0xD800);
      if (diff > 0x3FF) {
        return result(error_code::SURROGATE, pos);
      }
      if (pos + 1 >= len) {
        return result(error_code::SURROGATE, pos);
      } // minimal bound checking
      uint16_t next_word = !match_system(big_endian)
                               ? u16_swap_bytes(data[pos + 1])
                               : data[pos + 1];
      uint16_t diff2 = uint16_t(next_word - 0xDC00);
      if (diff2 > 0x3FF) {
        return result(error_code::SURROGATE, pos);
      }
      uint32_t value = (diff << 10) + diff2 + 0x10000;
      *utf32_output++ = char32_t(value);
      pos += 2;
    }
  }
  return result(error_code::SUCCESS, utf32_output - start);
}

} // namespace utf16_to_utf32
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif
/* end file src/scalar/utf16_to_utf32/utf16_to_utf32.h */
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 &&                                                    \
    (SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_LATIN1)
/* begin file src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */
#ifndef SIMDUTF_VALID_UTF8_TO_UTF16_H
#define SIMDUTF_VALID_UTF8_TO_UTF16_H

namespace simdutf {
namespace scalar {
namespace {
namespace utf8_to_utf16 {

template <endianness big_endian>
inline size_t convert_valid(const char *buf, size_t len,
                            char16_t *utf16_output) {
  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
  size_t pos = 0;
  char16_t *start{utf16_output};
  while (pos < len) {
    // try to convert the next block of 8 ASCII bytes
    if (pos + 8 <=
        len) { // if it is safe to read 8 more bytes, check that they are ascii
      uint64_t v;
      ::memcpy(&v, data + pos, sizeof(uint64_t));
      if ((v & 0x8080808080808080) == 0) {
        size_t final_pos = pos + 8;
        while (pos < final_pos) {
          *utf16_output++ = !match_system(big_endian)
                                ? char16_t(u16_swap_bytes(buf[pos]))
                                : char16_t(buf[pos]);
          pos++;
        }
        continue;
      }
    }
    uint8_t leading_byte = data[pos]; // leading byte
    if (leading_byte < 0b10000000) {
      // converting one ASCII byte !!!
      *utf16_output++ = !match_system(big_endian)
                            ? char16_t(u16_swap_bytes(leading_byte))
                            : char16_t(leading_byte);
      pos++;
    } else if ((leading_byte & 0b11100000) == 0b11000000) {
      // We have a two-byte UTF-8, it should become
      // a single UTF-16 word.
      if (pos + 1 >= len) {
        break;
      } // minimal bound checking
      uint16_t code_point = uint16_t(((leading_byte & 0b00011111) << 6) |
                                     (data[pos + 1] & 0b00111111));
      if (!match_system(big_endian)) {
        code_point = u16_swap_bytes(uint16_t(code_point));
      }
      *utf16_output++ = char16_t(code_point);
      pos += 2;
    } else if ((leading_byte & 0b11110000) == 0b11100000) {
      // We have a three-byte UTF-8, it should become
      // a single UTF-16 word.
      if (pos + 2 >= len) {
        break;
      } // minimal bound checking
      uint16_t code_point = uint16_t(((leading_byte & 0b00001111) << 12) |
                                     ((data[pos + 1] & 0b00111111) << 6) |
                                     (data[pos + 2] & 0b00111111));
      if (!match_system(big_endian)) {
        code_point = u16_swap_bytes(uint16_t(code_point));
      }
      *utf16_output++ = char16_t(code_point);
      pos += 3;
    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
      // we have a 4-byte UTF-8 word.
      if (pos + 3 >= len) {
        break;
      } // minimal bound checking
      uint32_t code_point = ((leading_byte & 0b00000111) << 18) |
                            ((data[pos + 1] & 0b00111111) << 12) |
                            ((data[pos + 2] & 0b00111111) << 6) |
                            (data[pos + 3] & 0b00111111);
      code_point -= 0x10000;
      uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
      uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
      if (!match_system(big_endian)) {
        high_surrogate = u16_swap_bytes(high_surrogate);
        low_surrogate = u16_swap_bytes(low_surrogate);
      }
      *utf16_output++ = char16_t(high_surrogate);
      *utf16_output++ = char16_t(low_surrogate);
      pos += 4;
    } else {
      // we may have a continuation but we do not do error checking
      return 0;
    }
  }
  return utf16_output - start;
}
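
// Worked example of the two-byte branch (illustrative): the UTF-8 bytes
// C3 A9 decode as ((0xC3 & 0x1F) << 6) | (0xA9 & 0x3F) =
// (0x03 << 6) | 0x29 = 0xE9, i.e. U+00E9 ("é"), written as a single UTF-16
// code unit.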

} // namespace utf8_to_utf16
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif
/* end file src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */
/* begin file src/scalar/utf8_to_utf16/utf8_to_utf16.h */
#ifndef SIMDUTF_UTF8_TO_UTF16_H
#define SIMDUTF_UTF8_TO_UTF16_H

namespace simdutf {
namespace scalar {
namespace {
namespace utf8_to_utf16 {

template <endianness big_endian>
inline size_t convert(const char *buf, size_t len, char16_t *utf16_output) {
  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
  size_t pos = 0;
  char16_t *start{utf16_output};
  while (pos < len) {
    // try to convert the next block of 16 ASCII bytes
    if (pos + 16 <=
        len) { // if it is safe to read 16 more bytes, check that they are ascii
      uint64_t v1;
      ::memcpy(&v1, data + pos, sizeof(uint64_t));
      uint64_t v2;
      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
      uint64_t v{v1 | v2};
      if ((v & 0x8080808080808080) == 0) {
        size_t final_pos = pos + 16;
        while (pos < final_pos) {
          *utf16_output++ = !match_system(big_endian)
                                ? char16_t(u16_swap_bytes(buf[pos]))
                                : char16_t(buf[pos]);
          pos++;
        }
        continue;
      }
    }

    uint8_t leading_byte = data[pos]; // leading byte
    if (leading_byte < 0b10000000) {
      // converting one ASCII byte !!!
      *utf16_output++ = !match_system(big_endian)
                            ? char16_t(u16_swap_bytes(leading_byte))
                            : char16_t(leading_byte);
      pos++;
    } else if ((leading_byte & 0b11100000) == 0b11000000) {
      // We have a two-byte UTF-8, it should become
      // a single UTF-16 word.
      if (pos + 1 >= len) {
        return 0;
      } // minimal bound checking
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return 0;
      }
      // range check
      uint32_t code_point =
          (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
      if (code_point < 0x80 || 0x7ff < code_point) {
        return 0;
      }
      if (!match_system(big_endian)) {
        code_point = uint32_t(u16_swap_bytes(uint16_t(code_point)));
      }
      *utf16_output++ = char16_t(code_point);
      pos += 2;
    } else if ((leading_byte & 0b11110000) == 0b11100000) {
      // We have a three-byte UTF-8, it should become
      // a single UTF-16 word.
      if (pos + 2 >= len) {
        return 0;
      } // minimal bound checking

      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return 0;
      }
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
        return 0;
      }
      // range check
      uint32_t code_point = (leading_byte & 0b00001111) << 12 |
                            (data[pos + 1] & 0b00111111) << 6 |
                            (data[pos + 2] & 0b00111111);
      if (code_point < 0x800 || 0xffff < code_point ||
          (0xd7ff < code_point && code_point < 0xe000)) {
        return 0;
      }
      if (!match_system(big_endian)) {
        code_point = uint32_t(u16_swap_bytes(uint16_t(code_point)));
      }
      *utf16_output++ = char16_t(code_point);
      pos += 3;
    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
      // we have a 4-byte UTF-8 word.
      if (pos + 3 >= len) {
        return 0;
      } // minimal bound checking
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return 0;
      }
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
        return 0;
      }
      if ((data[pos + 3] & 0b11000000) != 0b10000000) {
        return 0;
      }

      // range check
      uint32_t code_point = (leading_byte & 0b00000111) << 18 |
                            (data[pos + 1] & 0b00111111) << 12 |
                            (data[pos + 2] & 0b00111111) << 6 |
                            (data[pos + 3] & 0b00111111);
      if (code_point <= 0xffff || 0x10ffff < code_point) {
        return 0;
      }
      code_point -= 0x10000;
      uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
      uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
      if (!match_system(big_endian)) {
        high_surrogate = u16_swap_bytes(high_surrogate);
        low_surrogate = u16_swap_bytes(low_surrogate);
      }
      *utf16_output++ = char16_t(high_surrogate);
      *utf16_output++ = char16_t(low_surrogate);
      pos += 4;
    } else {
      return 0;
    }
  }
  return utf16_output - start;
}

template <endianness big_endian>
inline result convert_with_errors(const char *buf, size_t len,
                                  char16_t *utf16_output) {
  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
  size_t pos = 0;
  char16_t *start{utf16_output};
  while (pos < len) {
    // try to convert the next block of 16 ASCII bytes
    if (pos + 16 <=
        len) { // if it is safe to read 16 more bytes, check that they are ascii
      uint64_t v1;
      ::memcpy(&v1, data + pos, sizeof(uint64_t));
      uint64_t v2;
      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
      uint64_t v{v1 | v2};
      if ((v & 0x8080808080808080) == 0) {
        size_t final_pos = pos + 16;
        while (pos < final_pos) {
          *utf16_output++ = !match_system(big_endian)
                                ? char16_t(u16_swap_bytes(buf[pos]))
                                : char16_t(buf[pos]);
          pos++;
        }
        continue;
      }
    }
    uint8_t leading_byte = data[pos]; // leading byte
    if (leading_byte < 0b10000000) {
      // converting one ASCII byte !!!
      *utf16_output++ = !match_system(big_endian)
                            ? char16_t(u16_swap_bytes(leading_byte))
                            : char16_t(leading_byte);
      pos++;
    } else if ((leading_byte & 0b11100000) == 0b11000000) {
      // We have a two-byte UTF-8, it should become
      // a single UTF-16 word.
      if (pos + 1 >= len) {
        return result(error_code::TOO_SHORT, pos);
      } // minimal bound checking
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return result(error_code::TOO_SHORT, pos);
      }
      // range check
      uint32_t code_point =
          (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
      if (code_point < 0x80 || 0x7ff < code_point) {
        return result(error_code::OVERLONG, pos);
      }
      if (!match_system(big_endian)) {
        code_point = uint32_t(u16_swap_bytes(uint16_t(code_point)));
      }
      *utf16_output++ = char16_t(code_point);
      pos += 2;
    } else if ((leading_byte & 0b11110000) == 0b11100000) {
      // We have a three-byte UTF-8, it should become
      // a single UTF-16 word.
      if (pos + 2 >= len) {
        return result(error_code::TOO_SHORT, pos);
      } // minimal bound checking

      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return result(error_code::TOO_SHORT, pos);
      }
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
        return result(error_code::TOO_SHORT, pos);
      }
      // range check
      uint32_t code_point = (leading_byte & 0b00001111) << 12 |
                            (data[pos + 1] & 0b00111111) << 6 |
                            (data[pos + 2] & 0b00111111);
      if ((code_point < 0x800) || (0xffff < code_point)) {
        return result(error_code::OVERLONG, pos);
      }
      if (0xd7ff < code_point && code_point < 0xe000) {
        return result(error_code::SURROGATE, pos);
      }
      if (!match_system(big_endian)) {
        code_point = uint32_t(u16_swap_bytes(uint16_t(code_point)));
      }
      *utf16_output++ = char16_t(code_point);
      pos += 3;
    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
      // we have a 4-byte UTF-8 word.
      if (pos + 3 >= len) {
        return result(error_code::TOO_SHORT, pos);
      } // minimal bound checking
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return result(error_code::TOO_SHORT, pos);
      }
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
        return result(error_code::TOO_SHORT, pos);
      }
      if ((data[pos + 3] & 0b11000000) != 0b10000000) {
        return result(error_code::TOO_SHORT, pos);
      }

      // range check
      uint32_t code_point = (leading_byte & 0b00000111) << 18 |
                            (data[pos + 1] & 0b00111111) << 12 |
                            (data[pos + 2] & 0b00111111) << 6 |
                            (data[pos + 3] & 0b00111111);
      if (code_point <= 0xffff) {
        return result(error_code::OVERLONG, pos);
      }
      if (0x10ffff < code_point) {
        return result(error_code::TOO_LARGE, pos);
      }
      code_point -= 0x10000;
      uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
      uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
      if (!match_system(big_endian)) {
        high_surrogate = u16_swap_bytes(high_surrogate);
        low_surrogate = u16_swap_bytes(low_surrogate);
      }
      *utf16_output++ = char16_t(high_surrogate);
      *utf16_output++ = char16_t(low_surrogate);
      pos += 4;
    } else {
      // we either have too many continuation bytes or an invalid leading byte
      if ((leading_byte & 0b11000000) == 0b10000000) {
        return result(error_code::TOO_LONG, pos);
      } else {
        return result(error_code::HEADER_BITS, pos);
      }
    }
  }
  return result(error_code::SUCCESS, utf16_output - start);
}

/**
 * When rewind_and_convert_with_errors is called, we are pointing at 'buf' and
 * we have up to len input bytes left, and we encountered some error. It is
 * possible that the error is at 'buf' exactly, but it could also be in the
 * previous bytes (up to 3 bytes back).
 *
 * prior_bytes indicates how many bytes prior to 'buf' may belong to the
 * current memory section and can be safely accessed. We use prior_bytes to
 * access safely up to three bytes before 'buf'.
 *
 * The caller is responsible for ensuring that len > 0.
 *
 * If the error is believed to have occurred prior to 'buf', the count value
 * contained in the result will be SIZE_T - 1, SIZE_T - 2, or SIZE_T - 3.
 */
template <endianness endian>
inline result rewind_and_convert_with_errors(size_t prior_bytes,
                                             const char *buf, size_t len,
                                             char16_t *utf16_output) {
  size_t extra_len{0};
  // We potentially need to go back in time and find a leading byte.
  // In theory '3' would be sufficient, but sometimes the error can go back
  // quite far.
  size_t how_far_back = prior_bytes;
  // size_t how_far_back = 3; // 3 bytes in the past + current position
  // if(how_far_back >= prior_bytes) { how_far_back = prior_bytes; }
  bool found_leading_bytes{false};
  // important: it is i <= how_far_back and not 'i < how_far_back'.
  for (size_t i = 0; i <= how_far_back; i++) {
    unsigned char byte = buf[-static_cast<std::ptrdiff_t>(i)];
    found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
    if (found_leading_bytes) {
      if (i > 0 && byte < 128) {
        // If we had to go back and the leading byte is ascii
        // then we can stop right away.
        return result(error_code::TOO_LONG, 0 - i + 1);
      }
      buf -= i;
      extra_len = i;
      break;
    }
  }
  //
  // It is possible for this function to return a negative count in its result.
  // C++ Standard Section 18.1 defines size_t in <cstddef>, which is described
  // in the C Standard as <stddef.h>. C Standard Section 4.1.5 defines size_t
  // as an unsigned integral type of the result of the sizeof operator.
  //
  // An unsigned type will simply wrap round arithmetically (well defined).
  //
  if (!found_leading_bytes) {
    // If how_far_back == 3, we may have four consecutive continuation bytes!!!
    // [....] [continuation] [continuation] [continuation] | [buf is
    // continuation] Or we possibly have a stream that does not start with a
    // leading byte.
    return result(error_code::TOO_LONG, 0 - how_far_back);
  }
  result res = convert_with_errors<endian>(buf, len + extra_len, utf16_output);
  if (res.error) {
    res.count -= extra_len;
  }
  return res;
}
|
|
|
|
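// To illustrate rewind_and_convert_with_errors: if the caller stopped at 'buf'
// while buf[-1] holds the lead byte 0xE2 of a three-byte sequence, the search
// loop finds the lead byte at i == 1, the conversion restarts one byte earlier
// (extra_len == 1), and, on error, the reported count is shifted back by one
// so that it remains relative to the original 'buf'.
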
} // namespace utf8_to_utf16
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif
/* end file src/scalar/utf8_to_utf16/utf8_to_utf16.h */
#endif // SIMDUTF_FEATURE_UTF8 && (SIMDUTF_FEATURE_UTF16 ||
       // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_LATIN1)

#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_UTF32
/* begin file src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h */
#ifndef SIMDUTF_VALID_UTF8_TO_UTF32_H
#define SIMDUTF_VALID_UTF8_TO_UTF32_H

namespace simdutf {
namespace scalar {
namespace {
namespace utf8_to_utf32 {

inline size_t convert_valid(const char *buf, size_t len,
                            char32_t *utf32_output) {
  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
  size_t pos = 0;
  char32_t *start{utf32_output};
  while (pos < len) {
    // try to convert the next block of 8 ASCII bytes
    if (pos + 8 <=
        len) { // if it is safe to read 8 more bytes, check that they are ascii
      uint64_t v;
      ::memcpy(&v, data + pos, sizeof(uint64_t));
      if ((v & 0x8080808080808080) == 0) {
        size_t final_pos = pos + 8;
        while (pos < final_pos) {
          *utf32_output++ = char32_t(buf[pos]);
          pos++;
        }
        continue;
      }
    }
    uint8_t leading_byte = data[pos]; // leading byte
    if (leading_byte < 0b10000000) {
      // converting one ASCII byte !!!
      *utf32_output++ = char32_t(leading_byte);
      pos++;
    } else if ((leading_byte & 0b11100000) == 0b11000000) {
      // We have a two-byte UTF-8
      if (pos + 1 >= len) {
        break;
      } // minimal bound checking
      *utf32_output++ = char32_t(((leading_byte & 0b00011111) << 6) |
                                 (data[pos + 1] & 0b00111111));
      pos += 2;
    } else if ((leading_byte & 0b11110000) == 0b11100000) {
      // We have a three-byte UTF-8
      if (pos + 2 >= len) {
        break;
      } // minimal bound checking
      *utf32_output++ = char32_t(((leading_byte & 0b00001111) << 12) |
                                 ((data[pos + 1] & 0b00111111) << 6) |
                                 (data[pos + 2] & 0b00111111));
      pos += 3;
    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
      // we have a 4-byte UTF-8 word.
      if (pos + 3 >= len) {
        break;
      } // minimal bound checking
      uint32_t code_word = ((leading_byte & 0b00000111) << 18) |
                           ((data[pos + 1] & 0b00111111) << 12) |
                           ((data[pos + 2] & 0b00111111) << 6) |
                           (data[pos + 3] & 0b00111111);
      *utf32_output++ = char32_t(code_word);
      pos += 4;
    } else {
      // we may have a continuation but we do not do error checking
      return 0;
    }
  }
  return utf32_output - start;
}

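// The mask 0x8080808080808080 used above tests the most significant bit of
// each of the eight bytes loaded into 'v'. ASCII bytes are 0x00..0x7F, so if
// none of those bits are set, the whole block is ASCII and each byte can be
// widened directly to a UTF-32 code unit.
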
} // namespace utf8_to_utf32
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif
/* end file src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h */
/* begin file src/scalar/utf8_to_utf32/utf8_to_utf32.h */
#ifndef SIMDUTF_UTF8_TO_UTF32_H
#define SIMDUTF_UTF8_TO_UTF32_H

namespace simdutf {
namespace scalar {
namespace {
namespace utf8_to_utf32 {

inline size_t convert(const char *buf, size_t len, char32_t *utf32_output) {
  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
  size_t pos = 0;
  char32_t *start{utf32_output};
  while (pos < len) {
    // try to convert the next block of 16 ASCII bytes
    if (pos + 16 <=
        len) { // if it is safe to read 16 more bytes, check that they are ascii
      uint64_t v1;
      ::memcpy(&v1, data + pos, sizeof(uint64_t));
      uint64_t v2;
      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
      uint64_t v{v1 | v2};
      if ((v & 0x8080808080808080) == 0) {
        size_t final_pos = pos + 16;
        while (pos < final_pos) {
          *utf32_output++ = char32_t(buf[pos]);
          pos++;
        }
        continue;
      }
    }
    uint8_t leading_byte = data[pos]; // leading byte
    if (leading_byte < 0b10000000) {
      // converting one ASCII byte !!!
      *utf32_output++ = char32_t(leading_byte);
      pos++;
    } else if ((leading_byte & 0b11100000) == 0b11000000) {
      // We have a two-byte UTF-8
      if (pos + 1 >= len) {
        return 0;
      } // minimal bound checking
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return 0;
      }
      // range check
      uint32_t code_point =
          (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
      if (code_point < 0x80 || 0x7ff < code_point) {
        return 0;
      }
      *utf32_output++ = char32_t(code_point);
      pos += 2;
    } else if ((leading_byte & 0b11110000) == 0b11100000) {
      // We have a three-byte UTF-8
      if (pos + 2 >= len) {
        return 0;
      } // minimal bound checking

      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return 0;
      }
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
        return 0;
      }
      // range check
      uint32_t code_point = (leading_byte & 0b00001111) << 12 |
                            (data[pos + 1] & 0b00111111) << 6 |
                            (data[pos + 2] & 0b00111111);
      if (code_point < 0x800 || 0xffff < code_point ||
          (0xd7ff < code_point && code_point < 0xe000)) {
        return 0;
      }
      *utf32_output++ = char32_t(code_point);
      pos += 3;
    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
      // we have a 4-byte UTF-8 word.
      if (pos + 3 >= len) {
        return 0;
      } // minimal bound checking
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return 0;
      }
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
        return 0;
      }
      if ((data[pos + 3] & 0b11000000) != 0b10000000) {
        return 0;
      }

      // range check
      uint32_t code_point = (leading_byte & 0b00000111) << 18 |
                            (data[pos + 1] & 0b00111111) << 12 |
                            (data[pos + 2] & 0b00111111) << 6 |
                            (data[pos + 3] & 0b00111111);
      if (code_point <= 0xffff || 0x10ffff < code_point) {
        return 0;
      }
      *utf32_output++ = char32_t(code_point);
      pos += 4;
    } else {
      return 0;
    }
  }
  return utf32_output - start;
}

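// Note that convert() signals any invalid input by returning 0: the caller
// learns that the input was not valid UTF-8, but not where the error occurred.
// convert_with_errors() below reports both an error code and a position.
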
inline result convert_with_errors(const char *buf, size_t len,
                                  char32_t *utf32_output) {
  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
  size_t pos = 0;
  char32_t *start{utf32_output};
  while (pos < len) {
    // try to convert the next block of 16 ASCII bytes
    if (pos + 16 <=
        len) { // if it is safe to read 16 more bytes, check that they are ascii
      uint64_t v1;
      ::memcpy(&v1, data + pos, sizeof(uint64_t));
      uint64_t v2;
      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
      uint64_t v{v1 | v2};
      if ((v & 0x8080808080808080) == 0) {
        size_t final_pos = pos + 16;
        while (pos < final_pos) {
          *utf32_output++ = char32_t(buf[pos]);
          pos++;
        }
        continue;
      }
    }
    uint8_t leading_byte = data[pos]; // leading byte
    if (leading_byte < 0b10000000) {
      // converting one ASCII byte !!!
      *utf32_output++ = char32_t(leading_byte);
      pos++;
    } else if ((leading_byte & 0b11100000) == 0b11000000) {
      // We have a two-byte UTF-8
      if (pos + 1 >= len) {
        return result(error_code::TOO_SHORT, pos);
      } // minimal bound checking
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return result(error_code::TOO_SHORT, pos);
      }
      // range check
      uint32_t code_point =
          (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
      if (code_point < 0x80 || 0x7ff < code_point) {
        return result(error_code::OVERLONG, pos);
      }
      *utf32_output++ = char32_t(code_point);
      pos += 2;
    } else if ((leading_byte & 0b11110000) == 0b11100000) {
      // We have a three-byte UTF-8
      if (pos + 2 >= len) {
        return result(error_code::TOO_SHORT, pos);
      } // minimal bound checking

      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return result(error_code::TOO_SHORT, pos);
      }
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
        return result(error_code::TOO_SHORT, pos);
      }
      // range check
      uint32_t code_point = (leading_byte & 0b00001111) << 12 |
                            (data[pos + 1] & 0b00111111) << 6 |
                            (data[pos + 2] & 0b00111111);
      if (code_point < 0x800 || 0xffff < code_point) {
        return result(error_code::OVERLONG, pos);
      }
      if (0xd7ff < code_point && code_point < 0xe000) {
        return result(error_code::SURROGATE, pos);
      }
      *utf32_output++ = char32_t(code_point);
      pos += 3;
    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
      // we have a 4-byte UTF-8 word.
      if (pos + 3 >= len) {
        return result(error_code::TOO_SHORT, pos);
      } // minimal bound checking
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return result(error_code::TOO_SHORT, pos);
      }
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
        return result(error_code::TOO_SHORT, pos);
      }
      if ((data[pos + 3] & 0b11000000) != 0b10000000) {
        return result(error_code::TOO_SHORT, pos);
      }

      // range check
      uint32_t code_point = (leading_byte & 0b00000111) << 18 |
                            (data[pos + 1] & 0b00111111) << 12 |
                            (data[pos + 2] & 0b00111111) << 6 |
                            (data[pos + 3] & 0b00111111);
      if (code_point <= 0xffff) {
        return result(error_code::OVERLONG, pos);
      }
      if (0x10ffff < code_point) {
        return result(error_code::TOO_LARGE, pos);
      }
      *utf32_output++ = char32_t(code_point);
      pos += 4;
    } else {
      // we either have too many continuation bytes or an invalid leading byte
      if ((leading_byte & 0b11000000) == 0b10000000) {
        return result(error_code::TOO_LONG, pos);
      } else {
        return result(error_code::HEADER_BITS, pos);
      }
    }
  }
  return result(error_code::SUCCESS, utf32_output - start);
}

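// The range checks above reject overlong encodings: a two-byte sequence must
// encode at least 0x80, a three-byte sequence at least 0x800, and a four-byte
// sequence at least 0x10000. An overlong form such as 0xC0 0xAF (a two-byte
// encoding of '/') therefore yields error_code::OVERLONG.
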
/**
 * When rewind_and_convert_with_errors is called, we are pointing at 'buf' and
 * we have up to len input bytes left, and we encountered some error. It is
 * possible that the error is at 'buf' exactly, but it could also be in the
 * previous bytes (up to 3 bytes back).
 *
 * prior_bytes indicates how many bytes prior to 'buf' may belong to the
 * current memory section and can be safely accessed. We use prior_bytes to
 * access safely up to three bytes before 'buf'.
 *
 * The caller is responsible for ensuring that len > 0.
 *
 * If the error is believed to have occurred prior to 'buf', the count value
 * contained in the result will be SIZE_T - 1, SIZE_T - 2, or SIZE_T - 3.
 */
inline result rewind_and_convert_with_errors(size_t prior_bytes,
                                             const char *buf, size_t len,
                                             char32_t *utf32_output) {
  size_t extra_len{0};
  // We potentially need to go back in time and find a leading byte.
  size_t how_far_back = 3; // 3 bytes in the past + current position
  if (how_far_back > prior_bytes) {
    how_far_back = prior_bytes;
  }
  bool found_leading_bytes{false};
  // important: it is i <= how_far_back and not 'i < how_far_back'.
  for (size_t i = 0; i <= how_far_back; i++) {
    unsigned char byte = buf[-static_cast<std::ptrdiff_t>(i)];
    found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
    if (found_leading_bytes) {
      if (i > 0 && byte < 128) {
        // If we had to go back and the leading byte is ascii
        // then we can stop right away.
        return result(error_code::TOO_LONG, 0 - i + 1);
      }
      buf -= i;
      extra_len = i;
      break;
    }
  }
  //
  // It is possible for this function to return a negative count in its result.
  // C++ Standard Section 18.1 says that size_t is defined in <cstddef>, which
  // is described in the C Standard as <stddef.h>. C Standard Section 4.1.5
  // defines size_t as an unsigned integral type of the result of the sizeof
  // operator.
  //
  // An unsigned type will simply wrap around arithmetically (well defined).
  //
  if (!found_leading_bytes) {
    // If how_far_back == 3, we may have four consecutive continuation bytes!!!
    // [....] [continuation] [continuation] [continuation] | [buf is
    // continuation] Or we possibly have a stream that does not start with a
    // leading byte.
    return result(error_code::TOO_LONG, 0 - how_far_back);
  }

  result res = convert_with_errors(buf, len + extra_len, utf32_output);
  if (res.error) {
    res.count -= extra_len;
  }
  return res;
}

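// Illustrative arithmetic for the negative counts mentioned above: since
// size_t is unsigned, an expression such as 0 - 2 wraps around to
// SIZE_MAX - 1, which the caller can interpret as "two bytes before 'buf'".
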
} // namespace utf8_to_utf32
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif
/* end file src/scalar/utf8_to_utf32/utf8_to_utf32.h */
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
/* begin file src/scalar/latin1_to_utf8/latin1_to_utf8.h */
#ifndef SIMDUTF_LATIN1_TO_UTF8_H
#define SIMDUTF_LATIN1_TO_UTF8_H

namespace simdutf {
namespace scalar {
namespace {
namespace latin1_to_utf8 {

inline size_t convert(const char *buf, size_t len, char *utf8_output) {
  const unsigned char *data = reinterpret_cast<const unsigned char *>(buf);
  size_t pos = 0;
  size_t utf8_pos = 0;
  while (pos < len) {
    // try to convert the next block of 16 ASCII bytes
    if (pos + 16 <=
        len) { // if it is safe to read 16 more bytes, check that they are ascii
      uint64_t v1;
      ::memcpy(&v1, data + pos, sizeof(uint64_t));
      uint64_t v2;
      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
      uint64_t v{v1 |
                 v2}; // We are only interested in these bits: 1000 1000 1000
                      // 1000, so it makes sense to concatenate everything
      if ((v & 0x8080808080808080) ==
          0) { // if NONE of these are set, i.e. all of them are zero, then
               // everything is ASCII
        size_t final_pos = pos + 16;
        while (pos < final_pos) {
          utf8_output[utf8_pos++] = char(buf[pos]);
          pos++;
        }
        continue;
      }
    }

    unsigned char byte = data[pos];
    if ((byte & 0x80) == 0) { // if ASCII
      // will generate one UTF-8 byte
      utf8_output[utf8_pos++] = char(byte);
      pos++;
    } else {
      // will generate two UTF-8 bytes
      utf8_output[utf8_pos++] = char((byte >> 6) | 0b11000000);
      utf8_output[utf8_pos++] = char((byte & 0b111111) | 0b10000000);
      pos++;
    }
  }
  return utf8_pos;
}

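// Latin-1 code points 0x80..0xFF expand to exactly two UTF-8 bytes: the lead
// byte 0b110000xx carries the top two bits and the continuation byte
// 0b10xxxxxx carries the low six bits. For instance, 0xE9 ('é') becomes
// 0xC3 0xA9.
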
inline size_t convert_safe(const char *buf, size_t len, char *utf8_output,
                           size_t utf8_len) {
  const unsigned char *data = reinterpret_cast<const unsigned char *>(buf);
  size_t pos = 0;
  size_t skip_pos = 0;
  size_t utf8_pos = 0;
  while (pos < len && utf8_pos < utf8_len) {
    // try to convert the next block of 16 ASCII bytes
    if (pos >= skip_pos && pos + 16 <= len &&
        utf8_pos + 16 <= utf8_len) { // if it is safe to read 16 more bytes,
                                     // check that they are ascii
      uint64_t v1;
      ::memcpy(&v1, data + pos, sizeof(uint64_t));
      uint64_t v2;
      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
      uint64_t v{v1 |
                 v2}; // We are only interested in these bits: 1000 1000 1000
                      // 1000, so it makes sense to concatenate everything
      if ((v & 0x8080808080808080) ==
          0) { // if NONE of these are set, i.e. all of them are zero, then
               // everything is ASCII
        ::memcpy(utf8_output + utf8_pos, buf + pos, 16);
        utf8_pos += 16;
        pos += 16;
      } else {
        // At least one of the next 16 bytes is not ASCII; we will process
        // them one by one
        skip_pos = pos + 16;
      }
    } else {
      const auto byte = data[pos];
      if ((byte & 0x80) == 0) { // if ASCII
        // will generate one UTF-8 byte
        utf8_output[utf8_pos++] = char(byte);
        pos++;
      } else if (utf8_pos + 2 <= utf8_len) {
        // will generate two UTF-8 bytes
        utf8_output[utf8_pos++] = char((byte >> 6) | 0b11000000);
        utf8_output[utf8_pos++] = char((byte & 0b111111) | 0b10000000);
        pos++;
      } else {
        break;
      }
    }
  }
  return utf8_pos;
}

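// convert_safe differs from convert() above in that it also bounds the writes
// by utf8_len: the 16-byte ASCII fast path is taken only when 16 output bytes
// are available, a two-byte expansion is emitted only when two output bytes
// remain, and otherwise the loop stops early, returning how much was written.
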
} // namespace latin1_to_utf8
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif
/* end file src/scalar/latin1_to_utf8/latin1_to_utf8.h */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
/* begin file src/scalar/latin1_to_utf16/latin1_to_utf16.h */
#ifndef SIMDUTF_LATIN1_TO_UTF16_H
#define SIMDUTF_LATIN1_TO_UTF16_H

namespace simdutf {
namespace scalar {
namespace {
namespace latin1_to_utf16 {

template <endianness big_endian>
inline size_t convert(const char *buf, size_t len, char16_t *utf16_output) {
  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
  size_t pos = 0;
  char16_t *start{utf16_output};

  while (pos < len) {
    uint16_t word =
        uint16_t(data[pos]); // extend Latin-1 char to 16-bit Unicode code point
    *utf16_output++ =
        char16_t(match_system(big_endian) ? word : u16_swap_bytes(word));
    pos++;
  }

  return utf16_output - start;
}

template <endianness big_endian>
inline result convert_with_errors(const char *buf, size_t len,
                                  char16_t *utf16_output) {
  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
  size_t pos = 0;
  char16_t *start{utf16_output};

  while (pos < len) {
    uint16_t word =
        uint16_t(data[pos]); // extend Latin-1 char to 16-bit Unicode code point
    *utf16_output++ =
        char16_t(match_system(big_endian) ? word : u16_swap_bytes(word));
    pos++;
  }

  return result(error_code::SUCCESS, utf16_output - start);
}

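// Latin-1 to UTF-16 is a pure widening: every Latin-1 byte maps to the UTF-16
// code unit with the same value (U+0000..U+00FF). No input can fail, which is
// why convert_with_errors() always reports error_code::SUCCESS.
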
} // namespace latin1_to_utf16
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif
/* end file src/scalar/latin1_to_utf16/latin1_to_utf16.h */
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
/* begin file src/scalar/latin1_to_utf32/latin1_to_utf32.h */
#ifndef SIMDUTF_LATIN1_TO_UTF32_H
#define SIMDUTF_LATIN1_TO_UTF32_H

namespace simdutf {
namespace scalar {
namespace {
namespace latin1_to_utf32 {

inline size_t convert(const char *buf, size_t len, char32_t *utf32_output) {
  const unsigned char *data = reinterpret_cast<const unsigned char *>(buf);
  char32_t *start{utf32_output};
  for (size_t i = 0; i < len; i++) {
    *utf32_output++ = (char32_t)data[i];
  }
  return utf32_output - start;
}

} // namespace latin1_to_utf32
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif
/* end file src/scalar/latin1_to_utf32/latin1_to_utf32.h */
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
/* begin file src/scalar/utf8_to_latin1/utf8_to_latin1.h */
#ifndef SIMDUTF_UTF8_TO_LATIN1_H
#define SIMDUTF_UTF8_TO_LATIN1_H

namespace simdutf {
namespace scalar {
namespace {
namespace utf8_to_latin1 {

inline size_t convert(const char *buf, size_t len, char *latin_output) {
  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
  size_t pos = 0;
  char *start{latin_output};

  while (pos < len) {
    // try to convert the next block of 16 ASCII bytes
    if (pos + 16 <=
        len) { // if it is safe to read 16 more bytes, check that they are ascii
      uint64_t v1;
      ::memcpy(&v1, data + pos, sizeof(uint64_t));
      uint64_t v2;
      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
      uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000
                           // 1000 1000 .... etc
      if ((v & 0x8080808080808080) ==
          0) { // if NONE of these are set, i.e. all of them are zero, then
               // everything is ASCII
        size_t final_pos = pos + 16;
        while (pos < final_pos) {
          *latin_output++ = char(buf[pos]);
          pos++;
        }
        continue;
      }
    }

    // suppose it is not an all-ASCII byte sequence
    uint8_t leading_byte = data[pos]; // leading byte
    if (leading_byte < 0b10000000) {
      // converting one ASCII byte !!!
      *latin_output++ = char(leading_byte);
      pos++;
    } else if ((leading_byte & 0b11100000) ==
               0b11000000) { // the first three bits indicate:
      // We have a two-byte UTF-8
      if (pos + 1 >= len) {
        return 0;
      } // minimal bound checking
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return 0;
      } // checks if the next byte is a valid continuation byte in UTF-8. A
        // valid continuation byte starts with 10.
      // range check -
      uint32_t code_point =
          (leading_byte & 0b00011111) << 6 |
          (data[pos + 1] &
           0b00111111); // assembles the Unicode code point from the two bytes.
                        // It does this by discarding the leading 110 and 10
                        // bits from the two bytes, shifting the remaining bits
                        // of the first byte, and then combining the results
                        // with a bitwise OR operation.
      if (code_point < 0x80 || 0xFF < code_point) {
        return 0; // We only care about the range 128-255, which is non-ASCII
                  // Latin-1 characters. A code_point beneath 0x80 is invalid
                  // as it is already covered by bytes whose leading bit is
                  // zero.
      }
      *latin_output++ = char(code_point);
      pos += 2;
    } else {
      return 0;
    }
  }
  return latin_output - start;
}

inline result convert_with_errors(const char *buf, size_t len,
                                  char *latin_output) {
  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
  size_t pos = 0;
  char *start{latin_output};

  while (pos < len) {
    // try to convert the next block of 16 ASCII bytes
    if (pos + 16 <=
        len) { // if it is safe to read 16 more bytes, check that they are ascii
      uint64_t v1;
      ::memcpy(&v1, data + pos, sizeof(uint64_t));
      uint64_t v2;
      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
      uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000
                           // 1000 1000...etc
      if ((v & 0x8080808080808080) ==
          0) { // if NONE of these are set, i.e. all of them are zero, then
               // everything is ASCII
        size_t final_pos = pos + 16;
        while (pos < final_pos) {
          *latin_output++ = char(buf[pos]);
          pos++;
        }
        continue;
      }
    }
    // suppose it is not an all-ASCII byte sequence
    uint8_t leading_byte = data[pos]; // leading byte
    if (leading_byte < 0b10000000) {
      // converting one ASCII byte !!!
      *latin_output++ = char(leading_byte);
      pos++;
    } else if ((leading_byte & 0b11100000) ==
               0b11000000) { // the first three bits indicate:
      // We have a two-byte UTF-8
      if (pos + 1 >= len) {
        return result(error_code::TOO_SHORT, pos);
      } // minimal bound checking
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return result(error_code::TOO_SHORT, pos);
      } // checks if the next byte is a valid continuation byte in UTF-8. A
        // valid continuation byte starts with 10.
      // range check -
      uint32_t code_point =
          (leading_byte & 0b00011111) << 6 |
          (data[pos + 1] &
           0b00111111); // assembles the Unicode code point from the two bytes.
                        // It does this by discarding the leading 110 and 10
                        // bits from the two bytes, shifting the remaining bits
                        // of the first byte, and then combining the results
                        // with a bitwise OR operation.
      if (code_point < 0x80) {
        return result(error_code::OVERLONG, pos);
      }
      if (0xFF < code_point) {
        return result(error_code::TOO_LARGE, pos);
      } // We only care about the range 128-255, which is non-ASCII Latin-1
        // characters
      *latin_output++ = char(code_point);
      pos += 2;
    } else if ((leading_byte & 0b11110000) == 0b11100000) {
      // We have a three-byte UTF-8
      return result(error_code::TOO_LARGE, pos);
    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
      // we have a 4-byte UTF-8 word.
      return result(error_code::TOO_LARGE, pos);
    } else {
      // we either have too many continuation bytes or an invalid leading byte
      if ((leading_byte & 0b11000000) == 0b10000000) {
        return result(error_code::TOO_LONG, pos);
      }

      return result(error_code::HEADER_BITS, pos);
    }
  }
  return result(error_code::SUCCESS, latin_output - start);
}

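// Only one- and two-byte UTF-8 sequences can encode Latin-1: any code point
// above 0xFF requires at least three bytes in UTF-8, which is why the three-
// and four-byte branches above report error_code::TOO_LARGE outright.
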
inline result rewind_and_convert_with_errors(size_t prior_bytes,
                                             const char *buf, size_t len,
                                             char *latin1_output) {
  size_t extra_len{0};
  // We potentially need to go back in time and find a leading byte.
  // In theory '3' would be sufficient, but sometimes the error can go back
  // quite far.
  size_t how_far_back = prior_bytes;
  // size_t how_far_back = 3; // 3 bytes in the past + current position
  // if(how_far_back >= prior_bytes) { how_far_back = prior_bytes; }
  bool found_leading_bytes{false};
  // important: it is i <= how_far_back and not 'i < how_far_back'.
  for (size_t i = 0; i <= how_far_back; i++) {
    unsigned char byte = buf[-static_cast<std::ptrdiff_t>(i)];
    found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
    if (found_leading_bytes) {
      if (i > 0 && byte < 128) {
        // If we had to go back and the leading byte is ascii
        // then we can stop right away.
        return result(error_code::TOO_LONG, 0 - i + 1);
      }
      buf -= i;
      extra_len = i;
      break;
    }
  }
  //
  // It is possible for this function to return a negative count in its result.
  // C++ Standard Section 18.1 says that size_t is defined in <cstddef>, which
  // is described in the C Standard as <stddef.h>. C Standard Section 4.1.5
  // defines size_t as an unsigned integral type of the result of the sizeof
  // operator.
  //
  // An unsigned type will simply wrap around arithmetically (well defined).
  //
  if (!found_leading_bytes) {
    // If how_far_back == 3, we may have four consecutive continuation bytes!!!
    // [....] [continuation] [continuation] [continuation] | [buf is
    // continuation] Or we possibly have a stream that does not start with a
    // leading byte.
    return result(error_code::TOO_LONG, 0 - how_far_back);
  }
  result res = convert_with_errors(buf, len + extra_len, latin1_output);
  if (res.error) {
    res.count -= extra_len;
  }
  return res;
}

} // namespace utf8_to_latin1
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif
/* end file src/scalar/utf8_to_latin1/utf8_to_latin1.h */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
/* begin file src/scalar/utf16_to_latin1/utf16_to_latin1.h */
#ifndef SIMDUTF_UTF16_TO_LATIN1_H
#define SIMDUTF_UTF16_TO_LATIN1_H

namespace simdutf {
namespace scalar {
namespace {
namespace utf16_to_latin1 {

#include <cstring> // for std::memcpy

template <endianness big_endian>
inline size_t convert(const char16_t *buf, size_t len, char *latin_output) {
  if (len == 0) {
    return 0;
  }
  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
  size_t pos = 0;
  char *current_write = latin_output;
  uint16_t word = 0;
  uint16_t too_large = 0;

  while (pos < len) {
    word = !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
    too_large |= word;
    *current_write++ = char(word & 0xFF);
    pos++;
  }
  if ((too_large & 0xFF00) != 0) {
    return 0;
  }

  return current_write - latin_output;
}

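// Rather than branching on every code unit, convert() above ORs all input
// words into 'too_large' and checks the high byte once at the end: if any
// word had a nonzero high byte, at least one character was outside Latin-1
// and the function returns 0.
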
template <endianness big_endian>
inline result convert_with_errors(const char16_t *buf, size_t len,
                                  char *latin_output) {
  if (len == 0) {
    return result(error_code::SUCCESS, 0);
  }
  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
  size_t pos = 0;
  char *start{latin_output};
  uint16_t word;

  while (pos < len) {
    if (pos + 16 <= len) { // if it is safe to read 32 more bytes, check that
                           // they are Latin1
      uint64_t v1, v2, v3, v4;
      ::memcpy(&v1, data + pos, sizeof(uint64_t));
      ::memcpy(&v2, data + pos + 4, sizeof(uint64_t));
      ::memcpy(&v3, data + pos + 8, sizeof(uint64_t));
      ::memcpy(&v4, data + pos + 12, sizeof(uint64_t));

      if (!match_system(big_endian)) {
        v1 = (v1 >> 8) | (v1 << (64 - 8));
      }
      if (!match_system(big_endian)) {
        v2 = (v2 >> 8) | (v2 << (64 - 8));
      }
      if (!match_system(big_endian)) {
        v3 = (v3 >> 8) | (v3 << (64 - 8));
      }
      if (!match_system(big_endian)) {
        v4 = (v4 >> 8) | (v4 << (64 - 8));
      }

      if (((v1 | v2 | v3 | v4) & 0xFF00FF00FF00FF00) == 0) {
        size_t final_pos = pos + 16;
        while (pos < final_pos) {
          *latin_output++ = !match_system(big_endian)
                                ? char(u16_swap_bytes(data[pos]))
                                : char(data[pos]);
          pos++;
        }
        continue;
      }
    }
    word = !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
    if ((word & 0xFF00) == 0) {
      *latin_output++ = char(word & 0xFF);
      pos++;
    } else {
      return result(error_code::TOO_LARGE, pos);
    }
  }
  return result(error_code::SUCCESS, latin_output - start);
}

} // namespace utf16_to_latin1
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif
/* end file src/scalar/utf16_to_latin1/utf16_to_latin1.h */
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
/* begin file src/scalar/utf32_to_latin1/utf32_to_latin1.h */
#ifndef SIMDUTF_UTF32_TO_LATIN1_H
#define SIMDUTF_UTF32_TO_LATIN1_H

namespace simdutf {
namespace scalar {
namespace {
namespace utf32_to_latin1 {

inline size_t convert(const char32_t *buf, size_t len, char *latin1_output) {
  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
  char *start = latin1_output;
  uint32_t utf32_char;
  size_t pos = 0;
  uint32_t too_large = 0;

  while (pos < len) {
    utf32_char = (uint32_t)data[pos];
    too_large |= utf32_char;
    *latin1_output++ = (char)(utf32_char & 0xFF);
    pos++;
  }
  if ((too_large & 0xFFFFFF00) != 0) {
    return 0;
  }
  return latin1_output - start;
}

inline result convert_with_errors(const char32_t *buf, size_t len,
                                  char *latin1_output) {
  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
  char *start{latin1_output};
  size_t pos = 0;
  while (pos < len) {
    if (pos + 2 <=
        len) { // if it is safe to read 8 more bytes, check that they are Latin1
      uint64_t v;
      ::memcpy(&v, data + pos, sizeof(uint64_t));
      if ((v & 0xFFFFFF00FFFFFF00) == 0) {
        *latin1_output++ = char(buf[pos]);
        *latin1_output++ = char(buf[pos + 1]);
        pos += 2;
        continue;
      }
    }
    uint32_t utf32_char = data[pos];
    if ((utf32_char & 0xFFFFFF00) ==
        0) { // Check if the character can be represented in Latin-1
      *latin1_output++ = (char)(utf32_char & 0xFF);
      pos++;
    } else {
      return result(error_code::TOO_LARGE, pos);
    }
  }
  return result(error_code::SUCCESS, latin1_output - start);
}

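// The fast path in convert_with_errors() checks two UTF-32 code units (8
// bytes) at a time: the mask 0xFFFFFF00FFFFFF00 clears the low byte of each
// 32-bit unit, so a zero result means that both characters fit in Latin-1.
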
} // namespace utf32_to_latin1
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif
/* end file src/scalar/utf32_to_latin1/utf32_to_latin1.h */
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
/* begin file src/scalar/utf8_to_latin1/valid_utf8_to_latin1.h */
#ifndef SIMDUTF_VALID_UTF8_TO_LATIN1_H
#define SIMDUTF_VALID_UTF8_TO_LATIN1_H

namespace simdutf {
namespace scalar {
namespace {
namespace utf8_to_latin1 {

inline size_t convert_valid(const char *buf, size_t len, char *latin_output) {
  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);

  size_t pos = 0;
  char *start{latin_output};

  while (pos < len) {
    // try to convert the next block of 16 ASCII bytes
    if (pos + 16 <=
        len) { // if it is safe to read 16 more bytes, check that they are ascii
      uint64_t v1;
      ::memcpy(&v1, data + pos, sizeof(uint64_t));
      uint64_t v2;
      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
      uint64_t v{v1 |
                 v2}; // We are only interested in these bits: 1000 1000 1000
                      // 1000, so it makes sense to concatenate everything
      if ((v & 0x8080808080808080) ==
          0) { // if NONE of these are set, i.e. all of them are zero, then
               // everything is ASCII
        size_t final_pos = pos + 16;
        while (pos < final_pos) {
          *latin_output++ = char(buf[pos]);
          pos++;
        }
        continue;
      }
    }

    // suppose it is not an all-ASCII byte sequence
    uint8_t leading_byte = data[pos]; // leading byte
    if (leading_byte < 0b10000000) {
      // converting one ASCII byte !!!
      *latin_output++ = char(leading_byte);
      pos++;
    } else if ((leading_byte & 0b11100000) ==
               0b11000000) { // the first three bits indicate:
      // We have a two-byte UTF-8
      if (pos + 1 >= len) {
        break;
      } // minimal bound checking
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return 0;
      } // checks if the next byte is a valid continuation byte in UTF-8. A
        // valid continuation byte starts with 10.
      // range check -
      uint32_t code_point =
          (leading_byte & 0b00011111) << 6 |
          (data[pos + 1] &
           0b00111111); // assembles the Unicode code point from the two bytes.
                        // It does this by discarding the leading 110 and 10
                        // bits from the two bytes, shifting the remaining bits
                        // of the first byte, and then combining the results
                        // with a bitwise OR operation.
      *latin_output++ = char(code_point);
      pos += 2;
    } else {
      // we may have a continuation but we do not do error checking
      return 0;
    }
  }
  return latin_output - start;
}

} // namespace utf8_to_latin1
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif
/* end file src/scalar/utf8_to_latin1/valid_utf8_to_latin1.h */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
/* begin file src/scalar/utf16_to_latin1/valid_utf16_to_latin1.h */
#ifndef SIMDUTF_VALID_UTF16_TO_LATIN1_H
#define SIMDUTF_VALID_UTF16_TO_LATIN1_H

namespace simdutf {
namespace scalar {
namespace {
namespace utf16_to_latin1 {

template <endianness big_endian>
inline size_t convert_valid(const char16_t *buf, size_t len,
                            char *latin_output) {
  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
  size_t pos = 0;
  char *start{latin_output};
  uint16_t word = 0;

  while (pos < len) {
    word = !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
    *latin_output++ = char(word);
    pos++;
  }

  return latin_output - start;
}

} // namespace utf16_to_latin1
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif
/* end file src/scalar/utf16_to_latin1/valid_utf16_to_latin1.h */
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
/* begin file src/scalar/utf32_to_latin1/valid_utf32_to_latin1.h */
#ifndef SIMDUTF_VALID_UTF32_TO_LATIN1_H
#define SIMDUTF_VALID_UTF32_TO_LATIN1_H

namespace simdutf {
namespace scalar {
namespace {
namespace utf32_to_latin1 {

inline size_t convert_valid(const char32_t *buf, size_t len,
                            char *latin1_output) {
  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
  char *start = latin1_output;
  uint32_t utf32_char;
  size_t pos = 0;

  while (pos < len) {
    utf32_char = (uint32_t)data[pos];

    if (pos + 2 <=
        len) { // if it is safe to read 8 more bytes, check that they are Latin1
      uint64_t v;
      ::memcpy(&v, data + pos, sizeof(uint64_t));
      if ((v & 0xFFFFFF00FFFFFF00) == 0) {
        *latin1_output++ = char(buf[pos]);
        *latin1_output++ = char(buf[pos + 1]);
        pos += 2;
        continue;
      } else {
        // output cannot be represented in latin1
        return 0;
      }
    }
    if ((utf32_char & 0xFFFFFF00) == 0) {
      *latin1_output++ = char(utf32_char);
    } else {
      // output cannot be represented in latin1
      return 0;
    }
    pos++;
  }
  return latin1_output - start;
}

} // namespace utf32_to_latin1
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif
/* end file src/scalar/utf32_to_latin1/valid_utf32_to_latin1.h */
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

/* begin file src/implementation.cpp */
#include <initializer_list>
#include <climits>
#include <type_traits>

static_assert(sizeof(uint8_t) == sizeof(char),
              "simdutf requires that uint8_t be a char");
static_assert(sizeof(uint16_t) == sizeof(char16_t),
              "simdutf requires that char16_t be 16 bits");
static_assert(sizeof(uint32_t) == sizeof(char32_t),
              "simdutf requires that char32_t be 32 bits");
// next line is redundant, but it is kept to catch defective systems.
static_assert(CHAR_BIT == 8, "simdutf requires 8-bit bytes");

// Useful for debugging purposes
namespace simdutf {
namespace {

template <typename T> std::string toBinaryString(T b) {
  std::string binary = "";
  T mask = T(1) << (sizeof(T) * CHAR_BIT - 1);
  while (mask > 0) {
    binary += ((b & mask) == 0) ? '0' : '1';
    mask >>= 1;
  }
  return binary;
}
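// For example, toBinaryString(uint8_t(0xC3)) returns "11000011". This helper
// is only meant for debugging.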
} // namespace
} // namespace simdutf

namespace simdutf {
bool implementation::supported_by_runtime_system() const {
  uint32_t required_instruction_sets = this->required_instruction_sets();
  uint32_t supported_instruction_sets =
      internal::detect_supported_architectures();
  return ((supported_instruction_sets & required_instruction_sets) ==
          required_instruction_sets);
}

#if SIMDUTF_FEATURE_DETECT_ENCODING
simdutf_warn_unused encoding_type implementation::autodetect_encoding(
    const char *input, size_t length) const noexcept {
  // If there is a BOM, then we trust it.
  auto bom_encoding = simdutf::BOM::check_bom(input, length);
  if (bom_encoding != encoding_type::unspecified) {
    return bom_encoding;
  }
  // UTF8 is common, it includes ASCII, and is commonly represented
  // without a BOM, so if it fits, go with that. Note that it is still
  // possible to get it wrong, we are only 'guessing'. If someone has UTF-16
  // data without a BOM, it could pass as UTF-8.
  //
  // An interesting twist might be to check for UTF-16 ASCII first (every
  // other byte is zero).
  if (validate_utf8(input, length)) {
    return encoding_type::UTF8;
  }
  // The next most common encoding that might appear without BOM is probably
  // UTF-16LE, so try that next.
  if ((length % 2) == 0) {
    // important: we need to divide by two
    if (validate_utf16le(reinterpret_cast<const char16_t *>(input),
                         length / 2)) {
      return encoding_type::UTF16_LE;
    }
  }
  if ((length % 4) == 0) {
    if (validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4)) {
      return encoding_type::UTF32_LE;
    }
  }
  return encoding_type::unspecified;
}

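// Note the ordering above: a buffer that validates as both UTF-8 and UTF-16LE
// is reported as UTF-8 because the UTF-8 check runs first, and the UTF-32LE
// check is attempted only when the length is a multiple of four.
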
#ifdef SIMDUTF_INTERNAL_TESTS
std::vector<implementation::TestProcedure>
implementation::internal_tests() const {
  return {};
}
#endif
#endif // SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_BASE64
simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
    const char *input, size_t length) const noexcept {
  return scalar::base64::maximal_binary_length_from_base64(input, length);
}

simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
    const char16_t *input, size_t length) const noexcept {
  return scalar::base64::maximal_binary_length_from_base64(input, length);
}

simdutf_warn_unused size_t implementation::base64_length_from_binary(
    size_t length, base64_options options) const noexcept {
  return scalar::base64::base64_length_from_binary(length, options);
}
#endif // SIMDUTF_FEATURE_BASE64

namespace internal {
// When there is a single implementation, we should not pay a price
// for dispatching to the best implementation. We should just use the
// one we have. This is a compile-time check.
#define SIMDUTF_SINGLE_IMPLEMENTATION                                         \
  (SIMDUTF_IMPLEMENTATION_ICELAKE + SIMDUTF_IMPLEMENTATION_HASWELL +          \
   SIMDUTF_IMPLEMENTATION_WESTMERE + SIMDUTF_IMPLEMENTATION_ARM64 +           \
   SIMDUTF_IMPLEMENTATION_PPC64 + SIMDUTF_IMPLEMENTATION_LSX +                \
   SIMDUTF_IMPLEMENTATION_LASX + SIMDUTF_IMPLEMENTATION_FALLBACK ==           \
   1)

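// Each SIMDUTF_IMPLEMENTATION_* macro above evaluates to 0 or 1, so the sum
// equals the number of compiled-in implementations; runtime dispatch can be
// bypassed exactly when that sum is 1.
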
// Static array of known implementations. We are hoping these get baked into
// the executable without requiring a static initializer.

#if SIMDUTF_IMPLEMENTATION_ICELAKE
static const icelake::implementation *get_icelake_singleton() {
  static const icelake::implementation icelake_singleton{};
  return &icelake_singleton;
}
#endif
#if SIMDUTF_IMPLEMENTATION_HASWELL
static const haswell::implementation *get_haswell_singleton() {
  static const haswell::implementation haswell_singleton{};
  return &haswell_singleton;
}
#endif
#if SIMDUTF_IMPLEMENTATION_WESTMERE
static const westmere::implementation *get_westmere_singleton() {
  static const westmere::implementation westmere_singleton{};
  return &westmere_singleton;
}
#endif
#if SIMDUTF_IMPLEMENTATION_ARM64
static const arm64::implementation *get_arm64_singleton() {
  static const arm64::implementation arm64_singleton{};
  return &arm64_singleton;
}
#endif
#if SIMDUTF_IMPLEMENTATION_PPC64
static const ppc64::implementation *get_ppc64_singleton() {
  static const ppc64::implementation ppc64_singleton{};
  return &ppc64_singleton;
}
#endif
#if SIMDUTF_IMPLEMENTATION_RVV
static const rvv::implementation *get_rvv_singleton() {
  static const rvv::implementation rvv_singleton{};
  return &rvv_singleton;
}
#endif
#if SIMDUTF_IMPLEMENTATION_LSX
static const lsx::implementation *get_lsx_singleton() {
  static const lsx::implementation lsx_singleton{};
  return &lsx_singleton;
}
#endif
#if SIMDUTF_IMPLEMENTATION_LASX
static const lasx::implementation *get_lasx_singleton() {
  static const lasx::implementation lasx_singleton{};
  return &lasx_singleton;
}
#endif
#if SIMDUTF_IMPLEMENTATION_FALLBACK
static const fallback::implementation *get_fallback_singleton() {
  static const fallback::implementation fallback_singleton{};
  return &fallback_singleton;
}
#endif

#if SIMDUTF_SINGLE_IMPLEMENTATION
static const implementation *get_single_implementation() {
  return
#if SIMDUTF_IMPLEMENTATION_ICELAKE
      get_icelake_singleton();
#endif
#if SIMDUTF_IMPLEMENTATION_HASWELL
      get_haswell_singleton();
#endif
#if SIMDUTF_IMPLEMENTATION_WESTMERE
      get_westmere_singleton();
#endif
#if SIMDUTF_IMPLEMENTATION_ARM64
      get_arm64_singleton();
#endif
#if SIMDUTF_IMPLEMENTATION_PPC64
      get_ppc64_singleton();
#endif
#if SIMDUTF_IMPLEMENTATION_LSX
      get_lsx_singleton();
#endif
#if SIMDUTF_IMPLEMENTATION_LASX
      get_lasx_singleton();
#endif
#if SIMDUTF_IMPLEMENTATION_FALLBACK
      get_fallback_singleton();
#endif
}
#endif

/**
|
|
* @private Detects best supported implementation on first use, and sets it
|
|
*/
|
|
class detect_best_supported_implementation_on_first_use final
|
|
: public implementation {
|
|
public:
|
|
std::string name() const noexcept final { return set_best()->name(); }
|
|
std::string description() const noexcept final {
|
|
return set_best()->description();
|
|
}
|
|
uint32_t required_instruction_sets() const noexcept final {
|
|
return set_best()->required_instruction_sets();
|
|
}
|
|
|
|
#if SIMDUTF_FEATURE_DETECT_ENCODING
|
|
simdutf_warn_unused int
|
|
detect_encodings(const char *input, size_t length) const noexcept override {
|
|
return set_best()->detect_encodings(input, length);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_DETECT_ENCODING
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
|
|
simdutf_warn_unused bool
|
|
validate_utf8(const char *buf, size_t len) const noexcept final override {
|
|
return set_best()->validate_utf8(buf, len);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
|
|
|
|
#if SIMDUTF_FEATURE_UTF8
|
|
simdutf_warn_unused result validate_utf8_with_errors(
|
|
const char *buf, size_t len) const noexcept final override {
|
|
return set_best()->validate_utf8_with_errors(buf, len);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8
|
|
|
|
#if SIMDUTF_FEATURE_ASCII
|
|
simdutf_warn_unused bool
|
|
validate_ascii(const char *buf, size_t len) const noexcept final override {
|
|
return set_best()->validate_ascii(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused result validate_ascii_with_errors(
|
|
const char *buf, size_t len) const noexcept final override {
|
|
return set_best()->validate_ascii_with_errors(buf, len);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_ASCII
|
|
|
|
#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
|
|
simdutf_warn_unused bool
|
|
validate_utf16le(const char16_t *buf,
|
|
size_t len) const noexcept final override {
|
|
return set_best()->validate_utf16le(buf, len);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
|
|
|
|
#if SIMDUTF_FEATURE_UTF16
|
|
simdutf_warn_unused bool
|
|
validate_utf16be(const char16_t *buf,
|
|
size_t len) const noexcept final override {
|
|
return set_best()->validate_utf16be(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused result validate_utf16le_with_errors(
|
|
const char16_t *buf, size_t len) const noexcept final override {
|
|
return set_best()->validate_utf16le_with_errors(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused result validate_utf16be_with_errors(
|
|
const char16_t *buf, size_t len) const noexcept final override {
|
|
return set_best()->validate_utf16be_with_errors(buf, len);
|
|
}
|
|
void to_well_formed_utf16be(const char16_t *input, size_t len,
|
|
char16_t *output) const noexcept final override {
|
|
return set_best()->to_well_formed_utf16be(input, len, output);
|
|
}
|
|
void to_well_formed_utf16le(const char16_t *input, size_t len,
|
|
char16_t *output) const noexcept final override {
|
|
return set_best()->to_well_formed_utf16le(input, len, output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF16
|
|
|
|
#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
|
|
simdutf_warn_unused bool
|
|
validate_utf32(const char32_t *buf,
|
|
size_t len) const noexcept final override {
|
|
return set_best()->validate_utf32(buf, len);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
|
|
|
|
#if SIMDUTF_FEATURE_UTF32
|
|
simdutf_warn_unused result validate_utf32_with_errors(
|
|
const char32_t *buf, size_t len) const noexcept final override {
|
|
return set_best()->validate_utf32_with_errors(buf, len);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF32
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
|
|
simdutf_warn_unused size_t
|
|
convert_latin1_to_utf8(const char *buf, size_t len,
|
|
char *utf8_output) const noexcept final override {
|
|
return set_best()->convert_latin1_to_utf8(buf, len, utf8_output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
|
|
|
|
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
|
|
simdutf_warn_unused size_t convert_latin1_to_utf16le(
|
|
const char *buf, size_t len,
|
|
char16_t *utf16_output) const noexcept final override {
|
|
return set_best()->convert_latin1_to_utf16le(buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_latin1_to_utf16be(
|
|
const char *buf, size_t len,
|
|
char16_t *utf16_output) const noexcept final override {
|
|
return set_best()->convert_latin1_to_utf16be(buf, len, utf16_output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
|
|
|
|
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
|
|
simdutf_warn_unused size_t convert_latin1_to_utf32(
|
|
const char *buf, size_t len,
|
|
char32_t *latin1_output) const noexcept final override {
|
|
return set_best()->convert_latin1_to_utf32(buf, len, latin1_output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
|
|
simdutf_warn_unused size_t
|
|
convert_utf8_to_latin1(const char *buf, size_t len,
|
|
char *latin1_output) const noexcept final override {
|
|
return set_best()->convert_utf8_to_latin1(buf, len, latin1_output);
|
|
}
|
|
|
|
simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
|
|
const char *buf, size_t len,
|
|
char *latin1_output) const noexcept final override {
|
|
return set_best()->convert_utf8_to_latin1_with_errors(buf, len,
|
|
latin1_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
|
|
const char *buf, size_t len,
|
|
char *latin1_output) const noexcept final override {
|
|
return set_best()->convert_valid_utf8_to_latin1(buf, len, latin1_output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
|
|
simdutf_warn_unused size_t convert_utf8_to_utf16le(
|
|
const char *buf, size_t len,
|
|
char16_t *utf16_output) const noexcept final override {
|
|
return set_best()->convert_utf8_to_utf16le(buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_utf8_to_utf16be(
|
|
const char *buf, size_t len,
|
|
char16_t *utf16_output) const noexcept final override {
|
|
return set_best()->convert_utf8_to_utf16be(buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
|
|
const char *buf, size_t len,
|
|
char16_t *utf16_output) const noexcept final override {
|
|
return set_best()->convert_utf8_to_utf16le_with_errors(buf, len,
|
|
utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
|
|
const char *buf, size_t len,
|
|
char16_t *utf16_output) const noexcept final override {
|
|
return set_best()->convert_utf8_to_utf16be_with_errors(buf, len,
|
|
utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
|
|
const char *buf, size_t len,
|
|
char16_t *utf16_output) const noexcept final override {
|
|
return set_best()->convert_valid_utf8_to_utf16le(buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
|
|
const char *buf, size_t len,
|
|
char16_t *utf16_output) const noexcept final override {
|
|
return set_best()->convert_valid_utf8_to_utf16be(buf, len, utf16_output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
|
|
simdutf_warn_unused size_t
|
|
convert_utf8_to_utf32(const char *buf, size_t len,
|
|
char32_t *utf32_output) const noexcept final override {
|
|
return set_best()->convert_utf8_to_utf32(buf, len, utf32_output);
|
|
}
|
|
|
|
simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
|
|
const char *buf, size_t len,
|
|
char32_t *utf32_output) const noexcept final override {
|
|
return set_best()->convert_utf8_to_utf32_with_errors(buf, len,
|
|
utf32_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
|
|
const char *buf, size_t len,
|
|
char32_t *utf32_output) const noexcept final override {
|
|
return set_best()->convert_valid_utf8_to_utf32(buf, len, utf32_output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
|
|
|
|
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
|
|
simdutf_warn_unused size_t
|
|
convert_utf16le_to_latin1(const char16_t *buf, size_t len,
|
|
char *latin1_output) const noexcept final override {
|
|
return set_best()->convert_utf16le_to_latin1(buf, len, latin1_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t
|
|
convert_utf16be_to_latin1(const char16_t *buf, size_t len,
|
|
char *latin1_output) const noexcept final override {
|
|
return set_best()->convert_utf16be_to_latin1(buf, len, latin1_output);
|
|
}
|
|
|
|
simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
|
|
const char16_t *buf, size_t len,
|
|
char *latin1_output) const noexcept final override {
|
|
return set_best()->convert_utf16le_to_latin1_with_errors(buf, len,
|
|
latin1_output);
|
|
}
|
|
|
|
simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
|
|
const char16_t *buf, size_t len,
|
|
char *latin1_output) const noexcept final override {
|
|
return set_best()->convert_utf16be_to_latin1_with_errors(buf, len,
|
|
latin1_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(
|
|
const char16_t *buf, size_t len,
|
|
char *latin1_output) const noexcept final override {
|
|
return set_best()->convert_valid_utf16le_to_latin1(buf, len, latin1_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(
|
|
const char16_t *buf, size_t len,
|
|
char *latin1_output) const noexcept final override {
|
|
return set_best()->convert_valid_utf16be_to_latin1(buf, len, latin1_output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
|
|
simdutf_warn_unused size_t
|
|
convert_utf16le_to_utf8(const char16_t *buf, size_t len,
|
|
char *utf8_output) const noexcept final override {
|
|
return set_best()->convert_utf16le_to_utf8(buf, len, utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t
|
|
convert_utf16be_to_utf8(const char16_t *buf, size_t len,
|
|
char *utf8_output) const noexcept final override {
|
|
return set_best()->convert_utf16be_to_utf8(buf, len, utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
|
|
const char16_t *buf, size_t len,
|
|
char *utf8_output) const noexcept final override {
|
|
return set_best()->convert_utf16le_to_utf8_with_errors(buf, len,
|
|
utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
|
|
const char16_t *buf, size_t len,
|
|
char *utf8_output) const noexcept final override {
|
|
return set_best()->convert_utf16be_to_utf8_with_errors(buf, len,
|
|
utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
|
|
const char16_t *buf, size_t len,
|
|
char *utf8_output) const noexcept final override {
|
|
return set_best()->convert_valid_utf16le_to_utf8(buf, len, utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
|
|
const char16_t *buf, size_t len,
|
|
char *utf8_output) const noexcept final override {
|
|
return set_best()->convert_valid_utf16be_to_utf8(buf, len, utf8_output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
|
|
|
|
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
|
|
simdutf_warn_unused size_t
|
|
convert_utf32_to_latin1(const char32_t *buf, size_t len,
|
|
char *latin1_output) const noexcept final override {
|
|
return set_best()->convert_utf32_to_latin1(buf, len, latin1_output);
|
|
}
|
|
|
|
simdutf_warn_unused result convert_utf32_to_latin1_with_errors(
|
|
const char32_t *buf, size_t len,
|
|
char *latin1_output) const noexcept final override {
|
|
return set_best()->convert_utf32_to_latin1_with_errors(buf, len,
|
|
latin1_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_valid_utf32_to_latin1(
|
|
const char32_t *buf, size_t len,
|
|
char *latin1_output) const noexcept final override {
|
|
return set_best()->convert_utf32_to_latin1(buf, len, latin1_output);
|
|
}
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  convert_utf32_to_utf8(const char32_t *buf, size_t len,
                        char *utf8_output) const noexcept final override {
    return set_best()->convert_utf32_to_utf8(buf, len, utf8_output);
  }

  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
      const char32_t *buf, size_t len,
      char *utf8_output) const noexcept final override {
    return set_best()->convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
  }

  simdutf_warn_unused size_t
  convert_valid_utf32_to_utf8(const char32_t *buf, size_t len,
                              char *utf8_output) const noexcept final override {
    return set_best()->convert_valid_utf32_to_utf8(buf, len, utf8_output);
  }
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t convert_utf32_to_utf16le(
      const char32_t *buf, size_t len,
      char16_t *utf16_output) const noexcept final override {
    return set_best()->convert_utf32_to_utf16le(buf, len, utf16_output);
  }

  simdutf_warn_unused size_t convert_utf32_to_utf16be(
      const char32_t *buf, size_t len,
      char16_t *utf16_output) const noexcept final override {
    return set_best()->convert_utf32_to_utf16be(buf, len, utf16_output);
  }

  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
      const char32_t *buf, size_t len,
      char16_t *utf16_output) const noexcept final override {
    return set_best()->convert_utf32_to_utf16le_with_errors(buf, len,
                                                            utf16_output);
  }

  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
      const char32_t *buf, size_t len,
      char16_t *utf16_output) const noexcept final override {
    return set_best()->convert_utf32_to_utf16be_with_errors(buf, len,
                                                            utf16_output);
  }

  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(
      const char32_t *buf, size_t len,
      char16_t *utf16_output) const noexcept final override {
    return set_best()->convert_valid_utf32_to_utf16le(buf, len, utf16_output);
  }

  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(
      const char32_t *buf, size_t len,
      char16_t *utf16_output) const noexcept final override {
    return set_best()->convert_valid_utf32_to_utf16be(buf, len, utf16_output);
  }

  simdutf_warn_unused size_t convert_utf16le_to_utf32(
      const char16_t *buf, size_t len,
      char32_t *utf32_output) const noexcept final override {
    return set_best()->convert_utf16le_to_utf32(buf, len, utf32_output);
  }

  simdutf_warn_unused size_t convert_utf16be_to_utf32(
      const char16_t *buf, size_t len,
      char32_t *utf32_output) const noexcept final override {
    return set_best()->convert_utf16be_to_utf32(buf, len, utf32_output);
  }

  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
      const char16_t *buf, size_t len,
      char32_t *utf32_output) const noexcept final override {
    return set_best()->convert_utf16le_to_utf32_with_errors(buf, len,
                                                            utf32_output);
  }

  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
      const char16_t *buf, size_t len,
      char32_t *utf32_output) const noexcept final override {
    return set_best()->convert_utf16be_to_utf32_with_errors(buf, len,
                                                            utf32_output);
  }

  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(
      const char16_t *buf, size_t len,
      char32_t *utf32_output) const noexcept final override {
    return set_best()->convert_valid_utf16le_to_utf32(buf, len, utf32_output);
  }

  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(
      const char16_t *buf, size_t len,
      char32_t *utf32_output) const noexcept final override {
    return set_best()->convert_valid_utf16be_to_utf32(buf, len, utf32_output);
  }
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16
  void change_endianness_utf16(const char16_t *buf, size_t len,
                               char16_t *output) const noexcept final override {
    set_best()->change_endianness_utf16(buf, len, output);
  }

  simdutf_warn_unused size_t
  count_utf16le(const char16_t *buf, size_t len) const noexcept final override {
    return set_best()->count_utf16le(buf, len);
  }

  simdutf_warn_unused size_t
  count_utf16be(const char16_t *buf, size_t len) const noexcept final override {
    return set_best()->count_utf16be(buf, len);
  }
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8
  simdutf_warn_unused size_t
  count_utf8(const char *buf, size_t len) const noexcept final override {
    return set_best()->count_utf8(buf, len);
  }
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  latin1_length_from_utf8(const char *buf, size_t len) const noexcept override {
    return set_best()->latin1_length_from_utf8(buf, len);
  }
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  utf8_length_from_latin1(const char *buf, size_t len) const noexcept override {
    return set_best()->utf8_length_from_latin1(buf, len);
  }
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t utf8_length_from_utf16le(
      const char16_t *buf, size_t len) const noexcept override {
    return set_best()->utf8_length_from_utf16le(buf, len);
  }

  simdutf_warn_unused size_t utf8_length_from_utf16be(
      const char16_t *buf, size_t len) const noexcept override {
    return set_best()->utf8_length_from_utf16be(buf, len);
  }
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t utf32_length_from_utf16le(
      const char16_t *buf, size_t len) const noexcept override {
    return set_best()->utf32_length_from_utf16le(buf, len);
  }

  simdutf_warn_unused size_t utf32_length_from_utf16be(
      const char16_t *buf, size_t len) const noexcept override {
    return set_best()->utf32_length_from_utf16be(buf, len);
  }
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t
  utf16_length_from_utf8(const char *buf, size_t len) const noexcept override {
    return set_best()->utf16_length_from_utf8(buf, len);
  }
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t utf8_length_from_utf32(
      const char32_t *buf, size_t len) const noexcept override {
    return set_best()->utf8_length_from_utf32(buf, len);
  }
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t utf16_length_from_utf32(
      const char32_t *buf, size_t len) const noexcept override {
    return set_best()->utf16_length_from_utf32(buf, len);
  }
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  utf32_length_from_utf8(const char *buf, size_t len) const noexcept override {
    return set_best()->utf32_length_from_utf8(buf, len);
  }
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_BASE64
  simdutf_warn_unused result base64_to_binary(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_handling_options =
          last_chunk_handling_options::loose) const noexcept override {
    return set_best()->base64_to_binary(input, length, output, options,
                                        last_chunk_handling_options);
  }

  simdutf_warn_unused full_result base64_to_binary_details(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_handling_options =
          last_chunk_handling_options::loose) const noexcept override {
    return set_best()->base64_to_binary_details(input, length, output, options,
                                                last_chunk_handling_options);
  }

  simdutf_warn_unused result base64_to_binary(
      const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_handling_options =
          last_chunk_handling_options::loose) const noexcept override {
    return set_best()->base64_to_binary(input, length, output, options,
                                        last_chunk_handling_options);
  }

  simdutf_warn_unused full_result base64_to_binary_details(
      const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_handling_options =
          last_chunk_handling_options::loose) const noexcept override {
    return set_best()->base64_to_binary_details(input, length, output, options,
                                                last_chunk_handling_options);
  }

  size_t binary_to_base64(const char *input, size_t length, char *output,
                          base64_options options) const noexcept override {
    return set_best()->binary_to_base64(input, length, output, options);
  }
#endif // SIMDUTF_FEATURE_BASE64

  simdutf_really_inline
  detect_best_supported_implementation_on_first_use() noexcept
      : implementation("best_supported_detector",
                       "Detects the best supported implementation and sets it",
                       0) {}

private:
  const implementation *set_best() const noexcept;
};

static_assert(std::is_trivially_destructible<
                  detect_best_supported_implementation_on_first_use>::value,
              "detect_best_supported_implementation_on_first_use should be "
              "trivially destructible");

static const std::initializer_list<const implementation *> &
get_available_implementation_pointers() {
  static const std::initializer_list<const implementation *>
      available_implementation_pointers{
#if SIMDUTF_IMPLEMENTATION_ICELAKE
          get_icelake_singleton(),
#endif
#if SIMDUTF_IMPLEMENTATION_HASWELL
          get_haswell_singleton(),
#endif
#if SIMDUTF_IMPLEMENTATION_WESTMERE
          get_westmere_singleton(),
#endif
#if SIMDUTF_IMPLEMENTATION_ARM64
          get_arm64_singleton(),
#endif
#if SIMDUTF_IMPLEMENTATION_PPC64
          get_ppc64_singleton(),
#endif
#if SIMDUTF_IMPLEMENTATION_RVV
          get_rvv_singleton(),
#endif
#if SIMDUTF_IMPLEMENTATION_LSX
          get_lsx_singleton(),
#endif
#if SIMDUTF_IMPLEMENTATION_LASX
          get_lasx_singleton(),
#endif
#if SIMDUTF_IMPLEMENTATION_FALLBACK
          get_fallback_singleton(),
#endif
      }; // available_implementation_pointers
  return available_implementation_pointers;
}

// So we can return UNSUPPORTED_ARCHITECTURE from the parser when there is no
// support
class unsupported_implementation final : public implementation {
public:
#if SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused int detect_encodings(const char *,
                                           size_t) const noexcept override {
    return encoding_type::unspecified;
  }
#endif // SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused bool validate_utf8(const char *,
                                         size_t) const noexcept final override {
    return false; // Just refuse to validate. Given that we ship a fallback
    // implementation, it is unlikely that unsupported_implementation will
    // ever be used. If it is used, it flags all strings as invalid. The
    // alternative would be to return an error_code from which the user has to
    // figure out whether the string is valid UTF-8, which seems like a lot of
    // work just to handle the very unlikely case of an unsupported
    // implementation. And when that case does arise, what are the chances
    // that the programmer has a fallback? Given that *we* provide the
    // fallback, they would need a fallback for our fallback.
  }
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF8
  simdutf_warn_unused result validate_utf8_with_errors(
      const char *, size_t) const noexcept final override {
    return result(error_code::OTHER, 0);
  }
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_ASCII
  simdutf_warn_unused bool
  validate_ascii(const char *, size_t) const noexcept final override {
    return false;
  }

  simdutf_warn_unused result validate_ascii_with_errors(
      const char *, size_t) const noexcept final override {
    return result(error_code::OTHER, 0);
  }
#endif // SIMDUTF_FEATURE_ASCII

#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused bool
  validate_utf16le(const char16_t *, size_t) const noexcept final override {
    return false;
  }
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused bool
  validate_utf16be(const char16_t *, size_t) const noexcept final override {
    return false;
  }

  simdutf_warn_unused result validate_utf16le_with_errors(
      const char16_t *, size_t) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused result validate_utf16be_with_errors(
      const char16_t *, size_t) const noexcept final override {
    return result(error_code::OTHER, 0);
  }
  void to_well_formed_utf16be(const char16_t *, size_t,
                              char16_t *) const noexcept final override {}
  void to_well_formed_utf16le(const char16_t *, size_t,
                              char16_t *) const noexcept final override {}
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
  simdutf_warn_unused bool
  validate_utf32(const char32_t *, size_t) const noexcept final override {
    return false;
  }
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused result validate_utf32_with_errors(
      const char32_t *, size_t) const noexcept final override {
    return result(error_code::OTHER, 0);
  }
#endif // SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_latin1_to_utf8(
      const char *, size_t, char *) const noexcept final override {
    return 0;
  }
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_latin1_to_utf16le(
      const char *, size_t, char16_t *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_latin1_to_utf16be(
      const char *, size_t, char16_t *) const noexcept final override {
    return 0;
  }
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_latin1_to_utf32(
      const char *, size_t, char32_t *) const noexcept final override {
    return 0;
  }
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_utf8_to_latin1(
      const char *, size_t, char *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
      const char *, size_t, char *) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
      const char *, size_t, char *) const noexcept final override {
    return 0;
  }
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t convert_utf8_to_utf16le(
      const char *, size_t, char16_t *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_utf8_to_utf16be(
      const char *, size_t, char16_t *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
      const char *, size_t, char16_t *) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
      const char *, size_t, char16_t *) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
      const char *, size_t, char16_t *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
      const char *, size_t, char16_t *) const noexcept final override {
    return 0;
  }
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t convert_utf8_to_utf32(
      const char *, size_t, char32_t *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
      const char *, size_t, char32_t *) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
      const char *, size_t, char32_t *) const noexcept final override {
    return 0;
  }
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_utf16le_to_latin1(
      const char16_t *, size_t, char *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_utf16be_to_latin1(
      const char16_t *, size_t, char *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
      const char16_t *, size_t, char *) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
      const char16_t *, size_t, char *) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(
      const char16_t *, size_t, char *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(
      const char16_t *, size_t, char *) const noexcept final override {
    return 0;
  }
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t convert_utf16le_to_utf8(
      const char16_t *, size_t, char *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_utf16be_to_utf8(
      const char16_t *, size_t, char *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
      const char16_t *, size_t, char *) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
      const char16_t *, size_t, char *) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
      const char16_t *, size_t, char *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
      const char16_t *, size_t, char *) const noexcept final override {
    return 0;
  }
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t convert_utf32_to_latin1(
      const char32_t *, size_t, char *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused result convert_utf32_to_latin1_with_errors(
      const char32_t *, size_t, char *) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused size_t convert_valid_utf32_to_latin1(
      const char32_t *, size_t, char *) const noexcept final override {
    return 0;
  }
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t convert_utf32_to_utf8(
      const char32_t *, size_t, char *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
      const char32_t *, size_t, char *) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
      const char32_t *, size_t, char *) const noexcept final override {
    return 0;
  }
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t convert_utf32_to_utf16le(
      const char32_t *, size_t, char16_t *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_utf32_to_utf16be(
      const char32_t *, size_t, char16_t *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
      const char32_t *, size_t, char16_t *) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
      const char32_t *, size_t, char16_t *) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(
      const char32_t *, size_t, char16_t *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(
      const char32_t *, size_t, char16_t *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_utf16le_to_utf32(
      const char16_t *, size_t, char32_t *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_utf16be_to_utf32(
      const char16_t *, size_t, char32_t *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
      const char16_t *, size_t, char32_t *) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
      const char16_t *, size_t, char32_t *) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(
      const char16_t *, size_t, char32_t *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(
      const char16_t *, size_t, char32_t *) const noexcept final override {
    return 0;
  }
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16
  void change_endianness_utf16(const char16_t *, size_t,
                               char16_t *) const noexcept final override {}

  simdutf_warn_unused size_t
  count_utf16le(const char16_t *, size_t) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t
  count_utf16be(const char16_t *, size_t) const noexcept final override {
    return 0;
  }
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8
  simdutf_warn_unused size_t count_utf8(const char *,
                                        size_t) const noexcept final override {
    return 0;
  }
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  latin1_length_from_utf8(const char *, size_t) const noexcept override {
    return 0;
  }
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
  simdutf_warn_unused size_t
  utf8_length_from_latin1(const char *, size_t) const noexcept override {
    return 0;
  }
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t
  utf8_length_from_utf16le(const char16_t *, size_t) const noexcept override {
    return 0;
  }

  simdutf_warn_unused size_t
  utf8_length_from_utf16be(const char16_t *, size_t) const noexcept override {
    return 0;
  }
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  utf32_length_from_utf16le(const char16_t *, size_t) const noexcept override {
    return 0;
  }

  simdutf_warn_unused size_t
  utf32_length_from_utf16be(const char16_t *, size_t) const noexcept override {
    return 0;
  }
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
  simdutf_warn_unused size_t
  utf16_length_from_utf8(const char *, size_t) const noexcept override {
    return 0;
  }
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  utf8_length_from_utf32(const char32_t *, size_t) const noexcept override {
    return 0;
  }
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  utf16_length_from_utf32(const char32_t *, size_t) const noexcept override {
    return 0;
  }
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
  simdutf_warn_unused size_t
  utf32_length_from_utf8(const char *, size_t) const noexcept override {
    return 0;
  }
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_BASE64
  simdutf_warn_unused result
  base64_to_binary(const char *, size_t, char *, base64_options,
                   last_chunk_handling_options) const noexcept override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused full_result base64_to_binary_details(
      const char *, size_t, char *, base64_options,
      last_chunk_handling_options) const noexcept override {
    return full_result(error_code::OTHER, 0, 0);
  }

  simdutf_warn_unused result
  base64_to_binary(const char16_t *, size_t, char *, base64_options,
                   last_chunk_handling_options) const noexcept override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused full_result base64_to_binary_details(
      const char16_t *, size_t, char *, base64_options,
      last_chunk_handling_options) const noexcept override {
    return full_result(error_code::OTHER, 0, 0);
  }

  size_t binary_to_base64(const char *, size_t, char *,
                          base64_options) const noexcept override {
    return 0;
  }
#endif // SIMDUTF_FEATURE_BASE64

  unsupported_implementation()
      : implementation("unsupported",
                       "Unsupported CPU (no detected SIMD instructions)", 0) {}
};

const unsupported_implementation *get_unsupported_singleton() {
  static const unsupported_implementation unsupported_singleton{};
  return &unsupported_singleton;
}
static_assert(std::is_trivially_destructible<unsupported_implementation>::value,
              "unsupported_singleton should be trivially destructible");

size_t available_implementation_list::size() const noexcept {
  return internal::get_available_implementation_pointers().size();
}
const implementation *const *
available_implementation_list::begin() const noexcept {
  return internal::get_available_implementation_pointers().begin();
}
const implementation *const *
available_implementation_list::end() const noexcept {
  return internal::get_available_implementation_pointers().end();
}
const implementation *
available_implementation_list::detect_best_supported() const noexcept {
  // They are prelisted in priority order, so we just go down the list
  uint32_t supported_instruction_sets =
      internal::detect_supported_architectures();
  for (const implementation *impl :
       internal::get_available_implementation_pointers()) {
    uint32_t required_instruction_sets = impl->required_instruction_sets();
    if ((supported_instruction_sets & required_instruction_sets) ==
        required_instruction_sets) {
      return impl;
    }
  }
  return get_unsupported_singleton(); // unreachable in practice: the fallback
                                      // implementation is normally compiled in
}
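
// To illustrate the subset test in the loop above (bit names are
// illustrative, not the actual constants): if the CPU reports
// supported = AVX2 | BMI2 | SSE42 and a kernel requires AVX2 | BMI2, then
//   (supported & required) == required
// holds and that kernel is chosen. A kernel requiring instruction-set bits
// the CPU lacks fails the test and the loop falls through to the next entry,
// ending at the portable fallback.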

const implementation *
detect_best_supported_implementation_on_first_use::set_best() const noexcept {
  SIMDUTF_PUSH_DISABLE_WARNINGS
  SIMDUTF_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC:
                                     // manually verified this is safe
  char *force_implementation_name = getenv("SIMDUTF_FORCE_IMPLEMENTATION");
  SIMDUTF_POP_DISABLE_WARNINGS

  if (force_implementation_name) {
    auto force_implementation =
        get_available_implementations()[force_implementation_name];
    if (force_implementation) {
      return get_active_implementation() = force_implementation;
    } else {
      // Note: abort() and stderr usage within the library is forbidden.
      return get_active_implementation() = get_unsupported_singleton();
    }
  }
  return get_active_implementation() =
             get_available_implementations().detect_best_supported();
}
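
// For testing or benchmarking, a specific kernel can be forced through the
// environment variable read above; a sketch (shell syntax, assuming the
// "fallback" kernel was compiled in):
//
//   SIMDUTF_FORCE_IMPLEMENTATION=fallback ./my_program
//
// An unknown name does not abort: the unsupported implementation is installed
// instead and subsequent calls fail gracefully, as set_best() shows.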

} // namespace internal

/**
 * The list of available implementations compiled into simdutf.
 */
SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list &
get_available_implementations() {
  static const internal::available_implementation_list
      available_implementations{};
  return available_implementations;
}
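
// A minimal sketch of enumerating the compiled-in implementations through the
// accessor above (name() and description() are the strings passed to the
// implementation constructors):
//
//   for (const simdutf::implementation *impl :
//        simdutf::get_available_implementations()) {
//     std::printf("%s: %s\n", impl->name().c_str(),
//                 impl->description().c_str());
//   }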

/**
 * The active implementation.
 */
SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> &
get_active_implementation() {
#if SIMDUTF_SINGLE_IMPLEMENTATION
  // skip runtime detection
  static internal::atomic_ptr<const implementation> active_implementation{
      internal::get_single_implementation()};
  return active_implementation;
#else
  static const internal::detect_best_supported_implementation_on_first_use
      detect_best_supported_implementation_on_first_use_singleton;
  static internal::atomic_ptr<const implementation> active_implementation{
      &detect_best_supported_implementation_on_first_use_singleton};
  return active_implementation;
#endif
}
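
// The active implementation can also be pinned explicitly; a hedged sketch
// (the bracket lookup returns a null pointer for unknown names):
//
//   const simdutf::implementation *impl =
//       simdutf::get_available_implementations()["icelake"];
//   if (impl != nullptr && impl->supported_by_runtime_system()) {
//     simdutf::get_active_implementation() = impl;
//   }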

#if SIMDUTF_SINGLE_IMPLEMENTATION
const implementation *get_default_implementation() {
  return internal::get_single_implementation();
}
#else
internal::atomic_ptr<const implementation> &get_default_implementation() {
  return get_active_implementation();
}
#endif
#define SIMDUTF_GET_CURRENT_IMPLEMENTION

#if SIMDUTF_FEATURE_UTF8
simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept {
  return get_default_implementation()->validate_utf8(buf, len);
}
simdutf_warn_unused result validate_utf8_with_errors(const char *buf,
                                                     size_t len) noexcept {
  return get_default_implementation()->validate_utf8_with_errors(buf, len);
}
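
// Typical use of the two validators above, as a sketch: the boolean form
// answers yes/no, while the *_with_errors form also locates the problem
// (result::count is the index of the first invalid byte on failure, or the
// number of validated code units on success).
//
//   const char data[] = "\xC3\xA9tude"; // UTF-8 for "étude"
//   bool ok = simdutf::validate_utf8(data, sizeof(data) - 1);
//   simdutf::result r =
//       simdutf::validate_utf8_with_errors(data, sizeof(data) - 1);
//   // here ok is true and r.error == simdutf::error_code::SUCCESS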
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_ASCII
simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept {
  return get_default_implementation()->validate_ascii(buf, len);
}
simdutf_warn_unused result validate_ascii_with_errors(const char *buf,
                                                      size_t len) noexcept {
  return get_default_implementation()->validate_ascii_with_errors(buf, len);
}
#endif // SIMDUTF_FEATURE_ASCII

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t convert_utf8_to_utf16(
    const char *input, size_t length, char16_t *utf16_output) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return convert_utf8_to_utf16be(input, length, utf16_output);
#else
  return convert_utf8_to_utf16le(input, length, utf16_output);
#endif
}
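
// The wrapper above dispatches on the host byte order, so callers always get
// native-endian UTF-16. A sketch of the usual two-step pattern (size the
// output with the matching length helper defined later in this file, then
// convert; a return value of 0 signals invalid input):
//
//   size_t units = simdutf::utf16_length_from_utf8(input, length);
//   std::vector<char16_t> out(units);
//   size_t written =
//       simdutf::convert_utf8_to_utf16(input, length, out.data());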
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t convert_latin1_to_utf8(const char *buf, size_t len,
                                                  char *utf8_output) noexcept {
  return get_default_implementation()->convert_latin1_to_utf8(buf, len,
                                                              utf8_output);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t convert_latin1_to_utf16le(
    const char *buf, size_t len, char16_t *utf16_output) noexcept {
  return get_default_implementation()->convert_latin1_to_utf16le(buf, len,
                                                                 utf16_output);
}
simdutf_warn_unused size_t convert_latin1_to_utf16be(
    const char *buf, size_t len, char16_t *utf16_output) noexcept {
  return get_default_implementation()->convert_latin1_to_utf16be(buf, len,
                                                                 utf16_output);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t convert_latin1_to_utf32(
    const char *buf, size_t len, char32_t *latin1_output) noexcept {
  return get_default_implementation()->convert_latin1_to_utf32(buf, len,
                                                               latin1_output);
}
simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) noexcept {
  return length;
}
simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) noexcept {
  return length;
}
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t convert_utf8_to_latin1(
    const char *buf, size_t len, char *latin1_output) noexcept {
  return get_default_implementation()->convert_utf8_to_latin1(buf, len,
                                                              latin1_output);
}
simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
    const char *buf, size_t len, char *latin1_output) noexcept {
  return get_default_implementation()->convert_utf8_to_latin1_with_errors(
      buf, len, latin1_output);
}
simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
    const char *buf, size_t len, char *latin1_output) noexcept {
  return get_default_implementation()->convert_valid_utf8_to_latin1(
      buf, len, latin1_output);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t convert_utf8_to_utf16le(
    const char *input, size_t length, char16_t *utf16_output) noexcept {
  return get_default_implementation()->convert_utf8_to_utf16le(input, length,
                                                               utf16_output);
}
simdutf_warn_unused size_t convert_utf8_to_utf16be(
    const char *input, size_t length, char16_t *utf16_output) noexcept {
  return get_default_implementation()->convert_utf8_to_utf16be(input, length,
                                                               utf16_output);
}
simdutf_warn_unused result convert_utf8_to_utf16_with_errors(
    const char *input, size_t length, char16_t *utf16_output) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return convert_utf8_to_utf16be_with_errors(input, length, utf16_output);
#else
  return convert_utf8_to_utf16le_with_errors(input, length, utf16_output);
#endif
}
simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
    const char *input, size_t length, char16_t *utf16_output) noexcept {
  return get_default_implementation()->convert_utf8_to_utf16le_with_errors(
      input, length, utf16_output);
}
simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
    const char *input, size_t length, char16_t *utf16_output) noexcept {
  return get_default_implementation()->convert_utf8_to_utf16be_with_errors(
      input, length, utf16_output);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t convert_utf8_to_utf32(
    const char *input, size_t length, char32_t *utf32_output) noexcept {
  return get_default_implementation()->convert_utf8_to_utf32(input, length,
                                                             utf32_output);
}
simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
    const char *input, size_t length, char32_t *utf32_output) noexcept {
  return get_default_implementation()->convert_utf8_to_utf32_with_errors(
      input, length, utf32_output);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16
simdutf_warn_unused bool validate_utf16(const char16_t *buf,
                                        size_t len) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return validate_utf16be(buf, len);
#else
  return validate_utf16le(buf, len);
#endif
}
void to_well_formed_utf16be(const char16_t *input, size_t len,
                            char16_t *output) noexcept {
  return get_default_implementation()->to_well_formed_utf16be(input, len,
                                                              output);
}
void to_well_formed_utf16le(const char16_t *input, size_t len,
                            char16_t *output) noexcept {
  return get_default_implementation()->to_well_formed_utf16le(input, len,
                                                              output);
}
void to_well_formed_utf16(const char16_t *input, size_t len,
                          char16_t *output) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  to_well_formed_utf16be(input, len, output);
#else
  to_well_formed_utf16le(input, len, output);
#endif
}
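
// Sketch of intent for the to_well_formed_utf16* functions above: the output
// is a copy of the input in which ill-formed code units (unpaired surrogates)
// are replaced with U+FFFD, so the result always validates. For example (this
// sketch assumes in-place operation is permitted; otherwise use a distinct
// output buffer):
//
//   char16_t text[] = {u'a', 0xD800, u'b'}; // 0xD800 is a lone surrogate
//   simdutf::to_well_formed_utf16(text, 3, text);
//   // text is now {u'a', 0xFFFD, u'b'} and validate_utf16(text, 3) holds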
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
                                          size_t len) noexcept {
  return get_default_implementation()->validate_utf16le(buf, len);
}
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF16
simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
                                          size_t len) noexcept {
  return get_default_implementation()->validate_utf16be(buf, len);
}
simdutf_warn_unused result validate_utf16_with_errors(const char16_t *buf,
                                                      size_t len) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return validate_utf16be_with_errors(buf, len);
#else
  return validate_utf16le_with_errors(buf, len);
#endif
}
simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf,
                                                        size_t len) noexcept {
  return get_default_implementation()->validate_utf16le_with_errors(buf, len);
}
simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf,
                                                        size_t len) noexcept {
  return get_default_implementation()->validate_utf16be_with_errors(buf, len);
}
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF32
simdutf_warn_unused bool validate_utf32(const char32_t *buf,
                                        size_t len) noexcept {
  return get_default_implementation()->validate_utf32(buf, len);
}
simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf,
                                                      size_t len) noexcept {
  return get_default_implementation()->validate_utf32_with_errors(buf, len);
}
#endif // SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t convert_valid_utf8_to_utf16(
    const char *input, size_t length, char16_t *utf16_buffer) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return convert_valid_utf8_to_utf16be(input, length, utf16_buffer);
#else
  return convert_valid_utf8_to_utf16le(input, length, utf16_buffer);
#endif
}
simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
    const char *input, size_t length, char16_t *utf16_buffer) noexcept {
  return get_default_implementation()->convert_valid_utf8_to_utf16le(
      input, length, utf16_buffer);
}
simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
    const char *input, size_t length, char16_t *utf16_buffer) noexcept {
  return get_default_implementation()->convert_valid_utf8_to_utf16be(
      input, length, utf16_buffer);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
    const char *input, size_t length, char32_t *utf32_buffer) noexcept {
  return get_default_implementation()->convert_valid_utf8_to_utf32(
      input, length, utf32_buffer);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t *buf,
                                                 size_t len,
                                                 char *utf8_buffer) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return convert_utf16be_to_utf8(buf, len, utf8_buffer);
#else
  return convert_utf16le_to_utf8(buf, len, utf8_buffer);
#endif
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t convert_utf16_to_latin1(
    const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return convert_utf16be_to_latin1(buf, len, latin1_buffer);
#else
  return convert_utf16le_to_latin1(buf, len, latin1_buffer);
#endif
}
simdutf_warn_unused size_t convert_latin1_to_utf16(
    const char *buf, size_t len, char16_t *utf16_output) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return convert_latin1_to_utf16be(buf, len, utf16_output);
#else
  return convert_latin1_to_utf16le(buf, len, utf16_output);
#endif
}
simdutf_warn_unused size_t convert_utf16be_to_latin1(
    const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
  return get_default_implementation()->convert_utf16be_to_latin1(
      buf, len, latin1_buffer);
}
simdutf_warn_unused size_t convert_utf16le_to_latin1(
    const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
  return get_default_implementation()->convert_utf16le_to_latin1(
      buf, len, latin1_buffer);
}
simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(
    const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
  return get_default_implementation()->convert_valid_utf16be_to_latin1(
      buf, len, latin1_buffer);
}
simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(
    const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
  return get_default_implementation()->convert_valid_utf16le_to_latin1(
      buf, len, latin1_buffer);
}
simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
    const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
  return get_default_implementation()->convert_utf16le_to_latin1_with_errors(
      buf, len, latin1_buffer);
}
simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
    const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
  return get_default_implementation()->convert_utf16be_to_latin1_with_errors(
      buf, len, latin1_buffer);
}
simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) noexcept {
  return length;
}
simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) noexcept {
  return length;
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t *buf,
                                                   size_t len,
                                                   char *utf8_buffer) noexcept {
  return get_default_implementation()->convert_utf16le_to_utf8(buf, len,
                                                               utf8_buffer);
}
simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t *buf,
                                                   size_t len,
                                                   char *utf8_buffer) noexcept {
  return get_default_implementation()->convert_utf16be_to_utf8(buf, len,
                                                               utf8_buffer);
}
simdutf_warn_unused result convert_utf16_to_utf8_with_errors(
    const char16_t *buf, size_t len, char *utf8_buffer) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer);
#else
  return convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer);
#endif
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused result convert_utf16_to_latin1_with_errors(
    const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return convert_utf16be_to_latin1_with_errors(buf, len, latin1_buffer);
#else
  return convert_utf16le_to_latin1_with_errors(buf, len, latin1_buffer);
#endif
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
    const char16_t *buf, size_t len, char *utf8_buffer) noexcept {
  return get_default_implementation()->convert_utf16le_to_utf8_with_errors(
      buf, len, utf8_buffer);
}
simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
    const char16_t *buf, size_t len, char *utf8_buffer) noexcept {
  return get_default_implementation()->convert_utf16be_to_utf8_with_errors(
      buf, len, utf8_buffer);
}
simdutf_warn_unused size_t convert_valid_utf16_to_utf8(
    const char16_t *buf, size_t len, char *utf8_buffer) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return convert_valid_utf16be_to_utf8(buf, len, utf8_buffer);
#else
  return convert_valid_utf16le_to_utf8(buf, len, utf8_buffer);
#endif
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t convert_valid_utf16_to_latin1(
    const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return convert_valid_utf16be_to_latin1(buf, len, latin1_buffer);
#else
  return convert_valid_utf16le_to_latin1(buf, len, latin1_buffer);
#endif
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
    const char16_t *buf, size_t len, char *utf8_buffer) noexcept {
  return get_default_implementation()->convert_valid_utf16le_to_utf8(
      buf, len, utf8_buffer);
}
simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
    const char16_t *buf, size_t len, char *utf8_buffer) noexcept {
  return get_default_implementation()->convert_valid_utf16be_to_utf8(
      buf, len, utf8_buffer);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t *buf,
                                                 size_t len,
                                                 char *utf8_buffer) noexcept {
  return get_default_implementation()->convert_utf32_to_utf8(buf, len,
                                                             utf8_buffer);
}
simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
    const char32_t *buf, size_t len, char *utf8_buffer) noexcept {
  return get_default_implementation()->convert_utf32_to_utf8_with_errors(
      buf, len, utf8_buffer);
}
simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
    const char32_t *buf, size_t len, char *utf8_buffer) noexcept {
  return get_default_implementation()->convert_valid_utf32_to_utf8(
      buf, len, utf8_buffer);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t convert_utf32_to_utf16(
    const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return convert_utf32_to_utf16be(buf, len, utf16_buffer);
#else
  return convert_utf32_to_utf16le(buf, len, utf16_buffer);
#endif
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t convert_utf32_to_latin1(
    const char32_t *input, size_t length, char *latin1_output) noexcept {
  return get_default_implementation()->convert_utf32_to_latin1(input, length,
                                                               latin1_output);
}
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t convert_utf32_to_utf16le(
    const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
  return get_default_implementation()->convert_utf32_to_utf16le(buf, len,
                                                                utf16_buffer);
}
simdutf_warn_unused size_t convert_utf32_to_utf16be(
    const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
  return get_default_implementation()->convert_utf32_to_utf16be(buf, len,
                                                                utf16_buffer);
}
simdutf_warn_unused result convert_utf32_to_utf16_with_errors(
    const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer);
#else
  return convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer);
#endif
}
simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
    const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
  return get_default_implementation()->convert_utf32_to_utf16le_with_errors(
      buf, len, utf16_buffer);
}
simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
    const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
  return get_default_implementation()->convert_utf32_to_utf16be_with_errors(
      buf, len, utf16_buffer);
}
simdutf_warn_unused size_t convert_valid_utf32_to_utf16(
    const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return convert_valid_utf32_to_utf16be(buf, len, utf16_buffer);
#else
  return convert_valid_utf32_to_utf16le(buf, len, utf16_buffer);
#endif
}
simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(
    const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
  return get_default_implementation()->convert_valid_utf32_to_utf16le(
      buf, len, utf16_buffer);
}
simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(
    const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
  return get_default_implementation()->convert_valid_utf32_to_utf16be(
      buf, len, utf16_buffer);
}
simdutf_warn_unused size_t convert_utf16_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return convert_utf16be_to_utf32(buf, len, utf32_buffer);
#else
  return convert_utf16le_to_utf32(buf, len, utf32_buffer);
#endif
}
simdutf_warn_unused size_t convert_utf16le_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
  return get_default_implementation()->convert_utf16le_to_utf32(buf, len,
                                                                utf32_buffer);
}
simdutf_warn_unused size_t convert_utf16be_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
  return get_default_implementation()->convert_utf16be_to_utf32(buf, len,
                                                                utf32_buffer);
}
simdutf_warn_unused result convert_utf16_to_utf32_with_errors(
    const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer);
#else
  return convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer);
#endif
}
simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
    const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
  return get_default_implementation()->convert_utf16le_to_utf32_with_errors(
      buf, len, utf32_buffer);
}
simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
    const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
  return get_default_implementation()->convert_utf16be_to_utf32_with_errors(
      buf, len, utf32_buffer);
}
simdutf_warn_unused size_t convert_valid_utf16_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return convert_valid_utf16be_to_utf32(buf, len, utf32_buffer);
#else
  return convert_valid_utf16le_to_utf32(buf, len, utf32_buffer);
#endif
}
simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
  return get_default_implementation()->convert_valid_utf16le_to_utf32(
      buf, len, utf32_buffer);
}
simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
  return get_default_implementation()->convert_valid_utf16be_to_utf32(
      buf, len, utf32_buffer);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16
void change_endianness_utf16(const char16_t *input, size_t length,
                             char16_t *output) noexcept {
  get_default_implementation()->change_endianness_utf16(input, length, output);
}
simdutf_warn_unused size_t count_utf16(const char16_t *input,
                                       size_t length) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return count_utf16be(input, length);
#else
  return count_utf16le(input, length);
#endif
}
simdutf_warn_unused size_t count_utf16le(const char16_t *input,
                                         size_t length) noexcept {
  return get_default_implementation()->count_utf16le(input, length);
}
simdutf_warn_unused size_t count_utf16be(const char16_t *input,
                                         size_t length) noexcept {
  return get_default_implementation()->count_utf16be(input, length);
}
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t count_utf8(const char *input,
                                      size_t length) noexcept {
  return get_default_implementation()->count_utf8(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t latin1_length_from_utf8(const char *buf,
                                                   size_t len) noexcept {
  return get_default_implementation()->latin1_length_from_utf8(buf, len);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t utf8_length_from_latin1(const char *buf,
                                                   size_t len) noexcept {
  return get_default_implementation()->utf8_length_from_latin1(buf, len);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t *input,
                                                  size_t length) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return utf8_length_from_utf16be(input, length);
#else
  return utf8_length_from_utf16le(input, length);
#endif
}
simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *input,
                                                    size_t length) noexcept {
  return get_default_implementation()->utf8_length_from_utf16le(input, length);
}
simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *input,
                                                    size_t length) noexcept {
  return get_default_implementation()->utf8_length_from_utf16be(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t *input,
                                                   size_t length) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return utf32_length_from_utf16be(input, length);
#else
  return utf32_length_from_utf16le(input, length);
#endif
}
simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t *input,
                                                     size_t length) noexcept {
  return get_default_implementation()->utf32_length_from_utf16le(input, length);
}
simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t *input,
                                                     size_t length) noexcept {
  return get_default_implementation()->utf32_length_from_utf16be(input, length);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t utf16_length_from_utf8(const char *input,
                                                  size_t length) noexcept {
  return get_default_implementation()->utf16_length_from_utf8(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *input,
                                                  size_t length) noexcept {
  return get_default_implementation()->utf8_length_from_utf32(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t *input,
                                                   size_t length) noexcept {
  return get_default_implementation()->utf16_length_from_utf32(input, length);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t utf32_length_from_utf8(const char *input,
                                                  size_t length) noexcept {
  return get_default_implementation()->utf32_length_from_utf8(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_BASE64
simdutf_warn_unused size_t
maximal_binary_length_from_base64(const char *input, size_t length) noexcept {
  return get_default_implementation()->maximal_binary_length_from_base64(
      input, length);
}

simdutf_warn_unused result base64_to_binary(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_handling_options) noexcept {
  return get_default_implementation()->base64_to_binary(
      input, length, output, options, last_chunk_handling_options);
}
|
|
|
|
simdutf_warn_unused size_t maximal_binary_length_from_base64(
|
|
const char16_t *input, size_t length) noexcept {
|
|
return get_default_implementation()->maximal_binary_length_from_base64(
|
|
input, length);
|
|
}
|
|
|
|
simdutf_warn_unused result base64_to_binary(
|
|
const char16_t *input, size_t length, char *output, base64_options options,
|
|
last_chunk_handling_options last_chunk_handling_options) noexcept {
|
|
return get_default_implementation()->base64_to_binary(
|
|
input, length, output, options, last_chunk_handling_options);
|
|
}

template <typename chartype>
simdutf_warn_unused result base64_to_binary_safe_impl(
    const chartype *input, size_t length, char *output, size_t &outlen,
    base64_options options,
    last_chunk_handling_options last_chunk_handling_options) noexcept {
  static_assert(std::is_same<chartype, char>::value ||
                    std::is_same<chartype, char16_t>::value,
                "Only char and char16_t are supported.");
  // The implementation could be nicer, but we expect that most times, the user
  // will provide us with a buffer that is large enough.
  size_t max_length = maximal_binary_length_from_base64(input, length);
  if (outlen >= max_length) {
    // fast path
    full_result r = get_default_implementation()->base64_to_binary_details(
        input, length, output, options, last_chunk_handling_options);
    if (r.error != error_code::INVALID_BASE64_CHARACTER &&
        r.error != error_code::BASE64_EXTRA_BITS) {
      outlen = r.output_count;
      if (last_chunk_handling_options == stop_before_partial) {
        if ((r.output_count % 3) != 0) {
          bool empty_trail = true;
          for (size_t i = r.input_count; i < length; i++) {
            if (!scalar::base64::is_ascii_white_space_or_padding(input[i])) {
              empty_trail = false;
              break;
            }
          }
          if (empty_trail) {
            r.input_count = length;
          }
        }
        return {r.error, r.input_count};
      }
      return {r.error, length};
    }
    return r;
  }
  // The output buffer may be too small. We will decode a truncated version of
  // the input.
  size_t outlen3 = outlen / 3 * 3; // round down to a multiple of 3
  size_t safe_input = base64_length_from_binary(outlen3, options);
  full_result r = get_default_implementation()->base64_to_binary_details(
      input, safe_input, output, options, loose);
  if (r.error == error_code::INVALID_BASE64_CHARACTER) {
    return r;
  }
  size_t offset =
      (r.error == error_code::BASE64_INPUT_REMAINDER)
          ? 1
          : ((r.output_count % 3) == 0 ? 0 : (r.output_count % 3) + 1);
  size_t output_index = r.output_count - (r.output_count % 3);
  size_t input_index = safe_input;
  // offset is a value that is no larger than 3. We backtrack
  // by up to offset characters + an undetermined number of
  // white space characters. It is expected that the next loop
  // runs at most 3 times + the number of white space characters
  // in between them, so we are not worried about performance.
  while (offset > 0 && input_index > 0) {
    chartype c = input[--input_index];
    if (scalar::base64::is_ascii_white_space(c)) {
      // skipping
    } else {
      offset--;
    }
  }
  size_t remaining_out = outlen - output_index;
  const chartype *tail_input = input + input_index;
  size_t tail_length = length - input_index;
  while (tail_length > 0 &&
         scalar::base64::is_ascii_white_space(tail_input[tail_length - 1])) {
    tail_length--;
  }
  size_t padding_characts = 0;
  if (tail_length > 0 && tail_input[tail_length - 1] == '=') {
    tail_length--;
    padding_characts++;
    while (tail_length > 0 &&
           scalar::base64::is_ascii_white_space(tail_input[tail_length - 1])) {
      tail_length--;
    }
    if (tail_length > 0 && tail_input[tail_length - 1] == '=') {
      tail_length--;
      padding_characts++;
    }
  }
  // this will advance tail_input and tail_length
  result rr = scalar::base64::base64_tail_decode_safe(
      output + output_index, remaining_out, tail_input, tail_length,
      padding_characts, options, last_chunk_handling_options);
  outlen = output_index + remaining_out;
  if (last_chunk_handling_options != stop_before_partial &&
      rr.error == error_code::SUCCESS && padding_characts > 0) {
    // additional checks
    if ((outlen % 3 == 0) || ((outlen % 3) + 1 + padding_characts != 4)) {
      rr.error = error_code::INVALID_BASE64_CHARACTER;
    }
  }
  if (rr.error == error_code::SUCCESS &&
      last_chunk_handling_options == stop_before_partial) {
    if (tail_input > input + input_index) {
      rr.count = tail_input - input;
    } else if (r.input_count > 0) {
      rr.count = r.input_count + rr.count;
    }
    return rr;
  }
  rr.count += input_index;
  return rr;
}
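
// Usage sketch (illustrative, not part of the library): the safe decoder
// treats outlen as an in/out parameter. A hypothetical caller, assuming the
// public wrappers declared in simdutf.h, might look like this:
//
//   std::string b64 = "aGVsbG8=";
//   std::vector<char> out(simdutf::maximal_binary_length_from_base64(
//       b64.data(), b64.size()));
//   size_t outlen = out.size();
//   simdutf::result r = simdutf::base64_to_binary_safe(
//       b64.data(), b64.size(), out.data(), outlen, simdutf::base64_default,
//       simdutf::loose);
//   // On success, outlen now holds the number of bytes written ("hello": 5).
//
// The names b64, out and outlen are of course arbitrary.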

#if SIMDUTF_ATOMIC_REF
size_t atomic_binary_to_base64(const char *input, size_t length, char *output,
                               base64_options options) noexcept {
  static_assert(std::atomic_ref<char>::required_alignment == 1);
  size_t retval = 0;
  // Arbitrary block sizes: 3KB for input, 4KB for output. Total is 7KB.
  constexpr size_t input_block_size = 1024 * 3;
  constexpr size_t output_block_size = input_block_size * 4 / 3;
  std::array<char, input_block_size> inbuf;
  std::array<char, output_block_size> outbuf;

  // std::atomic_ref<T> must not have a const T, see
  // https://cplusplus.github.io/LWG/issue3508
  // We instead provide a mutable input, which is fine since we are only
  // reading from it.
  char *mutable_input = const_cast<char *>(input);

  for (size_t i = 0; i < length; i += input_block_size) {
    const size_t current_block_size = std::min(input_block_size, length - i);
    // This copy is inefficient.
    // Under x64, we could use 16-byte aligned loads.
    // Note that we warn users that the performance might be poor.
    for (size_t j = 0; j < current_block_size; ++j) {
      inbuf[j] = std::atomic_ref<char>(mutable_input[i + j])
                     .load(std::memory_order_relaxed);
    }
    const size_t written = binary_to_base64(inbuf.data(), current_block_size,
                                            outbuf.data(), options);
    // This copy is inefficient.
    // Under x64, we could use 16-byte aligned stores.
    for (size_t j = 0; j < written; ++j) {
      std::atomic_ref<char>(output[retval + j])
          .store(outbuf[j], std::memory_order_relaxed);
    }
    retval += written;
  }
  return retval;
}
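
// Illustrative sketch (not part of the library): atomic_binary_to_base64 is
// meant for buffers that other threads may access concurrently. A
// hypothetical caller could be:
//
//   std::vector<char> shared_input(1 << 20); // possibly read by other threads
//   std::vector<char> encoded(simdutf::base64_length_from_binary(
//       shared_input.size(), simdutf::base64_default));
//   size_t written = simdutf::atomic_binary_to_base64(
//       shared_input.data(), shared_input.size(), encoded.data(),
//       simdutf::base64_default);
//
// Each byte is moved through std::atomic_ref with relaxed ordering, so the
// copy loops above avoid data races but, as warned, may be slower than a
// plain binary_to_base64 call.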
#endif // SIMDUTF_ATOMIC_REF

#endif // SIMDUTF_FEATURE_BASE64

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t convert_latin1_to_utf8_safe(
    const char *buf, size_t len, char *utf8_output, size_t utf8_len) noexcept {
  const auto start{utf8_output};

  while (true) {
    // convert_latin1_to_utf8 will never write more than input length * 2
    auto read_len = std::min(len, utf8_len >> 1);
    if (read_len <= 16) {
      break;
    }

    const auto write_len =
        simdutf::convert_latin1_to_utf8(buf, read_len, utf8_output);

    utf8_output += write_len;
    utf8_len -= write_len;
    buf += read_len;
    len -= read_len;
  }

  utf8_output +=
      scalar::latin1_to_utf8::convert_safe(buf, len, utf8_output, utf8_len);

  return utf8_output - start;
}
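
// Worked example (illustrative, not in the original): each Latin-1 byte
// expands to at most two UTF-8 bytes, which is why the loop above may safely
// consume min(len, utf8_len / 2) input bytes per round. Under hypothetical
// names:
//
//   const char latin1[] = "caf\xe9";  // 4 bytes, 0xE9 is 'e-acute' in Latin-1
//   char utf8[8];                     // worst case: 2 * 4 = 8 bytes
//   size_t n = simdutf::convert_latin1_to_utf8_safe(latin1, 4, utf8, 8);
//   // n == 5: "caf" stays 1 byte each, 0xE9 becomes the two bytes 0xC3 0xA9.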
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_BASE64
simdutf_warn_unused result base64_to_binary_safe(
    const char *input, size_t length, char *output, size_t &outlen,
    base64_options options,
    last_chunk_handling_options last_chunk_handling_options) noexcept {
  return base64_to_binary_safe_impl<char>(input, length, output, outlen,
                                          options, last_chunk_handling_options);
}
simdutf_warn_unused result base64_to_binary_safe(
    const char16_t *input, size_t length, char *output, size_t &outlen,
    base64_options options,
    last_chunk_handling_options last_chunk_handling_options) noexcept {
  return base64_to_binary_safe_impl<char16_t>(
      input, length, output, outlen, options, last_chunk_handling_options);
}

simdutf_warn_unused size_t
base64_length_from_binary(size_t length, base64_options options) noexcept {
  return get_default_implementation()->base64_length_from_binary(length,
                                                                 options);
}

size_t binary_to_base64(const char *input, size_t length, char *output,
                        base64_options options) noexcept {
  return get_default_implementation()->binary_to_base64(input, length, output,
                                                        options);
}
#endif // SIMDUTF_FEATURE_BASE64
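
// Round-trip sketch (illustrative, assuming only the wrappers above; names
// are hypothetical):
//
//   const char data[] = {1, 2, 3};
//   std::vector<char> b64(
//       simdutf::base64_length_from_binary(3, simdutf::base64_default));
//   simdutf::binary_to_base64(data, 3, b64.data(),
//                             simdutf::base64_default); // yields "AQID"
//   std::vector<char> back(3);
//   size_t backlen = back.size();
//   simdutf::base64_to_binary_safe(b64.data(), b64.size(), back.data(),
//                                  backlen, simdutf::base64_default,
//                                  simdutf::loose);
//   // backlen == 3 and back now equals data.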

#if SIMDUTF_FEATURE_DETECT_ENCODING
simdutf_warn_unused simdutf::encoding_type
autodetect_encoding(const char *buf, size_t length) noexcept {
  return get_default_implementation()->autodetect_encoding(buf, length);
}

simdutf_warn_unused int detect_encodings(const char *buf,
                                         size_t length) noexcept {
  return get_default_implementation()->detect_encodings(buf, length);
}
#endif // SIMDUTF_FEATURE_DETECT_ENCODING

const implementation *builtin_implementation() {
  static const implementation *builtin_impl =
      get_available_implementations()[SIMDUTF_STRINGIFY(
          SIMDUTF_BUILTIN_IMPLEMENTATION)];
  return builtin_impl;
}

#if SIMDUTF_FEATURE_UTF8
simdutf_warn_unused size_t trim_partial_utf8(const char *input, size_t length) {
  return scalar::utf8::trim_partial_utf8(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t trim_partial_utf16be(const char16_t *input,
                                                size_t length) {
  return scalar::utf16::trim_partial_utf16<BIG>(input, length);
}

simdutf_warn_unused size_t trim_partial_utf16le(const char16_t *input,
                                                size_t length) {
  return scalar::utf16::trim_partial_utf16<LITTLE>(input, length);
}

simdutf_warn_unused size_t trim_partial_utf16(const char16_t *input,
                                              size_t length) {
#if SIMDUTF_IS_BIG_ENDIAN
  return trim_partial_utf16be(input, length);
#else
  return trim_partial_utf16le(input, length);
#endif
}
#endif // SIMDUTF_FEATURE_UTF16

} // namespace simdutf
/* end file src/implementation.cpp */

SIMDUTF_PUSH_DISABLE_WARNINGS
SIMDUTF_DISABLE_UNDESIRED_WARNINGS

#if SIMDUTF_IMPLEMENTATION_ARM64
/* begin file src/arm64/implementation.cpp */
/* begin file src/simdutf/arm64/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "arm64"
// #define SIMDUTF_IMPLEMENTATION arm64
#define SIMDUTF_SIMD_HAS_BYTEMASK 1
/* end file src/simdutf/arm64/begin.h */
namespace simdutf {
namespace arm64 {
namespace {
#ifndef SIMDUTF_ARM64_H
  #error "arm64.h must be included"
#endif
using namespace simd;

#if SIMDUTF_FEATURE_ASCII || SIMDUTF_FEATURE_DETECT_ENCODING ||                \
    SIMDUTF_FEATURE_UTF8
simdutf_really_inline bool is_ascii(const simd8x64<uint8_t> &input) {
  simd8<uint8_t> bits = input.reduce_or();
  return bits.max_val() < 0b10000000u;
}
#endif // SIMDUTF_FEATURE_ASCII || SIMDUTF_FEATURE_DETECT_ENCODING ||
       // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
simdutf_really_inline simd8<bool>
must_be_2_3_continuation(const simd8<uint8_t> prev2,
                         const simd8<uint8_t> prev3) {
  simd8<bool> is_third_byte = prev2 >= uint8_t(0b11100000u);
  simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u);
  return is_third_byte ^ is_fourth_byte;
}
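
// Illustrative example (not in the original): for the byte stream E2 82 AC
// (U+20AC), at the lane holding the second continuation byte 0xAC, prev2 is
// 0xE2 >= 0xE0, so is_third_byte is set while is_fourth_byte is not, and the
// XOR flags that lane as a mandatory continuation. For F0 9F 98 80 (U+1F600),
// the lanes two and three positions after 0xF0 are flagged analogously.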
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF8 && (SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32)
// common functions for utf8 conversions
simdutf_really_inline uint16x4_t convert_utf8_3_byte_to_utf16(uint8x16_t in) {
  // Low half contains 10cccccc|1110aaaa
  // High half contains 10bbbbbb|10bbbbbb
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
  const uint8x16_t sh = simdutf_make_uint8x16_t(0, 2, 3, 5, 6, 8, 9, 11, 1, 1,
                                                4, 4, 7, 7, 10, 10);
#else
  const uint8x16_t sh = {0, 2, 3, 5, 6, 8, 9, 11, 1, 1, 4, 4, 7, 7, 10, 10};
#endif
  uint8x16_t perm = vqtbl1q_u8(in, sh);
  // Split into half vectors.
  // 10cccccc|1110aaaa
  uint8x8_t perm_low = vget_low_u8(perm); // no-op
  // 10bbbbbb|10bbbbbb
  uint8x8_t perm_high = vget_high_u8(perm);
  // xxxxxxxx 10bbbbbb
  uint16x4_t mid = vreinterpret_u16_u8(perm_high); // no-op
  // xxxxxxxx 1110aaaa
  uint16x4_t high = vreinterpret_u16_u8(perm_low); // no-op
  // Assemble with shift left insert.
  // xxxxxxaa aabbbbbb
  uint16x4_t mid_high = vsli_n_u16(mid, high, 6);
  // (perm_low << 8) | (perm_low >> 8)
  // xxxxxxxx 10cccccc
  uint16x4_t low = vreinterpret_u16_u8(vrev16_u8(perm_low));
  // Shift left insert into the low bits
  // aaaabbbb bbcccccc
  uint16x4_t composed = vsli_n_u16(low, mid_high, 6);
  return composed;
}
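
// Worked example (illustrative): the euro sign U+20AC is E2 82 AC in UTF-8,
// i.e. 1110aaaa = E2, 10bbbbbb = 82, 10cccccc = AC, with aaaa = 0010,
// bbbbbb = 000010 and cccccc = 101100. The two shift-left-inserts above
// assemble aaaabbbb bbcccccc = 0010 000010 101100 = 0x20AC, the expected
// UTF-16 code unit.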

simdutf_really_inline uint16x8_t convert_utf8_2_byte_to_utf16(uint8x16_t in) {
  // Converts 6 2-byte UTF-8 characters to 6 UTF-16 characters.
  // Technically this calculates 8, but 6 does better and happens more often
  // (The languages which use these codepoints use ASCII spaces so 8 would need
  // to be in the middle of a very long word).

  // 10bbbbbb 110aaaaa
  uint16x8_t upper = vreinterpretq_u16_u8(in);
  // (in << 8) | (in >> 8)
  // 110aaaaa 10bbbbbb
  uint16x8_t lower = vreinterpretq_u16_u8(vrev16q_u8(in));
  // 00000000 000aaaaa
  uint16x8_t upper_masked = vandq_u16(upper, vmovq_n_u16(0x1F));
  // Assemble with shift left insert.
  // 00000aaa aabbbbbb
  uint16x8_t composed = vsliq_n_u16(lower, upper_masked, 6);
  return composed;
}
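
// Worked example (illustrative): U+00E9 is C3 A9 in UTF-8, i.e.
// 110aaaaa = C3 and 10bbbbbb = A9, with aaaaa = 00011 and bbbbbb = 101001.
// Masking the upper half to 000aaaaa and shift-left-inserting it by 6 over
// the byte-reversed pair gives 00000aaa aabbbbbb = 000 00011 101001 = 0x00E9,
// the expected UTF-16 code unit.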

simdutf_really_inline uint16x8_t
convert_utf8_1_to_2_byte_to_utf16(uint8x16_t in, size_t shufutf8_idx) {
  // Converts 6 1-2 byte UTF-8 characters to 6 UTF-16 characters.
  // This is a relatively easy scenario: we process SIX (6) input code units.
  // The max length in bytes of six code units spanning between 1 and 2 bytes
  // each is 12 bytes.
  uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
      simdutf::tables::utf8_to_utf16::shufutf8[shufutf8_idx]));
  // Shuffle
  // 1 byte: 00000000 0bbbbbbb
  // 2 byte: 110aaaaa 10bbbbbb
  uint16x8_t perm = vreinterpretq_u16_u8(vqtbl1q_u8(in, sh));
  // Mask
  // 1 byte: 00000000 0bbbbbbb
  // 2 byte: 00000000 00bbbbbb
  uint16x8_t ascii = vandq_u16(perm, vmovq_n_u16(0x7f)); // 6 or 7 bits
  // 1 byte: 00000000 00000000
  // 2 byte: 000aaaaa 00000000
  uint16x8_t highbyte = vandq_u16(perm, vmovq_n_u16(0x1f00)); // 5 bits
  // Combine with a shift right accumulate
  // 1 byte: 00000000 0bbbbbbb
  // 2 byte: 00000aaa aabbbbbb
  uint16x8_t composed = vsraq_n_u16(ascii, highbyte, 2);
  return composed;
}
#endif // SIMDUTF_FEATURE_UTF8 && (SIMDUTF_FEATURE_UTF16 ||
       // SIMDUTF_FEATURE_UTF32)

#if SIMDUTF_FEATURE_UTF16
/* begin file src/arm64/arm_utf16fix.cpp */

/*
 * Returns a nonzero value if the vector of type uint8x16_t has any nonzero
 * lane, i.e., it is not all zero.
 */
simdutf_really_inline int veq_non_zero(uint8x16_t v) {
  // might compile to two instructions:
  //  umaxv s0, v0.4s
  //  fmov w0, s0
  // On Apple hardware, they both have a latency of 3 cycles, with a throughput
  // of four instructions per cycle. So that's 6 cycles of latency (!!!) for the
  // two instructions. A narrowing shift has the same latency and throughput.
  return vmaxvq_u32(vreinterpretq_u32_u8(v));
}

/*
 * Process one block of 16 characters. If inplace is false,
 * copy the block from in to out. If there is a sequencing
 * error in the block, overwrite the ill-sequenced characters
 * with the replacement character. This function reads one
 * character before the beginning of the buffer as a lookback.
 * If that character is ill-sequenced, it too is overwritten.
 */
template <endianness big_endian, bool inplace>
void utf16fix_block(char16_t *out, const char16_t *in) {
  const char16_t replacement = scalar::utf16::replacement<big_endian>();
  uint8x16x2_t lb, block;
  uint8x16_t lb_masked, block_masked, lb_is_high, block_is_low;
  uint8x16_t illseq;

  const int idx = !match_system(big_endian) ? 0 : 1;

  /* TODO: compute lookback using shifts */
  lb = vld2q_u8((const uint8_t *)(in - 1));
  block = vld2q_u8((const uint8_t *)in);
  lb_masked = vandq_u8(lb.val[idx], vdupq_n_u8(0xfc));
  block_masked = vandq_u8(block.val[idx], vdupq_n_u8(0xfc));
  lb_is_high = vceqq_u8(lb_masked, vdupq_n_u8(0xd8));
  block_is_low = vceqq_u8(block_masked, vdupq_n_u8(0xdc));

  illseq = veorq_u8(lb_is_high, block_is_low);
  if (veq_non_zero(illseq)) {
    uint8x16_t lb_illseq, block_illseq;
    char16_t lbc;
    int ill;

    /* compute the cause of the illegal sequencing */
    lb_illseq = vbicq_u8(lb_is_high, block_is_low);
    block_illseq = vorrq_u8(vbicq_u8(block_is_low, lb_is_high),
                            vextq_u8(lb_illseq, vdupq_n_u8(0), 1));

    /* fix illegal sequencing in the lookback */
    ill = vgetq_lane_u8(lb_illseq, 0);
    lbc = out[-1];
    out[-1] = ill ? replacement : lbc;

    /* fix illegal sequencing in the main block */
    if (!match_system(big_endian)) {
      block.val[1] = vbslq_u8(block_illseq, vdupq_n_u8(0xfd), block.val[1]);
      block.val[0] = vorrq_u8(block_illseq, block.val[0]);
    } else {
      block.val[0] = vbslq_u8(block_illseq, vdupq_n_u8(0xfd), block.val[0]);
      block.val[1] = vorrq_u8(block_illseq, block.val[1]);
    }

    vst2q_u8((uint8_t *)out, block);
  } else if (!inplace) {
    vst2q_u8((uint8_t *)out, block);
  }
}

template <endianness big_endian, bool inplace>
uint8x16_t get_mismatch_copy(const char16_t *in, char16_t *out) {
  const int idx = !match_system(big_endian) ? 0 : 1;
  uint8x16x2_t lb = vld2q_u8((const uint8_t *)(in - 1));
  uint8x16x2_t block = vld2q_u8((const uint8_t *)in);
  uint8x16_t lb_masked = vandq_u8(lb.val[idx], vdupq_n_u8(0xfc));
  uint8x16_t block_masked = vandq_u8(block.val[idx], vdupq_n_u8(0xfc));
  uint8x16_t lb_is_high = vceqq_u8(lb_masked, vdupq_n_u8(0xd8));
  uint8x16_t block_is_low = vceqq_u8(block_masked, vdupq_n_u8(0xdc));
  uint8x16_t illseq = veorq_u8(lb_is_high, block_is_low);
  if (!inplace) {
    vst2q_u8((uint8_t *)out, block);
  }
  return illseq;
}

simdutf_really_inline uint64_t get_mask(uint8x16_t illse0, uint8x16_t illse1,
                                        uint8x16_t illse2, uint8x16_t illse3) {
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
  uint8x16_t bit_mask =
      simdutf_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                              0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
#else
  uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                         0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
#endif
  uint8x16_t sum0 =
      vpaddq_u8(vandq_u8(illse0, bit_mask), vandq_u8(illse1, bit_mask));
  uint8x16_t sum1 =
      vpaddq_u8(vandq_u8(illse2, bit_mask), vandq_u8(illse3, bit_mask));
  sum0 = vpaddq_u8(sum0, sum1);
  sum0 = vpaddq_u8(sum0, sum0);
  return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
}
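
// Illustrative note (not in the original): get_mask emulates x86's movemask.
// Each comparison lane is 0x00 or 0xFF; AND-ing with bit_mask keeps one
// distinct bit per lane (bit j for lane j within each 8-lane half), and three
// rounds of pairwise byte additions (vpaddq_u8) fold the four 16-byte vectors
// into a single 64-bit word with one bit per input lane. For example, if only
// lane 2 of illse0 is 0xFF, the returned mask is 0x4 (bit 2 set).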

// The idea is to process 64 characters at a time, and if there is a mismatch
// we can fix it with a bit of scalar code. When the input is correct, this
// function might be faster than alternative implementations working on small
// blocks of input.
template <endianness big_endian, bool inplace>
bool utf16fix_block64(char16_t *out, const char16_t *in) {
  const char16_t replacement = scalar::utf16::replacement<big_endian>();

  uint8x16_t illse0 = inplace ? get_mismatch_copy<big_endian, true>(in, out)
                              : get_mismatch_copy<big_endian, false>(in, out);
  uint8x16_t illse1 =
      inplace ? get_mismatch_copy<big_endian, true>(in + 16, out + 16)
              : get_mismatch_copy<big_endian, false>(in + 16, out + 16);
  uint8x16_t illse2 =
      inplace ? get_mismatch_copy<big_endian, true>(in + 32, out + 32)
              : get_mismatch_copy<big_endian, false>(in + 32, out + 32);
  uint8x16_t illse3 =
      inplace ? get_mismatch_copy<big_endian, true>(in + 48, out + 48)
              : get_mismatch_copy<big_endian, false>(in + 48, out + 48);
  // this branch could be marked as unlikely:
  if (veq_non_zero(
          vorrq_u8(vorrq_u8(illse0, illse1), vorrq_u8(illse2, illse3)))) {
    uint64_t matches = get_mask(illse0, illse1, illse2, illse3);
    // Given that ARM has a fast bit-reverse instruction, we can
    // reverse once and then use clz to find the first bit set.
    // This is how it is done in simdjson and *might* be beneficial.
    //
    // We might also proceed in reverse to reduce the RAW hazard,
    // but it might require more instructions.

    while (matches != 0) {
      int r = trailing_zeroes(matches); // generates rbit + clz
      // Either we have a high surrogate followed by a non-low surrogate
      // or we have a low surrogate not preceded by a high surrogate.
      bool is_high = scalar::utf16::is_high_surrogate<big_endian>(in[r - 1]);
      out[r - is_high] = replacement;
      matches = clear_least_significant_bit(matches);
    }
    return false;
  }
  return true;
}

template <endianness big_endian>
void utf16fix_neon_64bits(const char16_t *in, size_t n, char16_t *out) {
  size_t i;
  const char16_t replacement = scalar::utf16::replacement<big_endian>();
  if (n < 17) {
    return scalar::utf16::to_well_formed_utf16<big_endian>(in, n, out);
  }
  out[0] =
      scalar::utf16::is_low_surrogate<big_endian>(in[0]) ? replacement : in[0];
  i = 1;

  /* duplicate code to have the compiler specialise utf16fix_block() */
  if (in == out) {
    for (i = 1; i + 64 < n; i += 64) {
      utf16fix_block64<big_endian, true>(out + i, in + i);
    }

    for (; i + 16 < n; i += 16) {
      utf16fix_block<big_endian, true>(out + i, in + i);
    }

    /* tbd: find carry */
    utf16fix_block<big_endian, true>(out + n - 16, in + n - 16);
  } else {
    for (i = 1; i + 64 < n; i += 64) {
      utf16fix_block64<big_endian, false>(out + i, in + i);
    }
    for (; i + 16 < n; i += 16) {
      utf16fix_block<big_endian, false>(out + i, in + i);
    }

    utf16fix_block<big_endian, false>(out + n - 16, in + n - 16);
  }
  out[n - 1] = scalar::utf16::is_high_surrogate<big_endian>(out[n - 1])
                   ? replacement
                   : out[n - 1];
}
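
// Behavioral sketch (illustrative, not in the original): this routine makes
// UTF-16 well formed by replacing every lone surrogate with U+FFFD while
// leaving valid pairs intact. For instance, the little-endian input
// {0xD800, 0x0041} becomes {0xFFFD, 0x0041}, whereas {0xD800, 0xDC00}
// (a valid pair encoding U+10000) is left untouched.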
/* end file src/arm64/arm_utf16fix.cpp */
#endif // SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
/* begin file src/arm64/arm_validate_utf16.cpp */
template <endianness big_endian>
const char16_t *arm_validate_utf16(const char16_t *input, size_t size) {
  const char16_t *end = input + size;
  const auto v_d8 = simd8<uint8_t>::splat(0xd8);
  const auto v_f8 = simd8<uint8_t>::splat(0xf8);
  const auto v_fc = simd8<uint8_t>::splat(0xfc);
  const auto v_dc = simd8<uint8_t>::splat(0xdc);
  while (end - input >= 16) {
    // 0. Load data: since the validation takes into account only the higher
    // byte of each word, we compress the two vectors into one which
    // consists only of the higher bytes.
    auto in0 = simd16<uint16_t>(input);
    auto in1 =
        simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
    if (!match_system(big_endian)) {
      in0 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in0)));
      in1 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in1)));
    }
    const auto t0 = in0.shr<8>();
    const auto t1 = in1.shr<8>();
    const simd8<uint8_t> in = simd16<uint16_t>::pack(t0, t1);
    // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
    const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64();
    if (surrogates_wordmask == 0) {
      input += 16;
    } else {
      // 2. We have some surrogates that have to be distinguished:
      //    - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
      //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
      //
      // Fact: high surrogate has 11th bit set (3rd bit in the higher byte)

      // V - non-surrogate code units
      //     V = not surrogates_wordmask
      const uint64_t V = ~surrogates_wordmask;

      // H - word-mask for high surrogates: the six highest bits are 0b1101'11
      const auto vH = ((in & v_fc) == v_dc);
      const uint64_t H = vH.to_bitmask64();

      // L - word mask for low surrogates
      //     L = not H and surrogates_wordmask
      const uint64_t L = ~H & surrogates_wordmask;

      const uint64_t a =
          L & (H >> 4); // A low surrogate must be followed by a high one.
                        // (A low surrogate placed in the last word of the
                        // register is an exception we handle.)
      const uint64_t b =
          a << 4; // Just mark that the opposite fact holds,
                  // thanks to that we have only two masks for the valid case.
      const uint64_t c = V | a | b; // Combine all the masks into the final one.
      if (c == ~0ull) {
        // The whole input register contains valid UTF-16, i.e.,
        // either single code units or proper surrogate pairs.
        input += 16;
      } else if (c == 0xfffffffffffffffull) {
        // The 15 lower code units of the input register contain valid UTF-16.
        // The last word may be either a low or a high surrogate. In the next
        // iteration we 1) check if the low surrogate is followed by a high
        // one, 2) reject a sole high surrogate.
        input += 15;
      } else {
        return nullptr;
      }
    }
  }
  return input;
}
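
// Worked example (illustrative, not in the original): consider 16 code units
// where positions 4-5 hold the valid pair 0xD801 0xDC37 (U+10437) and the
// rest is ASCII. The bitmask uses 4 bits per code unit, so
// surrogates_wordmask has the nibbles for positions 4 and 5 set, L marks
// position 4, H marks position 5; a = L & (H >> 4) confirms position 4 and
// b = a << 4 confirms position 5. Then c = V | a | b covers all 16 positions
// and the block is accepted in a single step.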

template <endianness big_endian>
const result arm_validate_utf16_with_errors(const char16_t *input,
                                            size_t size) {
  const char16_t *start = input;
  const char16_t *end = input + size;

  const auto v_d8 = simd8<uint8_t>::splat(0xd8);
  const auto v_f8 = simd8<uint8_t>::splat(0xf8);
  const auto v_fc = simd8<uint8_t>::splat(0xfc);
  const auto v_dc = simd8<uint8_t>::splat(0xdc);
  while (input + 16 < end) {
    // 0. Load data: since the validation takes into account only the higher
    // byte of each word, we compress the two vectors into one which
    // consists only of the higher bytes.
    auto in0 = simd16<uint16_t>(input);
    auto in1 =
        simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));

    if (!match_system(big_endian)) {
      in0 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in0)));
      in1 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in1)));
    }
    const auto t0 = in0.shr<8>();
    const auto t1 = in1.shr<8>();
    const simd8<uint8_t> in = simd16<uint16_t>::pack(t0, t1);
    // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
    const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64();
    if (surrogates_wordmask == 0) {
      input += 16;
    } else {
      // 2. We have some surrogates that have to be distinguished:
      //    - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
      //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
      //
      // Fact: high surrogate has 11th bit set (3rd bit in the higher byte)

      // V - non-surrogate code units
      //     V = not surrogates_wordmask
      const uint64_t V = ~surrogates_wordmask;

      // H - word-mask for high surrogates: the six highest bits are 0b1101'11
      const auto vH = ((in & v_fc) == v_dc);
      const uint64_t H = vH.to_bitmask64();

      // L - word mask for low surrogates
      //     L = not H and surrogates_wordmask
      const uint64_t L = ~H & surrogates_wordmask;

      const uint64_t a =
          L & (H >> 4); // A low surrogate must be followed by a high one.
                        // (A low surrogate placed in the last word of the
                        // register is an exception we handle.)
      const uint64_t b =
          a << 4; // Just mark that the opposite fact holds,
                  // thanks to that we have only two masks for the valid case.
      const uint64_t c = V | a | b; // Combine all the masks into the final one.
      if (c == ~0ull) {
        // The whole input register contains valid UTF-16, i.e.,
        // either single code units or proper surrogate pairs.
        input += 16;
      } else if (c == 0xfffffffffffffffull) {
        // The 15 lower code units of the input register contain valid UTF-16.
        // The last word may be either a low or a high surrogate. In the next
        // iteration we 1) check if the low surrogate is followed by a high
        // one, 2) reject a sole high surrogate.
        input += 15;
      } else {
        return result(error_code::SURROGATE, input - start);
      }
    }
  }
  return result(error_code::SUCCESS, input - start);
}
/* end file src/arm64/arm_validate_utf16.cpp */
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
/* begin file src/arm64/arm_validate_utf32le.cpp */

const char32_t *arm_validate_utf32le(const char32_t *input, size_t size) {
  const char32_t *end = input + size;

  const uint32x4_t standardmax = vmovq_n_u32(0x10ffff);
  const uint32x4_t offset = vmovq_n_u32(0xffff2000);
  const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff);
  uint32x4_t currentmax = vmovq_n_u32(0x0);
  uint32x4_t currentoffsetmax = vmovq_n_u32(0x0);

  while (end - input >= 4) {
    const uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input));
    currentmax = vmaxq_u32(in, currentmax);
    currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax);
    input += 4;
  }

  uint32x4_t is_zero =
      veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax);
  if (vmaxvq_u32(is_zero) != 0) {
    return nullptr;
  }

  is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax),
                      standardoffsetmax);
  if (vmaxvq_u32(is_zero) != 0) {
    return nullptr;
  }

  return input;
}
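
// Illustrative note (not in the original): the offset trick flags surrogates
// without a dedicated range comparison. Adding 0xffff2000 (mod 2^32) maps the
// surrogate range 0xD800..0xDFFF onto 0xFFFFF800..0xFFFFFFFF, which lies
// above standardoffsetmax = 0xFFFFF7FF, while a scalar such as 0xD7FF maps to
// exactly 0xFFFFF7FF and values >= 0xE000 wrap around to small numbers. The
// running maximum thus exceeds the bound if and only if some lane held a
// surrogate.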

const result arm_validate_utf32le_with_errors(const char32_t *input,
                                              size_t size) {
  const char32_t *start = input;
  const char32_t *end = input + size;

  const uint32x4_t standardmax = vmovq_n_u32(0x10ffff);
  const uint32x4_t offset = vmovq_n_u32(0xffff2000);
  const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff);
  uint32x4_t currentmax = vmovq_n_u32(0x0);
  uint32x4_t currentoffsetmax = vmovq_n_u32(0x0);

  while (end - input >= 4) {
    const uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input));
    currentmax = vmaxq_u32(in, currentmax);
    currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax);

    uint32x4_t is_zero =
        veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax);
    if (vmaxvq_u32(is_zero) != 0) {
      return result(error_code::TOO_LARGE, input - start);
    }

    is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax),
                        standardoffsetmax);
    if (vmaxvq_u32(is_zero) != 0) {
      return result(error_code::SURROGATE, input - start);
    }

    input += 4;
  }

  return result(error_code::SUCCESS, input - start);
}
/* end file src/arm64/arm_validate_utf32le.cpp */
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
/* begin file src/arm64/arm_convert_latin1_to_utf16.cpp */
template <endianness big_endian>
std::pair<const char *, char16_t *>
arm_convert_latin1_to_utf16(const char *buf, size_t len,
                            char16_t *utf16_output) {
  const char *end = buf + len;

  while (end - buf >= 16) {
    uint8x16_t in8 = vld1q_u8(reinterpret_cast<const uint8_t *>(buf));
    uint16x8_t inlow = vmovl_u8(vget_low_u8(in8));
    if (!match_system(big_endian)) {
      inlow = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(inlow)));
    }
    vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output), inlow);
    uint16x8_t inhigh = vmovl_u8(vget_high_u8(in8));
    if (!match_system(big_endian)) {
      inhigh = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(inhigh)));
    }
    vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output + 8), inhigh);
    utf16_output += 16;
    buf += 16;
  }

  return std::make_pair(buf, utf16_output);
}
/* end file src/arm64/arm_convert_latin1_to_utf16.cpp */
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
/* begin file src/arm64/arm_convert_latin1_to_utf32.cpp */
std::pair<const char *, char32_t *>
arm_convert_latin1_to_utf32(const char *buf, size_t len,
                            char32_t *utf32_output) {
  const char *end = buf + len;

  while (end - buf >= 16) {
    uint8x16_t in8 = vld1q_u8(reinterpret_cast<const uint8_t *>(buf));
    uint16x8_t in8low = vmovl_u8(vget_low_u8(in8));
    uint32x4_t in16lowlow = vmovl_u16(vget_low_u16(in8low));
    uint32x4_t in16lowhigh = vmovl_u16(vget_high_u16(in8low));
    uint16x8_t in8high = vmovl_u8(vget_high_u8(in8));
    uint32x4_t in8highlow = vmovl_u16(vget_low_u16(in8high));
    uint32x4_t in8highhigh = vmovl_u16(vget_high_u16(in8high));
    vst1q_u32(reinterpret_cast<uint32_t *>(utf32_output), in16lowlow);
    vst1q_u32(reinterpret_cast<uint32_t *>(utf32_output + 4), in16lowhigh);
    vst1q_u32(reinterpret_cast<uint32_t *>(utf32_output + 8), in8highlow);
    vst1q_u32(reinterpret_cast<uint32_t *>(utf32_output + 12), in8highhigh);

    utf32_output += 16;
    buf += 16;
  }

  return std::make_pair(buf, utf32_output);
}
/* end file src/arm64/arm_convert_latin1_to_utf32.cpp */
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
/* begin file src/arm64/arm_convert_latin1_to_utf8.cpp */
/*
  Returns a pair: the first unprocessed byte from latin1_input and
  utf8_output. A scalar routine should carry on the conversion of the tail.
*/
std::pair<const char *, char *>
arm_convert_latin1_to_utf8(const char *latin1_input, size_t len,
                           char *utf8_out) {
  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
  const char *end = latin1_input + len;
  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
  // We always write 16 bytes, of which at least the first 8 bytes
  // are valid. A safety margin of 8 is more than sufficient.
  while (end - latin1_input >= 16 + 8) {
    uint8x16_t in8 = vld1q_u8(reinterpret_cast<const uint8_t *>(latin1_input));
    if (vmaxvq_u8(in8) <= 0x7F) { // ASCII fast path!!!!
      vst1q_u8(utf8_output, in8);
      utf8_output += 16;
      latin1_input += 16;
      continue;
    }

    // We just fall back on the UTF-16 code. This could be optimized/simplified
    // further.
    uint16x8_t in16 = vmovl_u8(vget_low_u8(in8));
    // 1. prepare 2-byte values
    // input 8-bit word : [aabb|bbbb] x 8
    // expected output  : [1100|00aa|10bb|bbbb] x 8
    const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
    const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);

    // t0 = [0000|00aa|bbbb|bb00]
    const uint16x8_t t0 = vshlq_n_u16(in16, 2);
    // t1 = [0000|00aa|0000|0000]
    const uint16x8_t t1 = vandq_u16(t0, v_1f00);
    // t2 = [0000|0000|00bb|bbbb]
    const uint16x8_t t2 = vandq_u16(in16, v_003f);
    // t3 = [0000|00aa|00bb|bbbb]
    const uint16x8_t t3 = vorrq_u16(t1, t2);
    // t4 = [1100|00aa|10bb|bbbb]
    const uint16x8_t t4 = vorrq_u16(t3, v_c080);
    // 2. merge ASCII and 2-byte codewords
    const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
    const uint16x8_t one_byte_bytemask = vcleq_u16(in16, v_007f);
    const uint8x16_t utf8_unpacked =
        vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in16, t4));
    // 3. prepare bitmask for 8-bit lookup
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
    const uint16x8_t mask = simdutf_make_uint16x8_t(
        0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080);
#else
    const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040,
                             0x0002, 0x0008, 0x0020, 0x0080};
#endif
    uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
    // 4. pack the bytes
    const uint8_t *row =
        &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
    const uint8x16_t shuffle = vld1q_u8(row + 1);
    const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);

    // 5. store bytes
    vst1q_u8(utf8_output, utf8_packed);
    // 6. adjust pointers
    latin1_input += 8;
    utf8_output += row[0];

  } // while

  return std::make_pair(latin1_input, reinterpret_cast<char *>(utf8_output));
}
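
/*
 * Worked example (illustrative, not in the original): take the Latin-1 byte
 * 0xE9 ('e-acute'), so in16 holds 0x00E9 = [00000000|11101001]. Then
 *   t0 = in16 << 2     = 0x03A4
 *   t1 = t0 & 0x1f00   = 0x0300   (the two 'a' bits)
 *   t2 = in16 & 0x003f = 0x0029   (the low six 'b' bits)
 *   t3 = t1 | t2       = 0x0329
 *   t4 = t3 | 0xc080   = 0xC3A9
 * and the two bytes of t4, 0xC3 0xA9, are exactly the UTF-8 encoding of
 * U+00E9; the shuffle in step 4 emits them in the right order and drops the
 * unused high bytes of ASCII lanes.
 */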
/* end file src/arm64/arm_convert_latin1_to_utf8.cpp */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
/* begin file src/arm64/arm_convert_utf8_to_latin1.cpp */
// Convert up to 16 bytes from utf8 to latin1 using a mask indicating the
// end of the code points. Only the least significant 12 bits of the mask
// are accessed.
// It returns how many bytes were consumed (up to 16, usually 12).
size_t convert_masked_utf8_to_latin1(const char *input,
                                     uint64_t utf8_end_of_code_point_mask,
                                     char *&latin1_output) {
  // we use an approach where we try to process up to 12 input bytes.
  // Why 12 input bytes and not 16? Because we are concerned with the size of
  // the lookup tables. Also 12 is nicely divisible by two and three.
  //
  uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t *>(input));
  const uint16_t input_utf8_end_of_code_point_mask =
      utf8_end_of_code_point_mask & 0xfff;
  //
  // Optimization note: our main path below is load-latency dependent. Thus it
  // is maybe beneficial to have fast paths that depend on branch prediction
  // but have less latency. This results in more instructions but, potentially,
  // also higher speeds.

  // We first try a few fast paths.
  // The obvious first test is ASCII, for which we consume 12 bytes.
  if (utf8_end_of_code_point_mask == 0xfff) {
    // We process in chunks of 12 bytes
    vst1q_u8(reinterpret_cast<uint8_t *>(latin1_output), in);
    latin1_output += 12; // We wrote 12 8-bit characters.
    return 12;           // We consumed 12 bytes.
  }
  /// We do not have a fast path available, or the fast path is unimportant,
  /// so we fall back.
  const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
      [input_utf8_end_of_code_point_mask][0];

  const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
      [input_utf8_end_of_code_point_mask][1];
  // this indicates an invalid input:
  if (idx >= 64) {
    return consumed;
  }
  // Here we have (idx < 64): we process SIX (6) input code units. The max
  // length in bytes of six code units spanning between 1 and 2 bytes each is
  // 12 bytes.
  uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
      simdutf::tables::utf8_to_utf16::shufutf8[idx]));
  // Shuffle
  // 1 byte: 00000000 0bbbbbbb
  // 2 byte: 110aaaaa 10bbbbbb
  uint16x8_t perm = vreinterpretq_u16_u8(vqtbl1q_u8(in, sh));
  // Mask
  // 1 byte: 00000000 0bbbbbbb
  // 2 byte: 00000000 00bbbbbb
  uint16x8_t ascii = vandq_u16(perm, vmovq_n_u16(0x7f)); // 6 or 7 bits
  // 1 byte: 00000000 00000000
  // 2 byte: 000aaaaa 00000000
  uint16x8_t highbyte = vandq_u16(perm, vmovq_n_u16(0x1f00)); // 5 bits
  // Combine with a shift right accumulate
  // 1 byte: 00000000 0bbbbbbb
  // 2 byte: 00000aaa aabbbbbb
  uint16x8_t composed = vsraq_n_u16(ascii, highbyte, 2);
  // writing 8 bytes even though we only care about the first 6 bytes.
  uint8x8_t latin1_packed = vmovn_u16(composed);
  vst1_u8(reinterpret_cast<uint8_t *>(latin1_output), latin1_packed);
  latin1_output += 6; // We wrote 6 bytes.
  return consumed;
}
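
// Illustrative note (not in the original): in utf8_end_of_code_point_mask,
// bit i is set when byte i ends a code point. Twelve ASCII bytes give 0xfff
// (all twelve bits set), while six 2-byte sequences give
// 0xaaa (binary 1010'1010'1010), which selects the shuffle that gathers each
// byte pair into one 16-bit lane.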
/* end file src/arm64/arm_convert_utf8_to_latin1.cpp */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
/* begin file src/arm64/arm_convert_utf8_to_utf16.cpp */
// Convert up to 16 bytes from utf8 to utf16 using a mask indicating the
// end of the code points. Only the least significant 12 bits of the mask
// are accessed.
// It returns how many bytes were consumed (up to 16, usually 12).
template <endianness big_endian>
size_t convert_masked_utf8_to_utf16(const char *input,
                                    uint64_t utf8_end_of_code_point_mask,
                                    char16_t *&utf16_output) {
  // we use an approach where we try to process up to 12 input bytes.
  // Why 12 input bytes and not 16? Because we are concerned with the size of
  // the lookup tables. Also 12 is nicely divisible by two and three.
  //
  uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t *>(input));
  const uint16_t input_utf8_end_of_code_point_mask =
      utf8_end_of_code_point_mask & 0xfff;
  //
  // Optimization note: our main path below is load-latency dependent. Thus it
  // is maybe beneficial to have fast paths that depend on branch prediction
  // but have less latency. This results in more instructions but, potentially,
  // also higher speeds.

  // We first try a few fast paths.
  // The obvious first test is ASCII, which actually consumes the full 16.
  if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xffff) {
    // We process in chunks of 16 bytes
    // The routine in simd.h is reused.
    simd8<int8_t> temp{vreinterpretq_s8_u8(in)};
    temp.store_ascii_as_utf16<big_endian>(utf16_output);
    utf16_output += 16; // We wrote 16 16-bit characters.
    return 16;          // We consumed 16 bytes.
  }

  // 3 byte sequences are the next most common, as seen in CJK, which has
  // long sequences of these.
  if (input_utf8_end_of_code_point_mask == 0x924) {
    // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte
    // UTF-16 code units.
    uint16x4_t composed = convert_utf8_3_byte_to_utf16(in);
    // Byte swap if necessary
    if (!match_system(big_endian)) {
      composed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(composed)));
    }
    vst1_u16(reinterpret_cast<uint16_t *>(utf16_output), composed);
    utf16_output += 4; // We wrote 4 16-bit characters.
    return 12;         // We consumed 12 bytes.
  }

  // 2 byte sequences occur in short bursts in languages like Greek and
  // Russian.
  if ((utf8_end_of_code_point_mask & 0xFFF) == 0xaaa) {
    // We want to take 6 2-byte UTF-8 code units and turn them into 6 2-byte
    // UTF-16 code units.
    uint16x8_t composed = convert_utf8_2_byte_to_utf16(in);
    // Byte swap if necessary
    if (!match_system(big_endian)) {
      composed =
          vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed)));
    }
    vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output), composed);

    utf16_output += 6; // We wrote 6 16-bit characters.
    return 12;         // We consumed 12 bytes.
  }

  /// We do not have a fast path available, or the fast path is unimportant,
  /// so we fall back.
  const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
      [input_utf8_end_of_code_point_mask][0];

  const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
      [input_utf8_end_of_code_point_mask][1];

  if (idx < 64) {
    // SIX (6) input code units
    // Convert to UTF-16
    uint16x8_t composed = convert_utf8_1_to_2_byte_to_utf16(in, idx);
    // Byte swap if necessary
    if (!match_system(big_endian)) {
      composed =
          vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed)));
    }
    // Store
    vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output), composed);
    utf16_output += 6; // We wrote 6 16-bit characters.
    return consumed;
  } else if (idx < 145) {
    // FOUR (4) input code units
    // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the
    // narrowing.
    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
        simdutf::tables::utf8_to_utf16::shufutf8[idx]));
    // XXX: depending on the system scalar instructions might be faster.
    // 1 byte: 00000000 00000000 0ccccccc
    // 2 byte: 00000000 110bbbbb 10cccccc
    // 3 byte: 1110aaaa 10bbbbbb 10cccccc
    uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
    // 1 byte: 00000000 0ccccccc
    // 2 byte: xx0bbbbb x0cccccc
    // 3 byte: xxbbbbbb x0cccccc
    uint16x4_t lowperm = vmovn_u32(perm);
    // Partially mask with bic (doesn't require a temporary register unlike
    // and). The shift left insert below will clear the top bits.
    // 1 byte: 00000000 00000000
    // 2 byte: xx0bbbbb 00000000
    // 3 byte: xxbbbbbb 00000000
    uint16x4_t middlebyte = vbic_u16(lowperm, vmov_n_u16(uint16_t(~0xFF00)));
    // ASCII
    // 1 byte: 00000000 0ccccccc
    // 2+byte: 00000000 00cccccc
    uint16x4_t ascii = vand_u16(lowperm, vmov_n_u16(0x7F));
    // Split into narrow vectors.
    // 2 byte: 00000000 00000000
    // 3 byte: 00000000 xxxxaaaa
    uint16x4_t highperm = vshrn_n_u32(perm, 16);
    // Shift right accumulate the middle byte
    // 1 byte: 00000000 0ccccccc
    // 2 byte: 00xx0bbb bbcccccc
    // 3 byte: 00xxbbbb bbcccccc
    uint16x4_t middlelow = vsra_n_u16(ascii, middlebyte, 2);
    // Shift left and insert the top 4 bits, overwriting the garbage
    // 1 byte: 00000000 0ccccccc
    // 2 byte: 00000bbb bbcccccc
    // 3 byte: aaaabbbb bbcccccc
    uint16x4_t composed = vsli_n_u16(middlelow, highperm, 12);
    // Byte swap if necessary
    if (!match_system(big_endian)) {
      composed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(composed)));
    }
    vst1_u16(reinterpret_cast<uint16_t *>(utf16_output), composed);

    utf16_output += 4; // We wrote 4 16-bit code points
    return consumed;
  } else if (idx < 209) {
    // THREE (3) input code units
    if (input_utf8_end_of_code_point_mask == 0x888) {
      // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte
      // UTF-16 pairs. Generating surrogate pairs is a little tricky though,
      // but it is easier when we can assume they are all pairs. This version
      // does not use the LUT, but 4 byte sequences are less common and the
      // overhead of the extra memory access is less important than the early
      // branch overhead in shorter sequences.

      // Swap byte pairs
      // 10dddddd 10cccccc|10bbbbbb 11110aaa
      // 10cccccc 10dddddd|11110aaa 10bbbbbb
      uint8x16_t swap = vrev16q_u8(in);
      // Shift left 2 bits
      // cccccc00 dddddd00 xxxxxxxx bbbbbb00
      uint32x4_t shift = vreinterpretq_u32_u8(vshlq_n_u8(swap, 2));
      // Create a magic number containing the low 2 bits of the trail
      // surrogate and all the corrections needed to create the pair.
      // UTF-8 4b prefix   = -0x0000|0xF000
      // surrogate offset  = -0x0000|0x0040 (0x10000 << 6)
      // surrogate high    = +0x0000|0xD800
      // surrogate low     = +0xDC00|0x0000
      // -------------------------------
      //                   = +0xDC00|0xE7C0
      uint32x4_t magic = vmovq_n_u32(0xDC00E7C0);
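      // Illustrative check (not in the original): for U+1F600 (F0 9F 98 80),
      // the standard recipe is U' = U - 0x10000 = 0xF600, then
      // high = 0xD800 + (U' >> 10) = 0xD83D and
      // low  = 0xDC00 + (U' & 0x3FF) = 0xDE00. The magic constant above folds
      // the -0x10000 offset, the surrogate bases and the removal of the UTF-8
      // prefix bits into one 32-bit addition, so the blended lanes below come
      // out as exactly that pair.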
      // Generate unadjusted trail surrogate minus lowest 2 bits
      // xxxxxxxx xxxxxxxx|11110aaa bbbbbb00
      uint32x4_t trail =
          vbslq_u32(vmovq_n_u32(0x0000FF00), vreinterpretq_u32_u8(swap), shift);
      // Insert low 2 bits of trail surrogate to magic number for later
      // 11011100 00000000 11100111 110000cc
      uint16x8_t magic_with_low_2 =
          vreinterpretq_u16_u32(vsraq_n_u32(magic, shift, 30));
      // Generate lead surrogate
      // xxxxcccc ccdddddd|xxxxxxxx xxxxxxxx
      uint32x4_t lead = vreinterpretq_u32_u16(
          vsliq_n_u16(vreinterpretq_u16_u8(swap), vreinterpretq_u16_u8(in), 6));
      // Mask out lead
      // 000000cc ccdddddd|xxxxxxxx xxxxxxxx
      lead = vbicq_u32(lead, vmovq_n_u32(uint32_t(~0x03FFFFFF)));
      // Blend pairs
      // 000000cc ccdddddd|11110aaa bbbbbb00
      uint16x8_t blend = vreinterpretq_u16_u32(
          vbslq_u32(vmovq_n_u32(0x0000FFFF), trail, lead));
      // Add magic number to finish the result
      // 110111CC CCDDDDDD|110110AA BBBBBBCC
      uint16x8_t composed = vaddq_u16(blend, magic_with_low_2);
      // Byte swap if necessary
      if (!match_system(big_endian)) {
        composed =
            vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed)));
      }
      uint16_t buffer[8];
      vst1q_u16(reinterpret_cast<uint16_t *>(buffer), composed);
      for (int k = 0; k < 6; k++) {
        utf16_output[k] = buffer[k];
      } // the loop might compile to a couple of instructions.
      // We need some validation. See
      // https://github.com/simdutf/simdutf/pull/631
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
      uint8x16_t expected_mask = simdutf_make_uint8x16_t(
          0xf8, 0xc0, 0xc0, 0xc0, 0xf8, 0xc0, 0xc0, 0xc0, 0xf8, 0xc0, 0xc0,
          0xc0, 0x0, 0x0, 0x0, 0x0);
#else
      uint8x16_t expected_mask = {0xf8, 0xc0, 0xc0, 0xc0, 0xf8, 0xc0,
                                  0xc0, 0xc0, 0xf8, 0xc0, 0xc0, 0xc0,
                                  0x0,  0x0,  0x0,  0x0};
#endif
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
      uint8x16_t expected = simdutf_make_uint8x16_t(
          0xf0, 0x80, 0x80, 0x80, 0xf0, 0x80, 0x80, 0x80, 0xf0, 0x80, 0x80,
          0x80, 0x0, 0x0, 0x0, 0x0);
#else
      uint8x16_t expected = {0xf0, 0x80, 0x80, 0x80, 0xf0, 0x80, 0x80, 0x80,
                             0xf0, 0x80, 0x80, 0x80, 0x0,  0x0,  0x0,  0x0};
#endif
      uint8x16_t check = vceqq_u8(vandq_u8(in, expected_mask), expected);
      bool correct = (vminvq_u32(vreinterpretq_u32_u8(check)) == 0xFFFFFFFF);
      // The validation is just three instructions and it is not on a critical
      // path.
      if (correct) {
        utf16_output += 6; // We wrote 3 32-bit surrogate pairs.
      }
      return 12; // We consumed 12 bytes.
    }
    // 3 1-4 byte sequences
    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
        simdutf::tables::utf8_to_utf16::shufutf8[idx]));

    // 1 byte: 00000000 00000000 00000000 0ddddddd
    // 2 byte: 00000000 00000000 110ccccc 10dddddd
    // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
    // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
    uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
    // added to fix issue https://github.com/simdutf/simdutf/issues/514
    // We only want to write 2 * 16-bit code units when that is actually what
    // we have. Unfortunately, we cannot trust the input. So it is possible to
    // get 0xff as an input byte and it should not result in a surrogate pair.
    // We need to check for that.
    uint32_t permbuffer[4];
    vst1q_u32(permbuffer, perm);
    // Mask the low and middle bytes
    // 00000000 00000000 00000000 0ddddddd
    uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7f));
    // Because the surrogates need more work, the high surrogate is computed
    // first.
    uint32x4_t middlehigh = vshlq_n_u32(perm, 2);
    // 00000000 00000000 00cccccc 00000000
    uint32x4_t middlebyte = vandq_u32(perm, vmovq_n_u32(0x3F00));
    // Start assembling the sequence. Since the 4th byte is in the same
    // position as it would be in a surrogate and there is no dependency,
    // shift left instead of right.
    // 3 byte: 00000000 10bbbbxx xxxxxxxx xxxxxxxx
    // 4 byte: 11110aaa bbbbbbxx xxxxxxxx xxxxxxxx
    uint32x4_t ab = vbslq_u32(vmovq_n_u32(0xFF000000), perm, middlehigh);
    // Top 16 bits contain the high ten bits of the surrogate pair before
    // correction.
    // 3 byte: 00000000 10bbbbcc|cccc0000 00000000
    // 4 byte: 11110aaa bbbbbbcc|cccc0000 00000000 - high 10 bits correct w/o
    //         correction
    uint32x4_t abc =
        vbslq_u32(vmovq_n_u32(0xFFFC0000), ab, vshlq_n_u32(middlebyte, 4));
    // Combine the low 6 or 7 bits by a shift right accumulate
    // 3 byte: 00000000 00000010|bbbbcccc ccdddddd - low 16 bits correct
    // 4 byte: 00000011 110aaabb|bbbbcccc ccdddddd - low 10 bits correct w/o
    //         correction
    uint32x4_t composed = vsraq_n_u32(ascii, abc, 6);
    // After this is for surrogates
    // Blend the low and high surrogates
    // 4 byte: 11110aaa bbbbbbcc|bbbbcccc ccdddddd
    uint32x4_t mixed = vbslq_u32(vmovq_n_u32(0xFFFF0000), abc, composed);
    // Clear the upper 6 bits of the low surrogate. Don't clear the upper bits
    // yet as 0x10000 was not subtracted from the codepoint yet.
    // 4 byte: 11110aaa bbbbbbcc|000000cc ccdddddd
    uint16x8_t masked_pair = vreinterpretq_u16_u32(
        vbicq_u32(mixed, vmovq_n_u32(uint32_t(~0xFFFF03FF))));
    // Correct the remaining UTF-8 prefix, surrogate offset, and add the
    // surrogate prefixes in one magic 16-bit addition: a similar magic number,
    // but without the continuation byte adjust and halfword swapped.
    // UTF-8 4b prefix   = -0xF000|0x0000
    // surrogate offset  = -0x0040|0x0000 (0x10000 << 6)
    // surrogate high    = +0xD800|0x0000
    // surrogate low     = +0x0000|0xDC00
    // -----------------------------------
    //                   = +0xE7C0|0xDC00
    uint16x8_t magic = vreinterpretq_u16_u32(vmovq_n_u32(0xE7C0DC00));
    // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD - surrogate pair complete
    uint32x4_t surrogates =
        vreinterpretq_u32_u16(vaddq_u16(masked_pair, magic));
    // If the high bit is 1 (s32 less than zero), this needs a surrogate pair
    uint32x4_t is_pair = vcltzq_s32(vreinterpretq_s32_u32(perm));

    // Select either the 4 byte surrogate pair or the 2 byte solo codepoint
    // 3 byte: 0xxxxxxx xxxxxxxx|bbbbcccc ccdddddd
    // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD
    uint32x4_t selected = vbslq_u32(is_pair, surrogates, composed);
    // Byte swap if necessary
    if (!match_system(big_endian)) {
      selected =
          vreinterpretq_u32_u8(vrev16q_u8(vreinterpretq_u8_u32(selected)));
    }
    // Attempting to shuffle and store would be complex, just scalarize.
    uint32_t buffer[4];
    vst1q_u32(buffer, selected);
    // Test for the top bit of the surrogate mask. Removed due to issue 514.
    // const uint32_t SURROGATE_MASK = match_system(big_endian) ? 0x80000000 :
    // 0x00800000;
    for (size_t i = 0; i < 3; i++) {
      // Surrogate
      // Used to be if (buffer[i] & SURROGATE_MASK) {
      // See discussion above.
      // patch for issue https://github.com/simdutf/simdutf/issues/514
      if ((permbuffer[i] & 0xf8000000) == 0xf0000000) {
        utf16_output[0] = uint16_t(buffer[i] >> 16);
        utf16_output[1] = uint16_t(buffer[i] & 0xFFFF);
        utf16_output += 2;
      } else {
        utf16_output[0] = uint16_t(buffer[i] & 0xFFFF);
        utf16_output++;
      }
    }
    return consumed;
  } else {
    // here we know that there is an error but we do not handle errors
    return 12;
  }
}
/* end file src/arm64/arm_convert_utf8_to_utf16.cpp */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
|
|
/* begin file src/arm64/arm_convert_utf8_to_utf32.cpp */
|
|
// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
|
|
// end of the code points. Only the least significant 12 bits of the mask
|
|
// are accessed.
|
|
// It returns how many bytes were consumed (up to 12).
|
|
size_t convert_masked_utf8_to_utf32(const char *input,
|
|
uint64_t utf8_end_of_code_point_mask,
|
|
char32_t *&utf32_out) {
|
|
// we use an approach where we try to process up to 12 input bytes.
|
|
// Why 12 input bytes and not 16? Because we are concerned with the size of
|
|
// the lookup tables. Also 12 is nicely divisible by two and three.
|
|
//
|
|
uint32_t *&utf32_output = reinterpret_cast<uint32_t *&>(utf32_out);
|
|
uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t *>(input));
|
|
const uint16_t input_utf8_end_of_code_point_mask =
|
|
utf8_end_of_code_point_mask & 0xFFF;
|
|
//
|
|
// Optimization note: our main path below is load-latency dependent. Thus it
|
|
// is maybe beneficial to have fast paths that depend on branch prediction but
|
|
// have less latency. This results in more instructions but, potentially, also
|
|
// higher speeds.
|
|
//
|
|
// We first try a few fast paths.
|
|
if (utf8_end_of_code_point_mask == 0xfff) {
|
|
// We process in chunks of 12 bytes.
|
|
// use fast implementation in src/simdutf/arm64/simd.h
|
|
// Ideally the compiler can keep the tables in registers.
|
|
simd8<int8_t> temp{vreinterpretq_s8_u8(in)};
|
|
temp.store_ascii_as_utf32_tbl(utf32_out);
|
|
utf32_output += 12; // We wrote 12 32-bit characters.
|
|
return 12; // We consumed 12 bytes.
|
|
}
|
|
if (input_utf8_end_of_code_point_mask == 0x924) {
|
|
// We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte
|
|
// UTF-32 code units. Convert to UTF-16
|
|
uint16x4_t composed_utf16 = convert_utf8_3_byte_to_utf16(in);
|
|
// Zero extend and store via ST2 with a zero.
|
|
uint16x4x2_t interleaver = {{composed_utf16, vmov_n_u16(0)}};
|
|
vst2_u16(reinterpret_cast<uint16_t *>(utf32_output), interleaver);
|
|
utf32_output += 4; // We wrote 4 32-bit characters.
|
|
return 12; // We consumed 12 bytes.
|
|
}
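  // A short aside on the fast-path masks used in this function (a sketch
  // built only from values visible here): each set bit of
  // input_utf8_end_of_code_point_mask marks the last byte of a code point,
  // so 0xfff marks all 12 positions (pure ASCII), 0x924 marks every third
  // byte (four 3-byte sequences) and 0xaaa marks every second byte (six
  // 2-byte sequences).
  static_assert(0x924 == 0b100100100100 && 0xaaa == 0b101010101010,
                "fast-path masks select the 3-byte and 2-byte patterns");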

  // 2 byte sequences occur in short bursts in languages like Greek and Russian.
  if (input_utf8_end_of_code_point_mask == 0xaaa) {
    // We want to take 6 2-byte UTF-8 code units and turn them into 6 4-byte
    // UTF-32 code units. Convert to UTF-16
    uint16x8_t composed_utf16 = convert_utf8_2_byte_to_utf16(in);
    // Zero extend and store via ST2 with a zero.
    uint16x8x2_t interleaver = {{composed_utf16, vmovq_n_u16(0)}};
    vst2q_u16(reinterpret_cast<uint16_t *>(utf32_output), interleaver);
    utf32_output += 6; // We wrote 6 32-bit characters.
    return 12;         // We consumed 12 bytes.
  }
  /// Either no fast path or an unimportant fast path.

  const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
      [input_utf8_end_of_code_point_mask][0];
  const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
      [input_utf8_end_of_code_point_mask][1];

  if (idx < 64) {
    // SIX (6) input code units
    // Convert to UTF-16
    uint16x8_t composed_utf16 = convert_utf8_1_to_2_byte_to_utf16(in, idx);
    // Zero extend and store with ST2 and zero
    uint16x8x2_t interleaver = {{composed_utf16, vmovq_n_u16(0)}};
    vst2q_u16(reinterpret_cast<uint16_t *>(utf32_output), interleaver);
    utf32_output += 6; // We wrote 6 32-bit characters.
    return consumed;
  } else if (idx < 145) {
    // FOUR (4) input code units
    // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing.
    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
        simdutf::tables::utf8_to_utf16::shufutf8[idx]));
    // Shuffle
    // 1 byte: 00000000 00000000 0ccccccc
    // 2 byte: 00000000 110bbbbb 10cccccc
    // 3 byte: 1110aaaa 10bbbbbb 10cccccc
    uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
    // Split
    // 00000000 00000000 0ccccccc
    uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7F)); // 6 or 7 bits
    // Note: unmasked
    // xxxxxxxx aaaaxxxx xxxxxxxx
    uint32x4_t high = vshrq_n_u32(perm, 4); // 4 bits
    // Use 16 bit bic instead of and.
    // The top bits will be corrected later in the bsl
    // 00000000 10bbbbbb 00000000
    uint32x4_t middle = vreinterpretq_u32_u16(
        vbicq_u16(vreinterpretq_u16_u32(perm),
                  vmovq_n_u16(uint16_t(~0xff00)))); // 5 or 6 bits
    // Combine low and middle with shift right accumulate
    // 00000000 00xxbbbb bbcccccc
    uint32x4_t lowmid = vsraq_n_u32(ascii, middle, 2);
    // Insert top 4 bits from high byte with bitwise select
    // 00000000 aaaabbbb bbcccccc
    uint32x4_t composed = vbslq_u32(vmovq_n_u32(0x0000F000), high, lowmid);
    vst1q_u32(utf32_output, composed);
    utf32_output += 4; // We wrote 4 32-bit characters.
    return consumed;
  } else if (idx < 209) {
    // THREE (3) input code units
    if (input_utf8_end_of_code_point_mask == 0x888) {
      // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte
      // UTF-32 code units. This uses the same method as the fixed 3 byte
      // version, reversing and shift left insert. However, there is no need for
      // a shuffle mask now, just rev16 and rev32.
      //
      // This version does not use the LUT, but 4 byte sequences are less common
      // and the overhead of the extra memory access is less important than the
      // early branch overhead in shorter sequences, so it comes last.

      // Swap pairs of bytes
      // 10dddddd|10cccccc|10bbbbbb|11110aaa
      // 10cccccc 10dddddd|11110aaa 10bbbbbb
      uint16x8_t swap1 = vreinterpretq_u16_u8(vrev16q_u8(in));
      // Shift left and insert
      // xxxxcccc ccdddddd|xxxxxxxa aabbbbbb
      uint16x8_t merge1 = vsliq_n_u16(swap1, vreinterpretq_u16_u8(in), 6);
      // Swap 16-bit lanes
      // xxxxcccc ccdddddd xxxxxxxa aabbbbbb
      // xxxxxxxa aabbbbbb xxxxcccc ccdddddd
      uint32x4_t swap2 = vreinterpretq_u32_u16(vrev32q_u16(merge1));
      // Shift insert again
      // xxxxxxxx xxxaaabb bbbbcccc ccdddddd
      uint32x4_t merge2 = vsliq_n_u32(swap2, vreinterpretq_u32_u16(merge1), 12);
      // Clear the garbage
      // 00000000 000aaabb bbbbcccc ccdddddd
      uint32x4_t composed = vandq_u32(merge2, vmovq_n_u32(0x1FFFFF));
      // Store
      vst1q_u32(utf32_output, composed);

      utf32_output += 3; // We wrote 3 32-bit characters.
      return 12;         // We consumed 12 bytes.
    }
    // Unlike UTF-16, doing a fast codepath doesn't have nearly as much benefit
    // due to surrogates no longer being involved.
    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
        simdutf::tables::utf8_to_utf16::shufutf8[idx]));
    // 1 byte: 00000000 00000000 00000000 0ddddddd
    // 2 byte: 00000000 00000000 110ccccc 10dddddd
    // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
    // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
    uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
    // Ascii
    uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7F));
    uint32x4_t middle = vandq_u32(perm, vmovq_n_u32(0x3f00));
    // When converting the way we do, the 3 byte prefix will be interpreted as
    // the 18th bit being set, since the code would interpret the lead byte
    // (0b1110bbbb) as a continuation byte (0b10bbbbbb). To fix this, we can
    // either xor or do an 8 bit add of the 6th bit shifted right by 1. Since
    // NEON has shift right accumulate, we use that.
    //  4 byte   3 byte
    // 10bbbbbb 1110bbbb
    // 00000000 01000000 6th bit
    // 00000000 00100000 shift right
    // 10bbbbbb 0000bbbb add
    // 00bbbbbb 0000bbbb mask
    uint8x16_t correction =
        vreinterpretq_u8_u32(vandq_u32(perm, vmovq_n_u32(0x00400000)));
    uint32x4_t corrected = vreinterpretq_u32_u8(
        vsraq_n_u8(vreinterpretq_u8_u32(perm), correction, 1));
    // 00000000 00000000 0000cccc ccdddddd
    uint32x4_t cd = vsraq_n_u32(ascii, middle, 2);
    // Insert twice
    // xxxxxxxx xxxaaabb bbbbxxxx xxxxxxxx
    uint32x4_t ab = vbslq_u32(vmovq_n_u32(0x01C0000), vshrq_n_u32(corrected, 6),
                              vshrq_n_u32(corrected, 4));
    // 00000000 000aaabb bbbbcccc ccdddddd
    uint32x4_t composed = vbslq_u32(vmovq_n_u32(0xFFE00FFF), cd, ab);
    // Store
    vst1q_u32(utf32_output, composed);
    utf32_output += 3; // We wrote 3 32-bit characters.
    return consumed;
  } else {
    // here we know that there is an error but we do not handle errors
    return 12;
  }
}
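// For reference, a compile-time sketch of the bit gathering this function
// performs: a 4-byte UTF-8 sequence 11110aaa 10bbbbbb 10cccccc 10dddddd
// decodes to the 21-bit scalar value aaa bbbbbb cccccc dddddd. Using the
// bytes F0 9F 98 80 of U+1F600:
static_assert((((0xF0u & 0x07) << 18) | ((0x9Fu & 0x3F) << 12) |
               ((0x98u & 0x3F) << 6) | (0x80u & 0x3F)) == 0x1F600u,
              "UTF-8 payload bits concatenate to the code point");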
/* end file src/arm64/arm_convert_utf8_to_utf32.cpp */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
/* begin file src/arm64/arm_convert_utf16_to_latin1.cpp */

template <endianness big_endian>
std::pair<const char16_t *, char *>
arm_convert_utf16_to_latin1(const char16_t *buf, size_t len,
                            char *latin1_output) {
  const char16_t *end = buf + len;
  while (end - buf >= 8) {
    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
    if (!match_system(big_endian)) {
      in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in)));
    }
    if (vmaxvq_u16(in) <= 0xff) {
      // 1. pack the bytes
      uint8x8_t latin1_packed = vmovn_u16(in);
      // 2. store (8 bytes)
      vst1_u8(reinterpret_cast<uint8_t *>(latin1_output), latin1_packed);
      // 3. adjust pointers
      buf += 8;
      latin1_output += 8;
    } else {
      return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output));
    }
  } // while
  return std::make_pair(buf, latin1_output);
}
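// A note on the check above (it applies to the error-reporting variant below
// as well): vmaxvq_u16 reduces the vector to its maximum lane, so
// vmaxvq_u16(in) <= 0xff holds exactly when all eight code units fit in
// Latin-1, in which case vmovn_u16 can truncate each 16-bit unit to 8 bits
// without loss.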

template <endianness big_endian>
std::pair<result, char *>
arm_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
                                        char *latin1_output) {
  const char16_t *start = buf;
  const char16_t *end = buf + len;
  while (end - buf >= 8) {
    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
    if (!match_system(big_endian)) {
      in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in)));
    }
    if (vmaxvq_u16(in) <= 0xff) {
      // 1. pack the bytes
      uint8x8_t latin1_packed = vmovn_u16(in);
      // 2. store (8 bytes)
      vst1_u8(reinterpret_cast<uint8_t *>(latin1_output), latin1_packed);
      // 3. adjust pointers
      buf += 8;
      latin1_output += 8;
    } else {
      // Let us do a scalar fallback.
      for (int k = 0; k < 8; k++) {
        uint16_t word =
            !match_system(big_endian) ? scalar::u16_swap_bytes(buf[k]) : buf[k];
        if (word <= 0xff) {
          *latin1_output++ = char(word);
        } else {
          return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
                                latin1_output);
        }
      }
    }
  } // while
  return std::make_pair(result(error_code::SUCCESS, buf - start),
                        latin1_output);
}
/* end file src/arm64/arm_convert_utf16_to_latin1.cpp */
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
/* begin file src/arm64/arm_convert_utf16_to_utf32.cpp */
/*
    The vectorized algorithm works on a single 128-bit register, i.e., it
    loads eight 16-bit code units.

    We consider two cases:
    1. an input register contains no surrogates; each 16-bit code unit
       is simply zero-extended to a 32-bit code unit.
    2. an input register contains surrogates --- i.e. code points
       can have 16 or 32 bits; we defer to a scalar fallback.
*/
/*
  Returns a pair: the first unprocessed code unit from buf and utf32_output.
  A scalar routine should carry on the conversion of the tail.
*/
template <endianness big_endian>
std::pair<const char16_t *, char32_t *>
arm_convert_utf16_to_utf32(const char16_t *buf, size_t len,
                           char32_t *utf32_out) {
  uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
  const char16_t *end = buf + len;

  const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
  const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);

  while (end - buf >= 8) {
    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
    if (!match_system(big_endian)) {
      in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in)));
    }

    const uint16x8_t surrogates_bytemask =
        vceqq_u16(vandq_u16(in, v_f800), v_d800);
    // It might seem like checking for surrogates_bytemask == 0xc000 could
    // help. However, it is likely an uncommon occurrence.
    if (vmaxvq_u16(surrogates_bytemask) == 0) {
      // case: no surrogate pairs, extend all 16-bit code units to 32-bit code
      // units
      vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(in)));
      vst1q_u32(utf32_output + 4, vmovl_high_u16(in));
      utf32_output += 8;
      buf += 8;
      // surrogate pair(s) in a register
    } else {
      // Let us do a scalar fallback.
      // It may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint16_t word =
            !match_system(big_endian) ? scalar::u16_swap_bytes(buf[k]) : buf[k];
        if ((word & 0xF800) != 0xD800) {
          *utf32_output++ = char32_t(word);
        } else {
          // must be a surrogate pair
          uint16_t diff = uint16_t(word - 0xD800);
          uint16_t next_word = !match_system(big_endian)
                                   ? scalar::u16_swap_bytes(buf[k + 1])
                                   : buf[k + 1];
          k++;
          uint16_t diff2 = uint16_t(next_word - 0xDC00);
          if ((diff | diff2) > 0x3FF) {
            return std::make_pair(nullptr,
                                  reinterpret_cast<char32_t *>(utf32_output));
          }
          uint32_t value = (diff << 10) + diff2 + 0x10000;
          *utf32_output++ = char32_t(value);
        }
      }
      buf += k;
    }
  } // while
  return std::make_pair(buf, reinterpret_cast<char32_t *>(utf32_output));
}
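// The scalar reconstruction above follows the standard surrogate formula:
// code point = ((lead - 0xD800) << 10) + (trail - 0xDC00) + 0x10000.
// A compile-time sketch using U+1F600, encoded as 0xD83D 0xDE00:
static_assert(((0xD83Du - 0xD800u) << 10) + (0xDE00u - 0xDC00u) + 0x10000u ==
                  0x1F600u,
              "surrogate pair reconstruction");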

/*
  Returns a pair: a result struct and utf32_output.
  If there is an error, the count field of the result is the position of the
  error. Otherwise, it is the position of the first unprocessed code unit in
  buf (even if finished). A scalar routine should carry on the conversion of
  the tail if needed.
*/
template <endianness big_endian>
std::pair<result, char32_t *>
arm_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len,
                                       char32_t *utf32_out) {
  uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
  const char16_t *start = buf;
  const char16_t *end = buf + len;

  const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
  const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);

  while ((end - buf) >= 8) {
    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
    if (!match_system(big_endian)) {
      in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in)));
    }

    const uint16x8_t surrogates_bytemask =
        vceqq_u16(vandq_u16(in, v_f800), v_d800);
    // It might seem like checking for surrogates_bytemask == 0xc000 could
    // help. However, it is likely an uncommon occurrence.
    if (vmaxvq_u16(surrogates_bytemask) == 0) {
      // case: no surrogate pairs, extend all 16-bit code units to 32-bit code
      // units
      vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(in)));
      vst1q_u32(utf32_output + 4, vmovl_high_u16(in));
      utf32_output += 8;
      buf += 8;
      // surrogate pair(s) in a register
    } else {
      // Let us do a scalar fallback.
      // It may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint16_t word =
            !match_system(big_endian) ? scalar::u16_swap_bytes(buf[k]) : buf[k];
        if ((word & 0xF800) != 0xD800) {
          *utf32_output++ = char32_t(word);
        } else {
          // must be a surrogate pair
          uint16_t diff = uint16_t(word - 0xD800);
          uint16_t next_word = !match_system(big_endian)
                                   ? scalar::u16_swap_bytes(buf[k + 1])
                                   : buf[k + 1];
          k++;
          uint16_t diff2 = uint16_t(next_word - 0xDC00);
          if ((diff | diff2) > 0x3FF) {
            return std::make_pair(
                result(error_code::SURROGATE, buf - start + k - 1),
                reinterpret_cast<char32_t *>(utf32_output));
          }
          uint32_t value = (diff << 10) + diff2 + 0x10000;
          *utf32_output++ = char32_t(value);
        }
      }
      buf += k;
    }
  } // while
  return std::make_pair(result(error_code::SUCCESS, buf - start),
                        reinterpret_cast<char32_t *>(utf32_output));
}
/* end file src/arm64/arm_convert_utf16_to_utf32.cpp */
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF8
/* begin file src/arm64/arm_convert_utf16_to_utf8.cpp */
/*
    The vectorized algorithm works on a single 128-bit register, i.e., it
    loads eight 16-bit code units.

    We consider three cases:
    1. an input register contains no surrogates and each value
       is in range 0x0000 .. 0x07ff.
    2. an input register contains no surrogates and each value
       is in range 0x0000 .. 0xffff.
    3. an input register contains surrogates --- i.e. code points
       can have 16 or 32 bits.

    Ad 1.

    When values are less than 0x0800, it means that a 16-bit code unit
    can be converted into: 1) a single UTF8 byte (when it is an ASCII
    char) or 2) two UTF8 bytes.

    For this case we do only some shuffle to obtain these 2-byte
    codes and finally compress the whole register with a single
    shuffle.

    We need a 256-entry lookup table to get a compression pattern
    and the number of output bytes in the compressed vector register.
    Each entry occupies 17 bytes.

    Ad 2.

    When values fit in 16-bit code units, but are above 0x07ff, then
    a single word may produce one, two or three UTF8 bytes.

    We prepare data for all these three cases in two registers.
    The first register contains the lower two UTF8 bytes (used in all
    cases), while the second one contains just the third byte for
    the three-UTF8-bytes case.

    Finally these two registers are interleaved forming an eight-element
    array of 32-bit values. The array spans two 128-bit registers.
    The bytes from the registers are compressed using two shuffles.

    We need a 256-entry lookup table to get a compression pattern
    and the number of output bytes in the compressed vector register.
    Each entry occupies 17 bytes.


    To summarize:
    - We need two 256-entry tables that have 8704 bytes in total.
*/
/*
  Returns a pair: the first unprocessed code unit from buf and utf8_output.
  A scalar routine should carry on the conversion of the tail.
*/
template <endianness big_endian>
std::pair<const char16_t *, char *>
arm_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) {
  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
  const char16_t *end = buf + len;

  const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
  const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
  const size_t safety_margin =
      12; // to avoid overruns, see issue
          // https://github.com/simdutf/simdutf/issues/92
  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
    if (!match_system(big_endian)) {
      in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in)));
    }
    if (vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!!
      // It is common enough that we have sequences of 16 consecutive ASCII
      // characters.
      uint16x8_t nextin =
          vld1q_u16(reinterpret_cast<const uint16_t *>(buf) + 8);
      if (!match_system(big_endian)) {
        nextin = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(nextin)));
      }
      if (vmaxvq_u16(nextin) > 0x7F) {
        // 1. pack the bytes
        // obviously suboptimal.
        uint8x8_t utf8_packed = vmovn_u16(in);
        // 2. store (8 bytes)
        vst1_u8(utf8_output, utf8_packed);
        // 3. adjust pointers
        buf += 8;
        utf8_output += 8;
        in = nextin;
      } else {
        // 1. pack the bytes
        // obviously suboptimal.
        uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin);
        // 2. store (16 bytes)
        vst1q_u8(utf8_output, utf8_packed);
        // 3. adjust pointers
        buf += 16;
        utf8_output += 16;
        continue; // we are done for this round!
      }
    }

    if (vmaxvq_u16(in) <= 0x7FF) {
      // 1. prepare 2-byte values
      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
      // expected output   : [110a|aaaa|10bb|bbbb] x 8
      const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
      const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);

      // t0 = [000a|aaaa|bbbb|bb00]
      const uint16x8_t t0 = vshlq_n_u16(in, 2);
      // t1 = [000a|aaaa|0000|0000]
      const uint16x8_t t1 = vandq_u16(t0, v_1f00);
      // t2 = [0000|0000|00bb|bbbb]
      const uint16x8_t t2 = vandq_u16(in, v_003f);
      // t3 = [000a|aaaa|00bb|bbbb]
      const uint16x8_t t3 = vorrq_u16(t1, t2);
      // t4 = [110a|aaaa|10bb|bbbb]
      const uint16x8_t t4 = vorrq_u16(t3, v_c080);
      // 2. merge ASCII and 2-byte codewords
      const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
      const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
      const uint8x16_t utf8_unpacked =
          vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4));
      // 3. prepare bitmask for 8-bit lookup
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
      const uint16x8_t mask = simdutf_make_uint16x8_t(
          0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080);
#else
      const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040,
                               0x0002, 0x0008, 0x0020, 0x0080};
#endif
      uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
      // 4. pack the bytes
      const uint8_t *row =
          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
      const uint8x16_t shuffle = vld1q_u8(row + 1);
      const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);

      // 5. store bytes
      vst1q_u8(utf8_output, utf8_packed);

      // 6. adjust pointers
      buf += 8;
      utf8_output += row[0];
      continue;
    }
    const uint16x8_t surrogates_bytemask =
        vceqq_u16(vandq_u16(in, v_f800), v_d800);
    // It might seem like checking for surrogates_bytemask == 0xc000 could
    // help. However, it is likely an uncommon occurrence.
    if (vmaxvq_u16(surrogates_bytemask) == 0) {
      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
      const uint16x8_t dup_even = simdutf_make_uint16x8_t(
          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
#else
      const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
                                   0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
#endif
      /* In this branch we handle three cases:
         1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
            single UTF-8 byte
         2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two
            UTF-8 bytes
         3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
            three UTF-8 bytes

        We expand the input word (16-bit) into two code units (32-bit), thus
        we have room for four bytes. However, we need five distinct bit
        layouts. Note that the last byte in cases #2 and #3 is the same.

        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
        in register t2.

        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
        either byte 1 for case #2 or byte 2 for case #3. Note that they
        differ by exactly one bit.

        Finally from these two code units we build a proper UTF-8 sequence,
        taking into account the case (i.e., the number of bytes to write).
      */
      /**
       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
       * t2 => [0ccc|cccc] [10cc|cccc]
       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
       */
#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
      const uint16x8_t t0 = vreinterpretq_u16_u8(
          vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even)));
      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
      const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
      const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));

      // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
      const uint16x8_t s0 = vshrq_n_u16(in, 12);
      // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
      const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000));
      // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
      const uint16x8_t s1s = vshlq_n_u16(s1, 2);
      // [00bb|bbbb|0000|aaaa]
      const uint16x8_t s2 = vorrq_u16(s0, s1s);
      // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
      const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
      const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
      const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff);
      const uint16x8_t m0 =
          vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
      const uint16x8_t s4 = veorq_u16(s3, m0);
#undef simdutf_vec

      // 4. expand code units 16-bit => 32-bit
      const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
      const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));

      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
      const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
      const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
      const uint16x8_t onemask = simdutf_make_uint16x8_t(
          0x0001, 0x0004, 0x0010, 0x0040, 0x0100, 0x0400, 0x1000, 0x4000);
      const uint16x8_t twomask = simdutf_make_uint16x8_t(
          0x0002, 0x0008, 0x0020, 0x0080, 0x0200, 0x0800, 0x2000, 0x8000);
#else
      const uint16x8_t onemask = {0x0001, 0x0004, 0x0010, 0x0040,
                                  0x0100, 0x0400, 0x1000, 0x4000};
      const uint16x8_t twomask = {0x0002, 0x0008, 0x0020, 0x0080,
                                  0x0200, 0x0800, 0x2000, 0x8000};
#endif
      const uint16x8_t combined =
          vorrq_u16(vandq_u16(one_byte_bytemask, onemask),
                    vandq_u16(one_or_two_bytes_bytemask, twomask));
      const uint16_t mask = vaddvq_u16(combined);
      // The following fast path may or may not be beneficial.
      /*if(mask == 0) {
        // We only have three-byte code units. Use fast path.
        const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
        const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
        const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
        vst1q_u8(utf8_output, utf8_0);
        utf8_output += 12;
        vst1q_u8(utf8_output, utf8_1);
        utf8_output += 12;
        buf += 8;
        continue;
      }*/
      const uint8_t mask0 = uint8_t(mask);

      const uint8_t *row0 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
      const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
      const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);

      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
      const uint8_t *row1 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
      const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
      const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);

      vst1q_u8(utf8_output, utf8_0);
      utf8_output += row0[0];
      vst1q_u8(utf8_output, utf8_1);
      utf8_output += row1[0];

      buf += 8;
      // surrogate pair(s) in a register
    } else {
      // Let us do a scalar fallback.
      // It may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint16_t word =
            !match_system(big_endian) ? scalar::u16_swap_bytes(buf[k]) : buf[k];
        if ((word & 0xFF80) == 0) {
          *utf8_output++ = char(word);
        } else if ((word & 0xF800) == 0) {
          *utf8_output++ = char((word >> 6) | 0b11000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else if ((word & 0xF800) != 0xD800) {
          *utf8_output++ = char((word >> 12) | 0b11100000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else {
          // must be a surrogate pair
          uint16_t diff = uint16_t(word - 0xD800);
          uint16_t next_word = !match_system(big_endian)
                                   ? scalar::u16_swap_bytes(buf[k + 1])
                                   : buf[k + 1];
          k++;
          uint16_t diff2 = uint16_t(next_word - 0xDC00);
          if ((diff | diff2) > 0x3FF) {
            return std::make_pair(nullptr,
                                  reinterpret_cast<char *>(utf8_output));
          }
          uint32_t value = (diff << 10) + diff2 + 0x10000;
          *utf8_output++ = char((value >> 18) | 0b11110000);
          *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
          *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((value & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while

  return std::make_pair(buf, reinterpret_cast<char *>(utf8_output));
}
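// How the pack tables are used above (and in the error-reporting variant
// below): each row of pack_1_2_utf8_bytes / pack_1_2_3_utf8_bytes stores the
// number of output bytes in row[0], followed at row + 1 by a 16-byte shuffle
// pattern that vqtbl1q_u8 applies to drop the unused bytes of the expanded
// code units; the bitmask built from the per-lane byte counts selects the row.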

/*
  Returns a pair: a result struct and utf8_output.
  If there is an error, the count field of the result is the position of the
  error. Otherwise, it is the position of the first unprocessed code unit in
  buf (even if finished). A scalar routine should carry on the conversion of
  the tail if needed.
*/
template <endianness big_endian>
std::pair<result, char *>
arm_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len,
                                      char *utf8_out) {
  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
  const char16_t *start = buf;
  const char16_t *end = buf + len;

  const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
  const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
  const size_t safety_margin =
      12; // to avoid overruns, see issue
          // https://github.com/simdutf/simdutf/issues/92

  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
    if (!match_system(big_endian)) {
      in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in)));
    }
    if (vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!!
      // It is common enough that we have sequences of 16 consecutive ASCII
      // characters.
      uint16x8_t nextin =
          vld1q_u16(reinterpret_cast<const uint16_t *>(buf) + 8);
      if (!match_system(big_endian)) {
        nextin = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(nextin)));
      }
      if (vmaxvq_u16(nextin) > 0x7F) {
        // 1. pack the bytes
        // obviously suboptimal.
        uint8x8_t utf8_packed = vmovn_u16(in);
        // 2. store (8 bytes)
        vst1_u8(utf8_output, utf8_packed);
        // 3. adjust pointers
        buf += 8;
        utf8_output += 8;
        in = nextin;
      } else {
        // 1. pack the bytes
        // obviously suboptimal.
        uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin);
        // 2. store (16 bytes)
        vst1q_u8(utf8_output, utf8_packed);
        // 3. adjust pointers
        buf += 16;
        utf8_output += 16;
        continue; // we are done for this round!
      }
    }

    if (vmaxvq_u16(in) <= 0x7FF) {
      // 1. prepare 2-byte values
      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
      // expected output   : [110a|aaaa|10bb|bbbb] x 8
      const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
      const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);

      // t0 = [000a|aaaa|bbbb|bb00]
      const uint16x8_t t0 = vshlq_n_u16(in, 2);
      // t1 = [000a|aaaa|0000|0000]
      const uint16x8_t t1 = vandq_u16(t0, v_1f00);
      // t2 = [0000|0000|00bb|bbbb]
      const uint16x8_t t2 = vandq_u16(in, v_003f);
      // t3 = [000a|aaaa|00bb|bbbb]
      const uint16x8_t t3 = vorrq_u16(t1, t2);
      // t4 = [110a|aaaa|10bb|bbbb]
      const uint16x8_t t4 = vorrq_u16(t3, v_c080);
      // 2. merge ASCII and 2-byte codewords
      const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
      const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
      const uint8x16_t utf8_unpacked =
          vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4));
      // 3. prepare bitmask for 8-bit lookup
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
      const uint16x8_t mask = simdutf_make_uint16x8_t(
          0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080);
#else
      const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040,
                               0x0002, 0x0008, 0x0020, 0x0080};
#endif
      uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
      // 4. pack the bytes
      const uint8_t *row =
          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
      const uint8x16_t shuffle = vld1q_u8(row + 1);
      const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);

      // 5. store bytes
      vst1q_u8(utf8_output, utf8_packed);

      // 6. adjust pointers
      buf += 8;
      utf8_output += row[0];
      continue;
    }
    const uint16x8_t surrogates_bytemask =
        vceqq_u16(vandq_u16(in, v_f800), v_d800);
    // It might seem like checking for surrogates_bytemask == 0xc000 could
    // help. However, it is likely an uncommon occurrence.
    if (vmaxvq_u16(surrogates_bytemask) == 0) {
      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
      const uint16x8_t dup_even = simdutf_make_uint16x8_t(
          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
#else
      const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
                                   0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
#endif
      /* In this branch we handle three cases:
         1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
            single UTF-8 byte
         2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two
            UTF-8 bytes
         3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
            three UTF-8 bytes

        We expand the input word (16-bit) into two code units (32-bit), thus
        we have room for four bytes. However, we need five distinct bit
        layouts. Note that the last byte in cases #2 and #3 is the same.

        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
        in register t2.

        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
        either byte 1 for case #2 or byte 2 for case #3. Note that they
        differ by exactly one bit.

        Finally from these two code units we build a proper UTF-8 sequence,
        taking into account the case (i.e., the number of bytes to write).
      */
      /**
       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
       * t2 => [0ccc|cccc] [10cc|cccc]
       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
       */
#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
      const uint16x8_t t0 = vreinterpretq_u16_u8(
          vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even)));
      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
      const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
      const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));

      // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
      const uint16x8_t s0 = vshrq_n_u16(in, 12);
      // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
      const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000));
      // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
      const uint16x8_t s1s = vshlq_n_u16(s1, 2);
      // [00bb|bbbb|0000|aaaa]
      const uint16x8_t s2 = vorrq_u16(s0, s1s);
      // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
      const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
      const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
      const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff);
      const uint16x8_t m0 =
          vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
      const uint16x8_t s4 = veorq_u16(s3, m0);
#undef simdutf_vec

      // 4. expand code units 16-bit => 32-bit
      const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
      const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));

      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
      const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
      const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
      const uint16x8_t onemask = simdutf_make_uint16x8_t(
          0x0001, 0x0004, 0x0010, 0x0040, 0x0100, 0x0400, 0x1000, 0x4000);
      const uint16x8_t twomask = simdutf_make_uint16x8_t(
          0x0002, 0x0008, 0x0020, 0x0080, 0x0200, 0x0800, 0x2000, 0x8000);
#else
      const uint16x8_t onemask = {0x0001, 0x0004, 0x0010, 0x0040,
                                  0x0100, 0x0400, 0x1000, 0x4000};
      const uint16x8_t twomask = {0x0002, 0x0008, 0x0020, 0x0080,
                                  0x0200, 0x0800, 0x2000, 0x8000};
#endif
      const uint16x8_t combined =
          vorrq_u16(vandq_u16(one_byte_bytemask, onemask),
                    vandq_u16(one_or_two_bytes_bytemask, twomask));
      const uint16_t mask = vaddvq_u16(combined);
      // The following fast path may or may not be beneficial.
      /*if(mask == 0) {
        // We only have three-byte code units. Use fast path.
        const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
        const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
        const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
        vst1q_u8(utf8_output, utf8_0);
        utf8_output += 12;
        vst1q_u8(utf8_output, utf8_1);
        utf8_output += 12;
        buf += 8;
        continue;
      }*/
      const uint8_t mask0 = uint8_t(mask);

      const uint8_t *row0 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
      const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
      const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);

      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
      const uint8_t *row1 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
      const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
      const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);

      vst1q_u8(utf8_output, utf8_0);
      utf8_output += row0[0];
      vst1q_u8(utf8_output, utf8_1);
      utf8_output += row1[0];

      buf += 8;
      // surrogate pair(s) in a register
    } else {
      // Let us do a scalar fallback.
      // It may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint16_t word =
            !match_system(big_endian) ? scalar::u16_swap_bytes(buf[k]) : buf[k];
        if ((word & 0xFF80) == 0) {
          *utf8_output++ = char(word);
        } else if ((word & 0xF800) == 0) {
          *utf8_output++ = char((word >> 6) | 0b11000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else if ((word & 0xF800) != 0xD800) {
          *utf8_output++ = char((word >> 12) | 0b11100000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else {
          // must be a surrogate pair
          uint16_t diff = uint16_t(word - 0xD800);
          uint16_t next_word = !match_system(big_endian)
                                   ? scalar::u16_swap_bytes(buf[k + 1])
                                   : buf[k + 1];
          k++;
          uint16_t diff2 = uint16_t(next_word - 0xDC00);
          if ((diff | diff2) > 0x3FF) {
            return std::make_pair(
                result(error_code::SURROGATE, buf - start + k - 1),
                reinterpret_cast<char *>(utf8_output));
          }
          uint32_t value = (diff << 10) + diff2 + 0x10000;
          *utf8_output++ = char((value >> 18) | 0b11110000);
          *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
          *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((value & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while

  return std::make_pair(result(error_code::SUCCESS, buf - start),
                        reinterpret_cast<char *>(utf8_output));
}

template <endianness big_endian>
simdutf_really_inline size_t
arm64_utf8_length_from_utf16_bytemask(const char16_t *in, size_t size) {
  size_t pos = 0;

  constexpr size_t N = 8;
  const auto one = vmovq_n_u16(1);
  // each char16 yields at least one byte
  size_t count = size / N * N;

  for (; pos < size / N * N; pos += N) {
    auto input = vld1q_u16(reinterpret_cast<const uint16_t *>(in + pos));
    if (!match_system(big_endian)) {
      input = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(input)));
    }
    // 0xd800 .. 0xdbff - high (leading) surrogate
    // 0xdc00 .. 0xdfff - low (trailing) surrogate
    const auto is_surrogate =
        vceqq_u16(vandq_u16(input, vmovq_n_u16(0xf800)), vmovq_n_u16(0xd800));

    // c0 - chars that yield 2- or 3-byte UTF-8 codes
    const auto c0 = vminq_u16(vandq_u16(input, vmovq_n_u16(0xff80)), one);

    // c1 - chars that yield 3-byte UTF-8 codes (including surrogates)
    const auto c1 = vminq_u16(vandq_u16(input, vmovq_n_u16(0xf800)), one);

    /*
      An explanation of how the counting works.

      In the case of a non-surrogate character we count:
      * always 1 -- see how `count` is initialized above;
      * c0 = 1 if the current char yields 2 or 3 bytes;
      * c1 = 1 if the current char yields 3 bytes.

      Thus, we always have the correct count for the current char:
      1, 2 or 3 bytes.

      A trickier part is how we count surrogate pairs. Whenever
      we encounter a surrogate (leading or trailing), we count it as
      3 bytes and then subtract 1 (`is_surrogate` is -1 or 0).
      Each surrogate char thus yields 2. A surrogate pair, that is,
      a leading surrogate followed by a trailing one, yields
      the expected 4 bytes.

      It also correctly handles cases when the leading surrogate is
      processed by this loop, but the trailing surrogate is counted
      by the scalar procedure. The scalar procedure uses exactly
      the described approach, so for valid UTF-16 strings it always
      counts correctly.
    */
    auto v_count = vaddq_u16(c1, c0);
    v_count = vaddq_u16(v_count, is_surrogate);
    count += vaddlvq_u16(v_count);
  }
  return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos,
                                                                   size - pos);
}
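// A compile-time sketch of the counting scheme above (cost per 16-bit unit is
// 1 + c0 + c1, minus 1 for a surrogate): ASCII 'A' costs 1, U+00E9 costs 2,
// U+20AC costs 3, and each half of a surrogate pair costs 2, so a pair costs
// the expected 4 UTF-8 bytes.
static_assert((1 + 0 + 0) == 1 && (1 + 1 + 0) == 2 && (1 + 1 + 1) == 3 &&
                  2 * (1 + 1 + 1 - 1) == 4,
              "per-unit UTF-8 length contributions");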
/* end file src/arm64/arm_convert_utf16_to_utf8.cpp */
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_BASE64
/* begin file src/arm64/arm_base64.cpp */
/**
 * References and further reading:
 *
 * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
 * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
 * https://arxiv.org/abs/1910.05109
 *
 * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
 * Instructions, ACM Transactions on the Web 12 (3), 2018.
 * https://arxiv.org/abs/1704.00605
 *
 * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
 * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
 * Request for Comments: 4648.
 *
 * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
 * http://www.alfredklomp.com/programming/sse-base64/. (2014).
 *
 * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
 * acceleration. https://github.com/aklomp/base64. (2014).
 *
 * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
 * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
 *
 * Nick Kopp. 2013. Base64 Encoding on a GPU.
 * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
 */

size_t encode_base64(char *dst, const char *src, size_t srclen,
                     base64_options options) {
  // credit: Wojciech Muła
  uint8_t *out = (uint8_t *)dst;
  constexpr static uint8_t source_table[64] = {
      'A', 'Q', 'g', 'w', 'B', 'R', 'h', 'x', 'C', 'S', 'i', 'y', 'D',
      'T', 'j', 'z', 'E', 'U', 'k', '0', 'F', 'V', 'l', '1', 'G', 'W',
      'm', '2', 'H', 'X', 'n', '3', 'I', 'Y', 'o', '4', 'J', 'Z', 'p',
      '5', 'K', 'a', 'q', '6', 'L', 'b', 'r', '7', 'M', 'c', 's', '8',
      'N', 'd', 't', '9', 'O', 'e', 'u', '+', 'P', 'f', 'v', '/',
  };
  constexpr static uint8_t source_table_url[64] = {
      'A', 'Q', 'g', 'w', 'B', 'R', 'h', 'x', 'C', 'S', 'i', 'y', 'D',
      'T', 'j', 'z', 'E', 'U', 'k', '0', 'F', 'V', 'l', '1', 'G', 'W',
      'm', '2', 'H', 'X', 'n', '3', 'I', 'Y', 'o', '4', 'J', 'Z', 'p',
      '5', 'K', 'a', 'q', '6', 'L', 'b', 'r', '7', 'M', 'c', 's', '8',
      'N', 'd', 't', '9', 'O', 'e', 'u', '-', 'P', 'f', 'v', '_',
  };
  const uint8x16_t v3f = vdupq_n_u8(0x3f);
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
  // When trying to load a uint8_t array, Visual Studio might
  // error with: error C2664: '__n128x4 neon_ld4m_q8(const char *)':
  // cannot convert argument 1 from 'const uint8_t [64]' to 'const char *'
  const uint8x16x4_t table = vld4q_u8(
      (reinterpret_cast<const char *>(options & base64_url) ? source_table_url
                                                            : source_table));
#else
  const uint8x16x4_t table =
      vld4q_u8((options & base64_url) ? source_table_url : source_table);
#endif
  size_t i = 0;
  for (; i + 16 * 3 <= srclen; i += 16 * 3) {
    const uint8x16x3_t in = vld3q_u8((const uint8_t *)src + i);
    uint8x16x4_t result;
    result.val[0] = vshrq_n_u8(in.val[0], 2);
    result.val[1] =
        vandq_u8(vsliq_n_u8(vshrq_n_u8(in.val[1], 4), in.val[0], 4), v3f);
    result.val[2] =
        vandq_u8(vsliq_n_u8(vshrq_n_u8(in.val[2], 6), in.val[1], 2), v3f);
    result.val[3] = vandq_u8(in.val[2], v3f);
    result.val[0] = vqtbl4q_u8(table, result.val[0]);
    result.val[1] = vqtbl4q_u8(table, result.val[1]);
    result.val[2] = vqtbl4q_u8(table, result.val[2]);
    result.val[3] = vqtbl4q_u8(table, result.val[3]);
    vst4q_u8(out, result);
    out += 64;
  }

  if (i + 24 <= srclen) {
    const uint8x8_t v3f_d = vdup_n_u8(0x3f);
    const uint8x8x3_t in = vld3_u8((const uint8_t *)src + i);
    uint8x8x4_t result;
    result.val[0] = vshr_n_u8(in.val[0], 2);
    result.val[1] =
        vand_u8(vsli_n_u8(vshr_n_u8(in.val[1], 4), in.val[0], 4), v3f_d);
    result.val[2] =
        vand_u8(vsli_n_u8(vshr_n_u8(in.val[2], 6), in.val[1], 2), v3f_d);
    result.val[3] = vand_u8(in.val[2], v3f_d);
    result.val[0] = vqtbl4_u8(table, result.val[0]);
    result.val[1] = vqtbl4_u8(table, result.val[1]);
    result.val[2] = vqtbl4_u8(table, result.val[2]);
    result.val[3] = vqtbl4_u8(table, result.val[3]);
    vst4_u8(out, result);
    out += 32;
    i += 24;
  }

  out += scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i,
                                            options);

  return size_t((char *)out - dst);
}
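// A compile-time sketch of the 3-byte -> 4-sextet split performed above,
// using the classic RFC 4648 example "Man" (0x4D 0x61 0x6E), whose sextets
// 19, 22, 5 and 46 encode as "TWFu":
static_assert((0x4Du >> 2) == 19 &&
                  (((0x4Du & 0x03) << 4) | (0x61u >> 4)) == 22 &&
                  (((0x61u & 0x0F) << 2) | (0x6Eu >> 6)) == 5 &&
                  (0x6Eu & 0x3F) == 46,
              "base64 sextet extraction for 'Man' -> 'TWFu'");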

static inline void compress(uint8x16_t data, uint16_t mask, char *output) {
  if (mask == 0) {
    vst1q_u8((uint8_t *)output, data);
    return;
  }
  uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
  uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
  uint64x2_t compactmasku64 = {tables::base64::thintable_epi8[mask1],
                               tables::base64::thintable_epi8[mask2]};
  uint8x16_t compactmask = vreinterpretq_u8_u64(compactmasku64);
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
  const uint8x16_t off =
      simdutf_make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8);
#else
  const uint8x16_t off = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
#endif

  compactmask = vaddq_u8(compactmask, off);
  uint8x16_t pruned = vqtbl1q_u8(data, compactmask);

  int pop1 = tables::base64::BitsSetTable256mul2[mask1];
  // Then load the corresponding mask; it writes only the first pop1 bytes
  // from the first 8 bytes, and then fills in with the bytes from the
  // second 8 bytes plus some filler at the end.
  compactmask = vld1q_u8(tables::base64::pshufb_combine_table + pop1 * 8);
  uint8x16_t answer = vqtbl1q_u8(pruned, compactmask);
  vst1q_u8((uint8_t *)output, answer);
}
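// The two-step compaction above is the standard "thin table" trick (a sketch
// based on how the tables are used here): each thintable_epi8 entry is an
// 8-byte shuffle that packs the kept bytes of one 8-byte half to the front,
// and pshufb_combine_table then glues the two halves together based on how
// many bytes survived in the first half.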

struct block64 {
  uint8x16_t chunks[4];
};

static_assert(sizeof(block64) == 64, "block64 is not 64 bytes");
template <bool base64_url> uint64_t to_base64_mask(block64 *b, bool *error) {
  uint8x16_t v0f = vdupq_n_u8(0xf);

  uint8x16_t underscore0, underscore1, underscore2, underscore3;
  if (base64_url) {
    underscore0 = vceqq_u8(b->chunks[0], vdupq_n_u8(0x5f));
    underscore1 = vceqq_u8(b->chunks[1], vdupq_n_u8(0x5f));
    underscore2 = vceqq_u8(b->chunks[2], vdupq_n_u8(0x5f));
    underscore3 = vceqq_u8(b->chunks[3], vdupq_n_u8(0x5f));
  } else {
    (void)underscore0;
    (void)underscore1;
    (void)underscore2;
    (void)underscore3;
  }

  uint8x16_t lo_nibbles0 = vandq_u8(b->chunks[0], v0f);
  uint8x16_t lo_nibbles1 = vandq_u8(b->chunks[1], v0f);
  uint8x16_t lo_nibbles2 = vandq_u8(b->chunks[2], v0f);
  uint8x16_t lo_nibbles3 = vandq_u8(b->chunks[3], v0f);

  // Needed by the decoding step.
  uint8x16_t hi_nibbles0 = vshrq_n_u8(b->chunks[0], 4);
  uint8x16_t hi_nibbles1 = vshrq_n_u8(b->chunks[1], 4);
  uint8x16_t hi_nibbles2 = vshrq_n_u8(b->chunks[2], 4);
  uint8x16_t hi_nibbles3 = vshrq_n_u8(b->chunks[3], 4);
  uint8x16_t lut_lo;
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
  if (base64_url) {
    lut_lo =
        simdutf_make_uint8x16_t(0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
                                0x70, 0x61, 0xe1, 0xf4, 0xe5, 0xa5, 0xf4, 0xf4);
  } else {
    lut_lo =
        simdutf_make_uint8x16_t(0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
                                0x70, 0x61, 0xe1, 0xb4, 0xe5, 0xe5, 0xf4, 0xb4);
  }
#else
  if (base64_url) {
    lut_lo = uint8x16_t{0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
                        0x70, 0x61, 0xe1, 0xf4, 0xe5, 0xa5, 0xf4, 0xf4};
  } else {
    lut_lo = uint8x16_t{0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
                        0x70, 0x61, 0xe1, 0xb4, 0xe5, 0xe5, 0xf4, 0xb4};
  }
#endif
  uint8x16_t lo0 = vqtbl1q_u8(lut_lo, lo_nibbles0);
  uint8x16_t lo1 = vqtbl1q_u8(lut_lo, lo_nibbles1);
  uint8x16_t lo2 = vqtbl1q_u8(lut_lo, lo_nibbles2);
  uint8x16_t lo3 = vqtbl1q_u8(lut_lo, lo_nibbles3);
  uint8x16_t lut_hi;
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
  if (base64_url) {
    lut_hi =
        simdutf_make_uint8x16_t(0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4,
                                0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20);
  } else {
    lut_hi =
        simdutf_make_uint8x16_t(0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4,
                                0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20);
  }
#else
  if (base64_url) {
    lut_hi = uint8x16_t{0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4,
                        0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20};
  } else {
    lut_hi = uint8x16_t{0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4,
                        0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20};
  }
#endif
  uint8x16_t hi0 = vqtbl1q_u8(lut_hi, hi_nibbles0);
  uint8x16_t hi1 = vqtbl1q_u8(lut_hi, hi_nibbles1);
  uint8x16_t hi2 = vqtbl1q_u8(lut_hi, hi_nibbles2);
  uint8x16_t hi3 = vqtbl1q_u8(lut_hi, hi_nibbles3);

  if (base64_url) {
    hi0 = vbicq_u8(hi0, underscore0);
    hi1 = vbicq_u8(hi1, underscore1);
    hi2 = vbicq_u8(hi2, underscore2);
    hi3 = vbicq_u8(hi3, underscore3);
  }

  uint8_t checks =
      vmaxvq_u8(vorrq_u8(vorrq_u8(vandq_u8(lo0, hi0), vandq_u8(lo1, hi1)),
                         vorrq_u8(vandq_u8(lo2, hi2), vandq_u8(lo3, hi3))));
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
  const uint8x16_t bit_mask =
      simdutf_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                              0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
#else
  const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                               0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
#endif
  uint64_t badcharmask = 0;
  *error = checks > 0x3;
  if (checks) {
    // Pairwise add neighboring elements, successively, to pack the four
    // 16-byte masks into a single 64-bit mask.
    uint8x16_t test0 = vtstq_u8(lo0, hi0);
    uint8x16_t test1 = vtstq_u8(lo1, hi1);
    uint8x16_t test2 = vtstq_u8(lo2, hi2);
    uint8x16_t test3 = vtstq_u8(lo3, hi3);
    uint8x16_t sum0 =
        vpaddq_u8(vandq_u8(test0, bit_mask), vandq_u8(test1, bit_mask));
    uint8x16_t sum1 =
        vpaddq_u8(vandq_u8(test2, bit_mask), vandq_u8(test3, bit_mask));
    sum0 = vpaddq_u8(sum0, sum1);
    sum0 = vpaddq_u8(sum0, sum0);
    badcharmask = vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
  }
  // This is the transformation step that can be done while we are waiting for
  // sum0.
  uint8x16_t roll_lut;
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
  if (base64_url) {
    roll_lut =
        simdutf_make_uint8x16_t(0xe0, 0x11, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
                                0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0);
  } else {
    roll_lut =
        simdutf_make_uint8x16_t(0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
                                0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0);
  }
#else
  if (base64_url) {
    roll_lut = uint8x16_t{0xe0, 0x11, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
                          0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
  } else {
    roll_lut = uint8x16_t{0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
                          0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
  }
#endif
  uint8x16_t vsecond_last = base64_url ? vdupq_n_u8(0x2d) : vdupq_n_u8(0x2f);
  if (base64_url) {
    hi_nibbles0 = vbicq_u8(hi_nibbles0, underscore0);
    hi_nibbles1 = vbicq_u8(hi_nibbles1, underscore1);
    hi_nibbles2 = vbicq_u8(hi_nibbles2, underscore2);
    hi_nibbles3 = vbicq_u8(hi_nibbles3, underscore3);
  }
  uint8x16_t roll0 = vqtbl1q_u8(
      roll_lut, vaddq_u8(vceqq_u8(b->chunks[0], vsecond_last), hi_nibbles0));
  uint8x16_t roll1 = vqtbl1q_u8(
      roll_lut, vaddq_u8(vceqq_u8(b->chunks[1], vsecond_last), hi_nibbles1));
  uint8x16_t roll2 = vqtbl1q_u8(
      roll_lut, vaddq_u8(vceqq_u8(b->chunks[2], vsecond_last), hi_nibbles2));
  uint8x16_t roll3 = vqtbl1q_u8(
      roll_lut, vaddq_u8(vceqq_u8(b->chunks[3], vsecond_last), hi_nibbles3));
  b->chunks[0] = vaddq_u8(b->chunks[0], roll0);
  b->chunks[1] = vaddq_u8(b->chunks[1], roll1);
  b->chunks[2] = vaddq_u8(b->chunks[2], roll2);
  b->chunks[3] = vaddq_u8(b->chunks[3], roll3);
  return badcharmask;
}

void copy_block(block64 *b, char *output) {
  vst1q_u8((uint8_t *)output, b->chunks[0]);
  vst1q_u8((uint8_t *)output + 16, b->chunks[1]);
  vst1q_u8((uint8_t *)output + 32, b->chunks[2]);
  vst1q_u8((uint8_t *)output + 48, b->chunks[3]);
}

uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
  uint64_t popcounts =
      vget_lane_u64(vreinterpret_u64_u8(vcnt_u8(vcreate_u8(~mask))), 0);
  uint64_t offsets = popcounts * 0x0101010101010101;
  compress(b->chunks[0], uint16_t(mask), output);
  compress(b->chunks[1], uint16_t(mask >> 16), &output[(offsets >> 8) & 0xFF]);
  compress(b->chunks[2], uint16_t(mask >> 32), &output[(offsets >> 24) & 0xFF]);
  compress(b->chunks[3], uint16_t(mask >> 48), &output[(offsets >> 40) & 0xFF]);
  return offsets >> 56;
}
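// Why the multiply works (a compile-time sketch): vcnt_u8 stores in each byte
// of `popcounts` how many of the corresponding 8 lanes are kept, and
// multiplying by 0x0101010101010101 turns byte k of the product into the sum
// of bytes 0..k, i.e. running output offsets; byte 7 holds the total.
static_assert((0x0000000000000102ULL * 0x0101010101010101ULL) >> 56 == 0x03,
              "multiplying by 0x01...01 prefix-sums the bytes");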

// The caller of this function is responsible for ensuring that there are 64
// bytes available from reading at src. The data is read into a block64
// structure.
void load_block(block64 *b, const char *src) {
  b->chunks[0] = vld1q_u8(reinterpret_cast<const uint8_t *>(src));
  b->chunks[1] = vld1q_u8(reinterpret_cast<const uint8_t *>(src) + 16);
  b->chunks[2] = vld1q_u8(reinterpret_cast<const uint8_t *>(src) + 32);
  b->chunks[3] = vld1q_u8(reinterpret_cast<const uint8_t *>(src) + 48);
}

// The caller of this function is responsible for ensuring that there are 32
// bytes available from reading at data. It returns a 16-byte value, narrowing
// the 16-bit words with saturation.
inline uint8x16_t load_satured(const uint16_t *data) {
  uint16x8_t in1 = vld1q_u16(data);
  uint16x8_t in2 = vld1q_u16(data + 8);
  return vqmovn_high_u16(vqmovn_u16(in1), in2);
}

// The caller of this function is responsible for ensuring that there are 128
// bytes available from reading at src. The data is read into a block64
// structure.
void load_block(block64 *b, const char16_t *src) {
  b->chunks[0] = load_satured(reinterpret_cast<const uint16_t *>(src));
  b->chunks[1] = load_satured(reinterpret_cast<const uint16_t *>(src) + 16);
  b->chunks[2] = load_satured(reinterpret_cast<const uint16_t *>(src) + 32);
  b->chunks[3] = load_satured(reinterpret_cast<const uint16_t *>(src) + 48);
}

// decode 64 bytes and output 48 bytes
void base64_decode_block(char *out, const char *src) {
  uint8x16x4_t str = vld4q_u8((uint8_t *)src);
  uint8x16x3_t outvec;
  outvec.val[0] = vsliq_n_u8(vshrq_n_u8(str.val[1], 4), str.val[0], 2);
  outvec.val[1] = vsliq_n_u8(vshrq_n_u8(str.val[2], 2), str.val[1], 4);
  outvec.val[2] = vsliq_n_u8(str.val[3], str.val[2], 6);
  vst3q_u8((uint8_t *)out, outvec);
}
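// A compile-time sketch of the 4-sextet -> 3-byte repack done above with
// shift-left-and-insert: the sextets 19, 22, 5, 46 ("TWFu") recombine into
// the bytes of "Man" (0x4D 0x61 0x6E):
static_assert(((19u << 2) | (22u >> 4)) == 0x4D &&
                  (((22u & 0x0F) << 4) | (5u >> 2)) == 0x61 &&
                  (((5u & 0x03) << 6) | 46u) == 0x6E,
              "base64 sextets recombine into the original bytes");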
static size_t compress_block_single(block64 *b, uint64_t mask, char *output) {
  const size_t pos64 = trailing_zeroes(mask);
  const int8_t pos = pos64 & 0xf;

  // Predefine the index vector
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
  const uint8x16_t v1 = simdutf_make_uint8x16_t(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                10, 11, 12, 13, 14, 15);
#else // SIMDUTF_REGULAR_VISUAL_STUDIO
  const uint8x16_t v1 = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
#endif // SIMDUTF_REGULAR_VISUAL_STUDIO

  switch (pos64 >> 4) {
  case 0b00: {
    const uint8x16_t v0 = vmovq_n_u8((uint8_t)(pos - 1));
    const uint8x16_t v2 =
        vcgtq_s8(vreinterpretq_s8_u8(v1),
                 vreinterpretq_s8_u8(v0)); // Compare greater than
    const uint8x16_t sh = vsubq_u8(v1, v2); // Subtract
    const uint8x16_t compressed =
        vqtbl1q_u8(b->chunks[0], sh); // Table lookup (shuffle)

    vst1q_u8((uint8_t *)(output + 0 * 16), compressed);
    vst1q_u8((uint8_t *)(output + 1 * 16 - 1), b->chunks[1]);
    vst1q_u8((uint8_t *)(output + 2 * 16 - 1), b->chunks[2]);
    vst1q_u8((uint8_t *)(output + 3 * 16 - 1), b->chunks[3]);
  } break;

  case 0b01: {
    vst1q_u8((uint8_t *)(output + 0 * 16), b->chunks[0]);

    const uint8x16_t v0 = vmovq_n_u8((uint8_t)(pos - 1));
    const uint8x16_t v2 =
        vcgtq_s8(vreinterpretq_s8_u8(v1), vreinterpretq_s8_u8(v0));
    const uint8x16_t sh = vsubq_u8(v1, v2);
    const uint8x16_t compressed = vqtbl1q_u8(b->chunks[1], sh);

    vst1q_u8((uint8_t *)(output + 1 * 16), compressed);
    vst1q_u8((uint8_t *)(output + 2 * 16 - 1), b->chunks[2]);
    vst1q_u8((uint8_t *)(output + 3 * 16 - 1), b->chunks[3]);
  } break;

  case 0b10: {
    vst1q_u8((uint8_t *)(output + 0 * 16), b->chunks[0]);
    vst1q_u8((uint8_t *)(output + 1 * 16), b->chunks[1]);

    const uint8x16_t v0 = vmovq_n_u8((uint8_t)(pos - 1));
    const uint8x16_t v2 =
        vcgtq_s8(vreinterpretq_s8_u8(v1), vreinterpretq_s8_u8(v0));
    const uint8x16_t sh = vsubq_u8(v1, v2);
    const uint8x16_t compressed = vqtbl1q_u8(b->chunks[2], sh);

    vst1q_u8((uint8_t *)(output + 2 * 16), compressed);
    vst1q_u8((uint8_t *)(output + 3 * 16 - 1), b->chunks[3]);
  } break;

  case 0b11: {
    vst1q_u8((uint8_t *)(output + 0 * 16), b->chunks[0]);
    vst1q_u8((uint8_t *)(output + 1 * 16), b->chunks[1]);
    vst1q_u8((uint8_t *)(output + 2 * 16), b->chunks[2]);

    const uint8x16_t v0 = vmovq_n_u8((uint8_t)(pos - 1));
    const uint8x16_t v2 =
        vcgtq_s8(vreinterpretq_s8_u8(v1), vreinterpretq_s8_u8(v0));
    const uint8x16_t sh = vsubq_u8(v1, v2);
    const uint8x16_t compressed = vqtbl1q_u8(b->chunks[3], sh);

    vst1q_u8((uint8_t *)(output + 3 * 16), compressed);
  } break;
  }
  return 63;
}
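
// Note that is_power_of_two(x) is also true for x == 0; in this file it is
// only applied to a badcharmask already known to be nonzero, where a true
// result means a single flagged byte.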
template <typename T> bool is_power_of_two(T x) { return (x & (x - 1)) == 0; }
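
// An editorial summary of the decoder below: it stages decoded 6-bit values
// in a stack buffer of block_size * 64 bytes. Whitespace (and, with
// ignore_garbage, any non-base64 byte) is squeezed out with compress_block or
// compress_block_single, and whenever (block_size - 1) full 64-byte blocks
// have accumulated they are decoded into 48-byte chunks of output by
// base64_decode_block.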
template <bool base64_url, bool ignore_garbage, typename char_type>
full_result
compress_decode_base64(char *dst, const char_type *src, size_t srclen,
                       base64_options options,
                       last_chunk_handling_options last_chunk_options) {
  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
                                        : tables::base64::to_base64_value;
  size_t equallocation =
      srclen; // location of the first padding character if any
  // skip trailing spaces
  while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
         to_base64[uint8_t(src[srclen - 1])] == 64) {
    srclen--;
  }
  size_t equalsigns = 0;
  if (srclen > 0 && src[srclen - 1] == '=') {
    equallocation = srclen - 1;
    srclen--;
    equalsigns = 1;
    // skip trailing spaces
    while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
           to_base64[uint8_t(src[srclen - 1])] == 64) {
      srclen--;
    }
    if (srclen > 0 && src[srclen - 1] == '=') {
      equallocation = srclen - 1;
      srclen--;
      equalsigns = 2;
    }
  }
  if (srclen == 0) {
    if (!ignore_garbage && equalsigns > 0) {
      if (last_chunk_options == last_chunk_handling_options::strict) {
        return {BASE64_INPUT_REMAINDER, 0, 0};
      } else if (last_chunk_options ==
                 last_chunk_handling_options::stop_before_partial) {
        return {SUCCESS, 0, 0};
      }
      return {INVALID_BASE64_CHARACTER, equallocation, 0};
    }
    return {SUCCESS, 0, 0};
  }
  const char_type *const srcinit = src;
  const char *const dstinit = dst;
  const char_type *const srcend = src + srclen;

  constexpr size_t block_size = 10;
  char buffer[block_size * 64];
  char *bufferptr = buffer;
  if (srclen >= 64) {
    const char_type *const srcend64 = src + srclen - 64;
    while (src <= srcend64) {
      block64 b;
      load_block(&b, src);
      src += 64;
      bool error = false;
      uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
      if (badcharmask) {
        if (error && !ignore_garbage) {
          src -= 64;
          while (src < srcend && scalar::base64::is_eight_byte(*src) &&
                 to_base64[uint8_t(*src)] <= 64) {
            src++;
          }
          if (src < srcend) {
            // should never happen
          }
          return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
                  size_t(dst - dstinit)};
        }
      }

      if (badcharmask != 0) {
        // optimization opportunity: check for simple masks like those made of
        // continuous 1s followed by continuous 0s. And masks containing a
        // single bad character.
        if (is_power_of_two(badcharmask)) {
          bufferptr += compress_block_single(&b, badcharmask, bufferptr);
        } else {
          bufferptr += compress_block(&b, badcharmask, bufferptr);
        }
      } else {
        // optimization opportunity: if bufferptr == buffer and mask == 0, we
        // can avoid the call to compress_block and decode directly.
        copy_block(&b, bufferptr);
        bufferptr += 64;
      }
      if (bufferptr >= (block_size - 1) * 64 + buffer) {
        for (size_t i = 0; i < (block_size - 1); i++) {
          base64_decode_block(dst, buffer + i * 64);
          dst += 48;
        }
        std::memcpy(buffer, buffer + (block_size - 1) * 64,
                    64); // 64 might be too much
        bufferptr -= (block_size - 1) * 64;
      }
    }
  }
  char *buffer_start = buffer;
  // Optimization note: if this is almost full, then it is worth our
  // time, otherwise, we should just decode directly.
  int last_block = (int)((bufferptr - buffer_start) % 64);
  if (last_block != 0 && srcend - src + last_block >= 64) {
    while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
      uint8_t val = to_base64[uint8_t(*src)];
      *bufferptr = char(val);
      if ((!scalar::base64::is_eight_byte(*src) || val > 64) &&
          !ignore_garbage) {
        return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
                size_t(dst - dstinit)};
      }
      bufferptr += (val <= 63);
      src++;
    }
  }

  for (; buffer_start + 64 <= bufferptr; buffer_start += 64) {
    base64_decode_block(dst, buffer_start);
    dst += 48;
  }
  if ((bufferptr - buffer_start) % 64 != 0) {
    while (buffer_start + 4 < bufferptr) {
      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
                        << 8;
      triple = scalar::u32_swap_bytes(triple);
      std::memcpy(dst, &triple, 4);

      dst += 3;
      buffer_start += 4;
    }
    if (buffer_start + 4 <= bufferptr) {
      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
                        << 8;
      triple = scalar::u32_swap_bytes(triple);
      std::memcpy(dst, &triple, 3);

      dst += 3;
      buffer_start += 4;
    }
    // we may have 1, 2 or 3 bytes left and we need to decode them so let us
    // backtrack
    int leftover = int(bufferptr - buffer_start);
    while (leftover > 0) {
      if (!ignore_garbage) {
        while (to_base64[uint8_t(*(src - 1))] == 64) {
          src--;
        }
      } else {
        while (to_base64[uint8_t(*(src - 1))] >= 64) {
          src--;
        }
      }
      src--;
      leftover--;
    }
  }
  if (src < srcend + equalsigns) {
    full_result r = scalar::base64::base64_tail_decode(
        dst, src, srcend - src, equalsigns, options, last_chunk_options);
    r.input_count += size_t(src - srcinit);
    if (r.error == error_code::INVALID_BASE64_CHARACTER ||
        r.error == error_code::BASE64_EXTRA_BITS) {
      return r;
    } else {
      r.output_count += size_t(dst - dstinit);
    }
    if (last_chunk_options != stop_before_partial &&
        r.error == error_code::SUCCESS && equalsigns > 0 && !ignore_garbage) {
      // additional checks
      if ((r.output_count % 3 == 0) ||
          ((r.output_count % 3) + 1 + equalsigns != 4)) {
        r.error = error_code::INVALID_BASE64_CHARACTER;
        r.input_count = equallocation;
      }
    }
    return r;
  }
  if (equalsigns > 0 && !ignore_garbage) {
    if ((size_t(dst - dstinit) % 3 == 0) ||
        ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
      return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)};
    }
  }
  return {SUCCESS, srclen, size_t(dst - dstinit)};
}
/* end file src/arm64/arm_base64.cpp */
#endif // SIMDUTF_FEATURE_BASE64
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
/* begin file src/arm64/arm_convert_utf32_to_latin1.cpp */
std::pair<const char32_t *, char *>
arm_convert_utf32_to_latin1(const char32_t *buf, size_t len,
                            char *latin1_output) {
  const char32_t *end = buf + len;
  while (end - buf >= 8) {
    uint32x4_t in1 = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
    uint32x4_t in2 = vld1q_u32(reinterpret_cast<const uint32_t *>(buf + 4));

    uint16x8_t utf16_packed = vcombine_u16(vqmovn_u32(in1), vqmovn_u32(in2));
    if (vmaxvq_u16(utf16_packed) <= 0xff) {
      // 1. pack the bytes
      uint8x8_t latin1_packed = vmovn_u16(utf16_packed);
      // 2. store (8 bytes)
      vst1_u8(reinterpret_cast<uint8_t *>(latin1_output), latin1_packed);
      // 3. adjust pointers
      buf += 8;
      latin1_output += 8;
    } else {
      return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output));
    }
  } // while
  return std::make_pair(buf, latin1_output);
}
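
// Note: on non-Latin-1 input the fast path above simply reports failure with
// a nullptr first element, leaving it to the caller to handle the error; the
// _with_errors variant below instead switches to a scalar loop so it can
// report the exact index of the offending code point.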
std::pair<result, char *>
arm_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
                                        char *latin1_output) {
  const char32_t *start = buf;
  const char32_t *end = buf + len;

  while (end - buf >= 8) {
    uint32x4_t in1 = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
    uint32x4_t in2 = vld1q_u32(reinterpret_cast<const uint32_t *>(buf + 4));

    uint16x8_t utf16_packed = vcombine_u16(vqmovn_u32(in1), vqmovn_u32(in2));

    if (vmaxvq_u16(utf16_packed) <= 0xff) {
      // 1. pack the bytes
      uint8x8_t latin1_packed = vmovn_u16(utf16_packed);
      // 2. store (8 bytes)
      vst1_u8(reinterpret_cast<uint8_t *>(latin1_output), latin1_packed);
      // 3. adjust pointers
      buf += 8;
      latin1_output += 8;
    } else {
      // Let us do a scalar fallback.
      for (int k = 0; k < 8; k++) {
        uint32_t word = buf[k];
        if (word <= 0xff) {
          *latin1_output++ = char(word);
        } else {
          return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
                                latin1_output);
        }
      }
    }
  } // while
  return std::make_pair(result(error_code::SUCCESS, buf - start),
                        latin1_output);
}
/* end file src/arm64/arm_convert_utf32_to_latin1.cpp */
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_UTF16
/* begin file src/arm64/arm_convert_utf32_to_utf16.cpp */
struct expansion_result_t {
  size_t u16count;
  uint8x16_t compressed_v;
};
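
// invalid_utf32 checks eight code points at once: a lane is invalid when it
// exceeds 0x10FFFF or falls in the surrogate range [0xD800, 0xDFFF] (the
// vandq with 0xfffff800 followed by a compare against 0xd800). The result is
// one byte per lane, so a nonzero value flags an error and
// trailing_zeroes(err) / 8 -- as used further below -- gives the index of the
// first invalid lane.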
static simdutf_really_inline uint64_t invalid_utf32(const uint32x4x2_t in) {
  const auto standardmax = vdupq_n_u32(0x10ffff);
  const auto v_d800 = vdupq_n_u32(0xd800);
  const auto v_fffff800 = vdupq_n_u32(0xfffff800);
  const auto too_large1 = vcgtq_u32(in.val[0], standardmax);
  const auto too_large2 = vcgtq_u32(in.val[1], standardmax);
  const auto surrogate1 = vceqq_u32(vandq_u32(in.val[0], v_fffff800), v_d800);
  const auto surrogate2 = vceqq_u32(vandq_u32(in.val[1], v_fffff800), v_d800);
  const auto err1 = vorrq_u32(too_large1, surrogate1);
  const auto err2 = vorrq_u32(too_large2, surrogate2);
  const auto err =
      vuzp2q_u16(vreinterpretq_u16_u32(err1), vreinterpretq_u16_u32(err2));

  return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(err, 8)), 0);
}
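
// neon_expand_surrogate converts four UTF-32 code points to UTF-16. As an
// explanatory note: `mask` receives one bit per 32-bit lane, set when the
// lane is above 0xFFFF and therefore expands to a surrogate pair, so the
// routine emits 4 + popcount(mask) 16-bit code units; the shuffle table
// indexed by mask packs the mix of single units and pairs contiguously into
// compressed_v.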
template <endianness byte_order>
expansion_result_t neon_expand_surrogate(const uint32x4_t in) {
  const uint32x4_t v_ffff0000 = vdupq_n_u32(0xffff0000);
  const uint32x4_t non_surrogate_mask = vceqzq_u32(vandq_u32(in, v_ffff0000));
  const uint64_t cmp_bits =
      vget_lane_u64(vreinterpret_u64_u32(vshrn_n_u64(
                        vreinterpretq_u64_u32(non_surrogate_mask), 31)),
                    0);
  const uint8_t mask =
      uint8_t(~((cmp_bits & 0x3) | ((cmp_bits >> 30) & 0xc)) & 0xf);
  const uint32x4_t v_10000 = vdupq_n_u32(0x00010000);
  const uint32x4_t t0 = vsubq_u32(in, v_10000);
  const uint32x4_t t1 = vandq_u32(t0, vdupq_n_u32(0xfffff));
  const uint32x4_t t2 = vshrq_n_u32(t1, 10);
  const uint32x4_t t3 = vsliq_n_u32(t2, t1, 16);
  const uint32x4_t surrogates = vorrq_u32(
      vandq_u32(t3, vdupq_n_u32(0x03ff03ff)), vdupq_n_u32(0xdc00d800));
  const uint8x16_t merged =
      vreinterpretq_u8_u32(vbslq_u32(non_surrogate_mask, in, surrogates));

  const uint8x16_t shuffle_v = vld1q_u8(reinterpret_cast<const uint8_t *>(
      (byte_order == endianness::LITTLE)
          ? tables::utf32_to_utf16::pack_utf32_to_utf16le[mask]
          : tables::utf32_to_utf16::pack_utf32_to_utf16be[mask]));

  const size_t u16count = 4 + vget_lane_u8(vcnt_u8(vcreate_u8(mask)), 0);
  const uint8x16_t compressed_v = vqtbl1q_u8(merged, shuffle_v);

  return {u16count, compressed_v};
}
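
// Note on error handling in the two converters below:
// arm_convert_utf32_to_utf16 accumulates surrogate matches in
// forbidden_bytemask across iterations and tests the accumulated mask only
// once, after the loop, keeping the hot loop shorter; the _with_errors
// variant checks on every iteration because it must report a position.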
template <endianness big_endian>
std::pair<const char32_t *, char16_t *>
arm_convert_utf32_to_utf16(const char32_t *buf, size_t len,
                           char16_t *utf16_out) {
  uint16_t *utf16_output = reinterpret_cast<uint16_t *>(utf16_out);
  const char32_t *end = buf + len;

  uint16x8_t forbidden_bytemask = vmovq_n_u16(0x0);
  // To avoid buffer overflow while writing compressed_v
  const size_t safety_margin = 4;
  while (end - buf >= std::ptrdiff_t(8 + safety_margin)) {
    uint32x4x2_t in = vld1q_u32_x2(reinterpret_cast<const uint32_t *>(buf));

    // Check that no bits are set above the 16th bit.
    if (vmaxvq_u32(vorrq_u32(in.val[0], in.val[1])) <= 0xFFFF) {
      uint16x8_t utf16_packed = vuzp1q_u16(vreinterpretq_u16_u32(in.val[0]),
                                           vreinterpretq_u16_u32(in.val[1]));

      const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
      const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
      forbidden_bytemask =
          vorrq_u16(vceqq_u16(vandq_u16(utf16_packed, v_f800), v_d800),
                    forbidden_bytemask);

      if (!match_system(big_endian)) {
        utf16_packed = vreinterpretq_u16_u8(
            vrev16q_u8(vreinterpretq_u8_u16(utf16_packed)));
      }
      vst1q_u16(utf16_output, utf16_packed);
      utf16_output += 8;
      buf += 8;
    } else {
      const uint64_t err = invalid_utf32(in);
      if (simdutf_unlikely(err)) {
        return std::make_pair(nullptr,
                              reinterpret_cast<char16_t *>(utf16_output));
      }
      expansion_result_t res = neon_expand_surrogate<big_endian>(in.val[0]);
      vst1q_u8(reinterpret_cast<uint8_t *>(utf16_output), res.compressed_v);
      utf16_output += res.u16count;
      res = neon_expand_surrogate<big_endian>(in.val[1]);
      vst1q_u8(reinterpret_cast<uint8_t *>(utf16_output), res.compressed_v);
      utf16_output += res.u16count;
      buf += 8;
    }
  }

  // check for invalid input
  if (vmaxvq_u16(forbidden_bytemask) != 0) {
    return std::make_pair(nullptr, reinterpret_cast<char16_t *>(utf16_output));
  }

  return std::make_pair(buf, reinterpret_cast<char16_t *>(utf16_output));
}

template <endianness big_endian>
std::pair<result, char16_t *>
arm_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len,
                                       char16_t *utf16_out) {
  uint16_t *utf16_output = reinterpret_cast<uint16_t *>(utf16_out);
  const char32_t *start = buf;
  const char32_t *end = buf + len;

  // To avoid buffer overflow while writing compressed_v
  const size_t safety_margin = 4;
  while (end - buf >= std::ptrdiff_t(8 + safety_margin)) {
    uint32x4x2_t in = vld1q_u32_x2(reinterpret_cast<const uint32_t *>(buf));

    // Check that no bits are set above the 16th bit.
    if (vmaxvq_u32(vorrq_u32(in.val[0], in.val[1])) <= 0xFFFF) {
      uint16x8_t utf16_packed = vuzp1q_u16(vreinterpretq_u16_u32(in.val[0]),
                                           vreinterpretq_u16_u32(in.val[1]));

      const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
      const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
      const uint16x8_t forbidden_bytemask =
          vceqq_u16(vandq_u16(utf16_packed, v_f800), v_d800);
      if (vmaxvq_u16(forbidden_bytemask) != 0) {
        return std::make_pair(result(error_code::SURROGATE, buf - start),
                              reinterpret_cast<char16_t *>(utf16_output));
      }

      if (!match_system(big_endian)) {
        utf16_packed = vreinterpretq_u16_u8(
            vrev16q_u8(vreinterpretq_u8_u16(utf16_packed)));
      }
      vst1q_u16(utf16_output, utf16_packed);
      utf16_output += 8;
      buf += 8;
    } else {
      const uint64_t err = invalid_utf32(in);
      if (simdutf_unlikely(err)) {
        const size_t pos = trailing_zeroes(err) / 8;
        for (size_t k = 0; k < pos; k++) {
          uint32_t word = buf[k];
          if ((word & 0xFFFF0000) == 0) {
            // will not generate a surrogate pair
            *utf16_output++ = !match_system(big_endian)
                                  ? char16_t(word >> 8 | word << 8)
                                  : char16_t(word);
          } else {
            // will generate a surrogate pair
            word -= 0x10000;
            uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
            uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
            if (!match_system(big_endian)) {
              high_surrogate =
                  uint16_t(high_surrogate >> 8 | high_surrogate << 8);
              low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
            }
            *utf16_output++ = char16_t(high_surrogate);
            *utf16_output++ = char16_t(low_surrogate);
          }
        }
        const uint32_t word = buf[pos];
        const size_t error_pos = buf - start + pos;
        if (word > 0x10FFFF) {
          return {result(error_code::TOO_LARGE, error_pos),
                  reinterpret_cast<char16_t *>(utf16_output)};
        }
        if (word >= 0xD800 && word <= 0xDFFF) {
          return {result(error_code::SURROGATE, error_pos),
                  reinterpret_cast<char16_t *>(utf16_output)};
        }
        return {result(error_code::OTHER, error_pos),
                reinterpret_cast<char16_t *>(utf16_output)};
      }
      expansion_result_t res = neon_expand_surrogate<big_endian>(in.val[0]);
      vst1q_u8(reinterpret_cast<uint8_t *>(utf16_output), res.compressed_v);
      utf16_output += res.u16count;
      res = neon_expand_surrogate<big_endian>(in.val[1]);
      vst1q_u8(reinterpret_cast<uint8_t *>(utf16_output), res.compressed_v);
      utf16_output += res.u16count;
      buf += 8;
    }
  }

  return std::make_pair(result(error_code::SUCCESS, buf - start),
                        reinterpret_cast<char16_t *>(utf16_output));
}
/* end file src/arm64/arm_convert_utf32_to_utf16.cpp */
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_UTF8
/* begin file src/arm64/arm_convert_utf32_to_utf8.cpp */
std::pair<const char32_t *, char *>
arm_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) {
  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
  const char32_t *end = buf + len;

  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);

  uint16x8_t forbidden_bytemask = vmovq_n_u16(0x0);
  const size_t safety_margin =
      12; // to avoid overruns, see issue
          // https://github.com/simdutf/simdutf/issues/92

  while (buf + 16 + safety_margin < end) {
    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
    uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t *>(buf + 4));

    // Check that no bits are set above the 16th bit.
    if (vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) {
      // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
      // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp)
      uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin));
      if (vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!!
        // 1. pack the bytes
        // obviously suboptimal.
        uint8x8_t utf8_packed = vmovn_u16(utf16_packed);
        // 2. store (8 bytes)
        vst1_u8(utf8_output, utf8_packed);
        // 3. adjust pointers
        buf += 8;
        utf8_output += 8;
        continue; // we are done for this round!
      }

      if (vmaxvq_u16(utf16_packed) <= 0x7FF) {
        // 1. prepare 2-byte values
        // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
        // expected output   : [110a|aaaa|10bb|bbbb] x 8
        const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
        const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);

        // t0 = [000a|aaaa|bbbb|bb00]
        const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2);
        // t1 = [000a|aaaa|0000|0000]
        const uint16x8_t t1 = vandq_u16(t0, v_1f00);
        // t2 = [0000|0000|00bb|bbbb]
        const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f);
        // t3 = [000a|aaaa|00bb|bbbb]
        const uint16x8_t t3 = vorrq_u16(t1, t2);
        // t4 = [110a|aaaa|10bb|bbbb]
        const uint16x8_t t4 = vorrq_u16(t3, v_c080);
        // 2. merge ASCII and 2-byte codewords
        const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
        const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
        const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(
            vbslq_u16(one_byte_bytemask, utf16_packed, t4));
        // 3. prepare bitmask for 8-bit lookup
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
        const uint16x8_t mask = simdutf_make_uint16x8_t(
            0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080);
#else
        const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040,
                                 0x0002, 0x0008, 0x0020, 0x0080};
#endif
        uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
        // 4. pack the bytes
        const uint8_t *row =
            &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
        const uint8x16_t shuffle = vld1q_u8(row + 1);
        const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);

        // 5. store bytes
        vst1q_u8(utf8_output, utf8_packed);

        // 6. adjust pointers
        buf += 8;
        utf8_output += row[0];
        continue;
      } else {
        // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
        const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
        const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff);
        forbidden_bytemask =
            vorrq_u16(vandq_u16(vcleq_u16(utf16_packed, v_dfff),
                                vcgeq_u16(utf16_packed, v_d800)),
                      forbidden_bytemask);

#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
        const uint16x8_t dup_even = simdutf_make_uint16x8_t(
            0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
#else
        const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
                                     0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
#endif
        /* In this branch we handle three cases:
           1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
              single UTF-8 byte
           2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] -
              two UTF-8 bytes
           3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
              three UTF-8 bytes

          We expand the input word (16-bit) into two code units (32-bit), thus
          we have room for four bytes. However, we need five distinct bit
          layouts. Note that the last byte in cases #2 and #3 is the same.

          We precompute byte 1 for case #1 and the common byte for cases #2 &
          #3 in register t2.

          We precompute byte 1 for case #3 and -- **conditionally** --
          precompute either byte 1 for case #2 or byte 2 for case #3. Note that
          they differ by exactly one bit.

          Finally from these two code units we build proper UTF-8 sequence,
          taking into account the case (i.e., the number of bytes to write).
        */
        /**
         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
         * t2 => [0ccc|cccc] [10cc|cccc]
         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
         */
#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
        const uint16x8_t t0 =
            vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed),
                                            vreinterpretq_u8_u16(dup_even)));
        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
        const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
        const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));

        // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
        const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12);
        // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
        const uint16x8_t s1 =
            vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000));
        // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
        const uint16x8_t s1s = vshlq_n_u16(s1, 2);
        // [00bb|bbbb|0000|aaaa]
        const uint16x8_t s2 = vorrq_u16(s0, s1s);
        // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
        const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
        const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
        const uint16x8_t one_or_two_bytes_bytemask =
            vcleq_u16(utf16_packed, v_07ff);
        const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000),
                                        one_or_two_bytes_bytemask);
        const uint16x8_t s4 = veorq_u16(s3, m0);
#undef simdutf_vec

        // 4. expand code units 16-bit => 32-bit
        const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
        const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));

        // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
        const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
        const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
        const uint16x8_t onemask = simdutf_make_uint16x8_t(
            0x0001, 0x0004, 0x0010, 0x0040, 0x0100, 0x0400, 0x1000, 0x4000);
        const uint16x8_t twomask = simdutf_make_uint16x8_t(
            0x0002, 0x0008, 0x0020, 0x0080, 0x0200, 0x0800, 0x2000, 0x8000);
#else
        const uint16x8_t onemask = {0x0001, 0x0004, 0x0010, 0x0040,
                                    0x0100, 0x0400, 0x1000, 0x4000};
        const uint16x8_t twomask = {0x0002, 0x0008, 0x0020, 0x0080,
                                    0x0200, 0x0800, 0x2000, 0x8000};
#endif
        const uint16x8_t combined =
            vorrq_u16(vandq_u16(one_byte_bytemask, onemask),
                      vandq_u16(one_or_two_bytes_bytemask, twomask));
        const uint16_t mask = vaddvq_u16(combined);
        // The following fast path may or may not be beneficial.
        /*if(mask == 0) {
          // We only have three-byte code units. Use fast path.
          const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
          const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
          const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
          vst1q_u8(utf8_output, utf8_0);
          utf8_output += 12;
          vst1q_u8(utf8_output, utf8_1);
          utf8_output += 12;
          buf += 8;
          continue;
        }*/
        const uint8_t mask0 = uint8_t(mask);
        const uint8_t *row0 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
        const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
        const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);

        const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
        const uint8_t *row1 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
        const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
        const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);

        vst1q_u8(utf8_output, utf8_0);
        utf8_output += row0[0];
        vst1q_u8(utf8_output, utf8_1);
        utf8_output += row1[0];

        buf += 8;
      }
      // At least one 32-bit word will produce a surrogate pair in UTF-16 <=>
      // will produce four UTF-8 bytes.
    } else {
      // Let us do a scalar fallback.
      // It may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint32_t word = buf[k];
        if ((word & 0xFFFFFF80) == 0) {
          *utf8_output++ = char(word);
        } else if ((word & 0xFFFFF800) == 0) {
          *utf8_output++ = char((word >> 6) | 0b11000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else if ((word & 0xFFFF0000) == 0) {
          if (word >= 0xD800 && word <= 0xDFFF) {
            return std::make_pair(nullptr,
                                  reinterpret_cast<char *>(utf8_output));
          }
          *utf8_output++ = char((word >> 12) | 0b11100000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else {
          if (word > 0x10FFFF) {
            return std::make_pair(nullptr,
                                  reinterpret_cast<char *>(utf8_output));
          }
          *utf8_output++ = char((word >> 18) | 0b11110000);
          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while

  // check for invalid input
  if (vmaxvq_u16(forbidden_bytemask) != 0) {
    return std::make_pair(nullptr, reinterpret_cast<char *>(utf8_output));
  }
  return std::make_pair(buf, reinterpret_cast<char *>(utf8_output));
}

std::pair<result, char *>
arm_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
                                      char *utf8_out) {
  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
  const char32_t *start = buf;
  const char32_t *end = buf + len;

  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
  const size_t safety_margin =
      12; // to avoid overruns, see issue
          // https://github.com/simdutf/simdutf/issues/92

  while (buf + 16 + safety_margin < end) {
    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
    uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t *>(buf + 4));

    // Check that no bits are set above the 16th bit.
    if (vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) {
      // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
      // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp)
      uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin));
      if (vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!!
        // 1. pack the bytes
        // obviously suboptimal.
        uint8x8_t utf8_packed = vmovn_u16(utf16_packed);
        // 2. store (8 bytes)
        vst1_u8(utf8_output, utf8_packed);
        // 3. adjust pointers
        buf += 8;
        utf8_output += 8;
        continue; // we are done for this round!
      }

      if (vmaxvq_u16(utf16_packed) <= 0x7FF) {
        // 1. prepare 2-byte values
        // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
        // expected output   : [110a|aaaa|10bb|bbbb] x 8
        const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
        const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);

        // t0 = [000a|aaaa|bbbb|bb00]
        const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2);
        // t1 = [000a|aaaa|0000|0000]
        const uint16x8_t t1 = vandq_u16(t0, v_1f00);
        // t2 = [0000|0000|00bb|bbbb]
        const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f);
        // t3 = [000a|aaaa|00bb|bbbb]
        const uint16x8_t t3 = vorrq_u16(t1, t2);
        // t4 = [110a|aaaa|10bb|bbbb]
        const uint16x8_t t4 = vorrq_u16(t3, v_c080);
        // 2. merge ASCII and 2-byte codewords
        const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
        const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
        const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(
            vbslq_u16(one_byte_bytemask, utf16_packed, t4));
        // 3. prepare bitmask for 8-bit lookup
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
        const uint16x8_t mask = simdutf_make_uint16x8_t(
            0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080);
#else
        const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040,
                                 0x0002, 0x0008, 0x0020, 0x0080};
#endif
        uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
        // 4. pack the bytes
        const uint8_t *row =
            &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
        const uint8x16_t shuffle = vld1q_u8(row + 1);
        const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);

        // 5. store bytes
        vst1q_u8(utf8_output, utf8_packed);

        // 6. adjust pointers
        buf += 8;
        utf8_output += row[0];
        continue;
      } else {
        // case: code units from register produce either 1, 2 or 3 UTF-8 bytes

        // check for invalid input
        const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
        const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff);
        const uint16x8_t forbidden_bytemask = vandq_u16(
            vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800));
        if (vmaxvq_u16(forbidden_bytemask) != 0) {
          return std::make_pair(result(error_code::SURROGATE, buf - start),
                                reinterpret_cast<char *>(utf8_output));
        }

#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
        const uint16x8_t dup_even = simdutf_make_uint16x8_t(
            0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
#else
        const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
                                     0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
#endif
        /* In this branch we handle three cases:
           1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
              single UTF-8 byte
           2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] -
              two UTF-8 bytes
           3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
              three UTF-8 bytes

          We expand the input word (16-bit) into two code units (32-bit), thus
          we have room for four bytes. However, we need five distinct bit
          layouts. Note that the last byte in cases #2 and #3 is the same.

          We precompute byte 1 for case #1 and the common byte for cases #2 &
          #3 in register t2.

          We precompute byte 1 for case #3 and -- **conditionally** --
          precompute either byte 1 for case #2 or byte 2 for case #3. Note that
          they differ by exactly one bit.

          Finally from these two code units we build proper UTF-8 sequence,
          taking into account the case (i.e., the number of bytes to write).
        */
        /**
         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
         * t2 => [0ccc|cccc] [10cc|cccc]
         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
         */
#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
        const uint16x8_t t0 =
            vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed),
                                            vreinterpretq_u8_u16(dup_even)));
        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
        const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
        const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));

        // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
        const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12);
        // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
        const uint16x8_t s1 =
            vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000));
        // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
        const uint16x8_t s1s = vshlq_n_u16(s1, 2);
        // [00bb|bbbb|0000|aaaa]
        const uint16x8_t s2 = vorrq_u16(s0, s1s);
        // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
        const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
        const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
        const uint16x8_t one_or_two_bytes_bytemask =
            vcleq_u16(utf16_packed, v_07ff);
        const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000),
                                        one_or_two_bytes_bytemask);
        const uint16x8_t s4 = veorq_u16(s3, m0);
#undef simdutf_vec

        // 4. expand code units 16-bit => 32-bit
        const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
        const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));

        // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
        const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
        const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
        const uint16x8_t onemask = simdutf_make_uint16x8_t(
            0x0001, 0x0004, 0x0010, 0x0040, 0x0100, 0x0400, 0x1000, 0x4000);
        const uint16x8_t twomask = simdutf_make_uint16x8_t(
            0x0002, 0x0008, 0x0020, 0x0080, 0x0200, 0x0800, 0x2000, 0x8000);
#else
        const uint16x8_t onemask = {0x0001, 0x0004, 0x0010, 0x0040,
                                    0x0100, 0x0400, 0x1000, 0x4000};
        const uint16x8_t twomask = {0x0002, 0x0008, 0x0020, 0x0080,
                                    0x0200, 0x0800, 0x2000, 0x8000};
#endif
        const uint16x8_t combined =
            vorrq_u16(vandq_u16(one_byte_bytemask, onemask),
                      vandq_u16(one_or_two_bytes_bytemask, twomask));
        const uint16_t mask = vaddvq_u16(combined);
        // The following fast path may or may not be beneficial.
        /*if(mask == 0) {
          // We only have three-byte code units. Use fast path.
          const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
          const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
          const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
          vst1q_u8(utf8_output, utf8_0);
          utf8_output += 12;
          vst1q_u8(utf8_output, utf8_1);
          utf8_output += 12;
          buf += 8;
          continue;
        }*/
        const uint8_t mask0 = uint8_t(mask);

        const uint8_t *row0 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
        const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
        const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);

        const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
        const uint8_t *row1 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
        const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
        const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);

        vst1q_u8(utf8_output, utf8_0);
        utf8_output += row0[0];
        vst1q_u8(utf8_output, utf8_1);
        utf8_output += row1[0];

        buf += 8;
      }
      // At least one 32-bit word will produce a surrogate pair in UTF-16 <=>
      // will produce four UTF-8 bytes.
    } else {
      // Let us do a scalar fallback.
      // It may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint32_t word = buf[k];
        if ((word & 0xFFFFFF80) == 0) {
          *utf8_output++ = char(word);
        } else if ((word & 0xFFFFF800) == 0) {
          *utf8_output++ = char((word >> 6) | 0b11000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else if ((word & 0xFFFF0000) == 0) {
          if (word >= 0xD800 && word <= 0xDFFF) {
            return std::make_pair(
                result(error_code::SURROGATE, buf - start + k),
                reinterpret_cast<char *>(utf8_output));
          }
          *utf8_output++ = char((word >> 12) | 0b11100000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else {
          if (word > 0x10FFFF) {
            return std::make_pair(
                result(error_code::TOO_LARGE, buf - start + k),
                reinterpret_cast<char *>(utf8_output));
          }
          *utf8_output++ = char((word >> 18) | 0b11110000);
          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while

  return std::make_pair(result(error_code::SUCCESS, buf - start),
                        reinterpret_cast<char *>(utf8_output));
}
/* end file src/arm64/arm_convert_utf32_to_utf8.cpp */
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_UTF8

} // unnamed namespace
} // namespace arm64
} // namespace simdutf

/* begin file src/generic/buf_block_reader.h */
namespace simdutf {
namespace arm64 {
namespace {

// Walks through a buffer in block-sized increments, loading the last part with
// spaces
template <size_t STEP_SIZE> struct buf_block_reader {
public:
  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
  simdutf_really_inline size_t block_index();
  simdutf_really_inline bool has_full_block() const;
  simdutf_really_inline const uint8_t *full_block() const;
  /**
   * Get the last block, padded with spaces.
   *
   * There will always be a last block, with at least 1 byte, unless len == 0
   * (in which case this function fills the buffer with spaces and returns 0).
   * In particular, if len == STEP_SIZE there will be 0 full_blocks and 1
   * remainder block with STEP_SIZE bytes and no spaces for padding.
   *
   * @return the number of effective characters in the last block.
   */
  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
  simdutf_really_inline void advance();

private:
  const uint8_t *buf;
  const size_t len;
  const size_t lenminusstep;
  size_t idx;
};

// Routines to print masks and text for debugging bitmask operations
simdutf_unused static char *format_input_text_64(const uint8_t *text) {
  static char *buf =
      reinterpret_cast<char *>(malloc(sizeof(simd8x64<uint8_t>) + 1));
  for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
    buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
  }
  buf[sizeof(simd8x64<uint8_t>)] = '\0';
  return buf;
}

// Routines to print masks and text for debugging bitmask operations
simdutf_unused static char *format_input_text(const simd8x64<uint8_t> &in) {
  static char *buf =
      reinterpret_cast<char *>(malloc(sizeof(simd8x64<uint8_t>) + 1));
  in.store(reinterpret_cast<uint8_t *>(buf));
  for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
    if (buf[i] < ' ') {
      buf[i] = '_';
    }
  }
  buf[sizeof(simd8x64<uint8_t>)] = '\0';
  return buf;
}

simdutf_unused static char *format_mask(uint64_t mask) {
  static char *buf = reinterpret_cast<char *>(malloc(64 + 1));
  for (size_t i = 0; i < 64; i++) {
    buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
  }
  buf[64] = '\0';
  return buf;
}

template <size_t STEP_SIZE>
simdutf_really_inline
buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len)
    : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE},
      idx{0} {}

template <size_t STEP_SIZE>
simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() {
  return idx;
}

template <size_t STEP_SIZE>
simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
  return idx < lenminusstep;
}

template <size_t STEP_SIZE>
simdutf_really_inline const uint8_t *
buf_block_reader<STEP_SIZE>::full_block() const {
  return &buf[idx];
}

template <size_t STEP_SIZE>
simdutf_really_inline size_t
buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
  if (len == idx) {
    return 0;
  } // memcpy(dst, null, 0) will trigger an error with some sanitizers
  std::memset(dst, 0x20,
              STEP_SIZE); // std::memset STEP_SIZE because it is more efficient
                          // to write out 8 or 16 bytes at once.
  std::memcpy(dst, buf + idx, len - idx);
  return len - idx;
}
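
// A note on the 0x20 padding above: the space character is valid ASCII and
// valid UTF-8, so padding the final partial block with spaces can never turn
// a valid input into an invalid one, while still letting callers process a
// full STEP_SIZE block with the same SIMD code path.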

template <size_t STEP_SIZE>
simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
  idx += STEP_SIZE;
}

} // unnamed namespace
} // namespace arm64
} // namespace simdutf
/* end file src/generic/buf_block_reader.h */
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
namespace simdutf {
namespace arm64 {
namespace {
namespace utf8_validation {

using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1.
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              CARRY | TOO_LARGE,
              // ____0101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____011_ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,

              // ____1___ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____1101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
simdutf_really_inline simd8<uint8_t>
check_multibyte_lengths(const simd8<uint8_t> input,
                        const simd8<uint8_t> prev_input,
                        const simd8<uint8_t> sc) {
  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
  simd8<uint8_t> must23 =
      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
  return must23_80 ^ sc;
}

//
// Return nonzero if there are incomplete multibyte characters at the end of the
// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end.
//
simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
  // If the previous input's last 3 bytes match this, they're too short (they
  // ended at EOF):
  // ... 1111____ 111_____ 11______
  static const uint8_t max_array[32] = {
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 0b11110000u - 1, 0b11100000u - 1,
      0b11000000u - 1};
  const simd8<uint8_t> max_value(
      &max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]);
  return input.gt_bits(max_value);
}
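
// How the thresholds above work: only the last three positions of max_array
// are below 255. In the last byte of a block, any value >= 0b11000000 (any
// multibyte lead) starts a sequence that cannot be completed; in the
// second-to-last byte, any value >= 0b11100000 (a 3- or 4-byte lead) is
// likewise too late; and in the third-to-last byte only a 4-byte lead
// (>= 0b11110000) is incomplete. gt_bits flags exactly those bytes and the
// result is carried into the next iteration as prev_incomplete.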

struct utf8_checker {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;
  // The last input we received
  simd8<uint8_t> prev_input_block;
  // Whether the last input we received was incomplete (used for ASCII fast
  // path)
  simd8<uint8_t> prev_incomplete;

  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    simd8<uint8_t> sc = check_special_cases(input, prev1);
    this->error |= check_multibyte_lengths(input, prev_input, sc);
  }

  // The only problem that can happen at EOF is that a multibyte character is
  // too short or a byte value too large in the last bytes: check_special_cases
  // only checks for bytes too large in the first of two bytes.
  simdutf_really_inline void check_eof() {
    // If the previous block had incomplete UTF-8 characters at the end, an
    // ASCII block can't possibly finish them.
    this->error |= this->prev_incomplete;
  }

  simdutf_really_inline void check_next_input(const simd8x64<uint8_t> &input) {
    if (simdutf_likely(is_ascii(input))) {
      this->error |= this->prev_incomplete;
    } else {
      // you might think that a for-loop would work, but under Visual Studio,
      // it is not good enough.
      static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                        (simd8x64<uint8_t>::NUM_CHUNKS == 4),
                    "We support either two or four chunks per 64-byte block.");
      if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
      } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
        this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
      }
      this->prev_incomplete =
          is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]);
      this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1];
    }
  }

  // do not forget to call check_eof!
  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct utf8_checker
} // namespace utf8_validation

using utf8_validation::utf8_checker;

} // unnamed namespace
} // namespace arm64
} // namespace simdutf
/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
/* begin file src/generic/utf8_validation/utf8_validator.h */
namespace simdutf {
namespace arm64 {
namespace {
namespace utf8_validation {

/**
 * Validates that the string is actual UTF-8.
 */
template <class checker>
bool generic_validate_utf8(const uint8_t *input, size_t length) {
  checker c{};
  buf_block_reader<64> reader(input, length);
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    c.check_next_input(in);
    reader.advance();
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  c.check_next_input(in);
  reader.advance();
  c.check_eof();
  return !c.errors();
}

bool generic_validate_utf8(const char *input, size_t length) {
  return generic_validate_utf8<utf8_checker>(
      reinterpret_cast<const uint8_t *>(input), length);
}
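
// The _with_errors variant below cannot recover a byte position from the SIMD
// state alone: when a chunk fails, it steps back slightly (an error in one
// chunk may only be detected while processing the next) and re-validates with
// a scalar routine to pin down the exact byte offset of the error.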
|
|
|
|
/**
|
|
* Validates that the string is actual UTF-8 and stops on errors.
|
|
*/
|
|
template <class checker>
|
|
result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) {
|
|
checker c{};
|
|
buf_block_reader<64> reader(input, length);
|
|
size_t count{0};
|
|
while (reader.has_full_block()) {
|
|
simd::simd8x64<uint8_t> in(reader.full_block());
|
|
c.check_next_input(in);
|
|
if (c.errors()) {
|
|
if (count != 0) {
|
|
count--;
|
|
} // Sometimes the error is only detected in the next chunk
|
|
result res = scalar::utf8::rewind_and_validate_with_errors(
|
|
reinterpret_cast<const char *>(input),
|
|
reinterpret_cast<const char *>(input + count), length - count);
|
|
res.count += count;
|
|
return res;
|
|
}
|
|
reader.advance();
|
|
count += 64;
|
|
}
|
|
uint8_t block[64]{};
|
|
reader.get_remainder(block);
|
|
simd::simd8x64<uint8_t> in(block);
|
|
c.check_next_input(in);
|
|
reader.advance();
|
|
c.check_eof();
|
|
if (c.errors()) {
|
|
if (count != 0) {
|
|
count--;
|
|
} // Sometimes the error is only detected in the next chunk
|
|
result res = scalar::utf8::rewind_and_validate_with_errors(
|
|
reinterpret_cast<const char *>(input),
|
|
reinterpret_cast<const char *>(input) + count, length - count);
|
|
res.count += count;
|
|
return res;
|
|
} else {
|
|
return result(error_code::SUCCESS, length);
|
|
}
|
|
}
|
|
|
|
result generic_validate_utf8_with_errors(const char *input, size_t length) {
|
|
return generic_validate_utf8_with_errors<utf8_checker>(
|
|
reinterpret_cast<const uint8_t *>(input), length);
|
|
}
|
|
|
|
} // namespace utf8_validation
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8_validation/utf8_validator.h */
|
|
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
|
|
|
|
#if SIMDUTF_FEATURE_ASCII
|
|
/* begin file src/generic/ascii_validation.h */
|
|
namespace simdutf {
|
|
namespace arm64 {
|
|
namespace {
|
|
namespace ascii_validation {
|
|
|
|
bool generic_validate_ascii(const char *input, size_t length) {
|
|
buf_block_reader<64> reader(reinterpret_cast<const uint8_t *>(input), length);
|
|
uint8_t blocks[64]{};
|
|
simd::simd8x64<uint8_t> running_or(blocks);
|
|
while (reader.has_full_block()) {
|
|
simd::simd8x64<uint8_t> in(reader.full_block());
|
|
running_or |= in;
|
|
reader.advance();
|
|
}
|
|
uint8_t block[64]{};
|
|
reader.get_remainder(block);
|
|
simd::simd8x64<uint8_t> in(block);
|
|
running_or |= in;
|
|
return running_or.is_ascii();
|
|
}
|
|
|
|
result generic_validate_ascii_with_errors(const char *input, size_t length) {
|
|
buf_block_reader<64> reader(reinterpret_cast<const uint8_t *>(input), length);
|
|
size_t count{0};
|
|
while (reader.has_full_block()) {
|
|
simd::simd8x64<uint8_t> in(reader.full_block());
|
|
if (!in.is_ascii()) {
|
|
result res = scalar::ascii::validate_with_errors(
|
|
reinterpret_cast<const char *>(input + count), length - count);
|
|
return result(res.error, count + res.count);
|
|
}
|
|
reader.advance();
|
|
|
|
count += 64;
|
|
}
|
|
uint8_t block[64]{};
|
|
reader.get_remainder(block);
|
|
simd::simd8x64<uint8_t> in(block);
|
|
if (!in.is_ascii()) {
|
|
result res = scalar::ascii::validate_with_errors(
|
|
reinterpret_cast<const char *>(input + count), length - count);
|
|
return result(res.error, count + res.count);
|
|
} else {
|
|
return result(error_code::SUCCESS, length);
|
|
}
|
|
}

} // namespace ascii_validation
} // unnamed namespace
} // namespace arm64
} // namespace simdutf
/* end file src/generic/ascii_validation.h */
#endif // SIMDUTF_FEATURE_ASCII
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
// transcoding from UTF-8 to UTF-16
/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
namespace simdutf {
namespace arm64 {
namespace {
namespace utf8_to_utf16 {
using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1.
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              CARRY | TOO_LARGE,
              // ____0101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____011_ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,

              // ____1___ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____1101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
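  // A given error bit survives the AND of the three lookups only when the
  // high nibble of the previous byte, the low nibble of the previous byte,
  // and the high nibble of the current byte are all consistent with that
  // error, so any nonzero lane flags an invalid two-byte window.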
  return (byte_1_high & byte_1_low & byte_2_high);
}
simdutf_really_inline simd8<uint8_t>
check_multibyte_lengths(const simd8<uint8_t> input,
                        const simd8<uint8_t> prev_input,
                        const simd8<uint8_t> sc) {
  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
  simd8<uint8_t> must23 =
      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
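  // must23_80 has the high bit set wherever a 3-byte or 4-byte lead two or
  // three positions back requires a continuation byte; sc carries the same
  // bit (TWO_CONTS) wherever a continuation byte actually follows another
  // continuation. The XOR below leaves a bit set only where the two
  // disagree, i.e., where a continuation byte is missing or unexpected.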
  return must23_80 ^ sc;
}

struct validating_transcoder {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;

  validating_transcoder() : error(uint8_t(0)) {}
  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    simd8<uint8_t> sc = check_special_cases(input, prev1);
    this->error |= check_multibyte_lengths(input, prev_input, sc);
  }

  template <endianness endian>
  simdutf_really_inline size_t convert(const char *in, size_t size,
                                       char16_t *utf16_output) {
    size_t pos = 0;
    char16_t *start{utf16_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
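    // In two's complement, UTF-8 continuation bytes (0x80 through 0xBF) read
    // as the signed values -128 through -65, so a byte strictly greater than
    // -65 as an int8_t is either ASCII or a lead byte.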
    // If the input is long enough, then margin - 1 is the position of the
    // last leading byte we counted (the eighth-last leading byte).
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf16<endian>(utf16_output);
        utf16_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
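        // input.lt(-65 + 1) sets one mask bit per continuation byte. Since
        // pos always sits on a code-point boundary at this point, a set bit 0
        // means the block opens with a stray continuation byte, which is
        // invalid.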
        if (utf8_continuation_mask & 1) {
          return 0; // error
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
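        // Example: for the two-byte sequence 0xC3 0xA9 ('é'), the leading
        // mask has a bit at the 0xC3 position; shifting right by one sets a
        // bit at the 0xA9 position instead, marking the byte that *ends* the
        // code point. In general, bit i is set exactly when byte i + 1 starts
        // a new code point.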
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 and 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf16<endian>(
              in + pos, utf8_end_of_code_point_mask, utf16_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      return 0;
    }
    if (pos < size) {
      size_t howmany = scalar::utf8_to_utf16::convert<endian>(
          in + pos, size - pos, utf16_output);
      if (howmany == 0) {
        return 0;
      }
      utf16_output += howmany;
    }
    return utf16_output - start;
  }

  template <endianness endian>
  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
                                                   char16_t *utf16_output) {
    size_t pos = 0;
    char16_t *start{utf16_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then margin - 1 is the position of the
    // last leading byte we counted (the eighth-last leading byte).
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf16<endian>(utf16_output);
        utf16_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (errors() || (utf8_continuation_mask & 1)) {
          // rewind_and_convert_with_errors will seek a potential error from
          // in+pos onward, with the ability to go back up to pos bytes, and
          // read size-pos bytes forward.
          result res =
              scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
                  pos, in + pos, size - pos, utf16_output);
          res.count += pos;
          return res;
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 and 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf16<endian>(
              in + pos, utf8_end_of_code_point_mask, utf16_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      // rewind_and_convert_with_errors will seek a potential error from in+pos
      // onward, with the ability to go back up to pos bytes, and read size-pos
      // bytes forward.
      result res =
          scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
              pos, in + pos, size - pos, utf16_output);
      res.count += pos;
      return res;
    }
    if (pos < size) {
      // rewind_and_convert_with_errors will seek a potential error from in+pos
      // onward, with the ability to go back up to pos bytes, and read size-pos
      // bytes forward.
      result res =
          scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
              pos, in + pos, size - pos, utf16_output);
      if (res.error) { // In case of error, we want the error position.
        res.count += pos;
        return res;
      } else { // In case of success, we want the number of words written.
        utf16_output += res.count;
      }
    }
    return result(error_code::SUCCESS, utf16_output - start);
  }

  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct validating_transcoder
} // namespace utf8_to_utf16
} // unnamed namespace
} // namespace arm64
} // namespace simdutf
/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
namespace simdutf {
namespace arm64 {
namespace {
namespace utf8_to_utf16 {

using namespace simd;

template <endianness endian>
simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
                                         char16_t *utf16_output) noexcept {
  // The implementation is not specific to haswell and should be moved to the
  // generic directory.
  size_t pos = 0;
  char16_t *start{utf16_output};
  const size_t safety_margin = 16; // to avoid overruns!
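  // Because the input is assumed to be valid UTF-8 here, a fixed 16-byte
  // margin suffices: the masked conversion below never reads more than 16
  // bytes ahead, so ending the vector loop 16 bytes early avoids overruns.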
  while (pos + 64 + safety_margin <= size) {
    // this loop could be unrolled further. For example, we could process far
    // more than 64 bytes at a time with the mask.
    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
    if (in.is_ascii()) {
      in.store_ascii_as_utf16<endian>(utf16_output);
      utf16_output += 64;
      pos += 64;
    } else {
      // Slow path. We hope that the compiler will recognize that this is a
      // slow path. Anything that is not a continuation byte is a 'leading
      // byte', that is, the start of a new code point.
      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
      // -65 is 0b10111111 in two's complement, so it is the largest possible
      // continuation byte.
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      // The *start* of code points is not so useful; rather, we want the *end*
      // of code points.
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
      // We process in blocks of up to 12 bytes except possibly
      // for fast paths which may process up to 16 bytes. For the
      // slow path to work, we should have at least 12 input bytes left.
      size_t max_starting_point = (pos + 64) - 12;
      // Next loop is going to run at least five times when using solely
      // the slow/regular path, and at least four times if there are fast paths.
      while (pos < max_starting_point) {
        // Performance note: our ability to compute 'consumed' and
        // then shift and recompute is critical. If there is a
        // latency of, say, 4 cycles on getting 'consumed', then
        // the inner loop might have a total latency of about 6 cycles.
        // Yet we process between 6 and 12 input bytes, thus we get
        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
        // for this section of the code. Hence, there is a limit
        // to how much we can further increase this latency before
        // it seriously harms performance.
        //
        // Thus we may allow convert_masked_utf8_to_utf16 to process
        // more bytes at a time under a fast-path mode where 16 bytes
        // are consumed at once (e.g., when encountering ASCII).
        size_t consumed = convert_masked_utf8_to_utf16<endian>(
            input + pos, utf8_end_of_code_point_mask, utf16_output);
        pos += consumed;
        utf8_end_of_code_point_mask >>= consumed;
      }
      // At this point there may remain between 0 and 12 bytes in the
      // 64-byte block. These bytes will be processed again. So we have an
      // 80% efficiency (in the worst case). In practice we expect an
      // 85% to 90% efficiency.
    }
  }
  utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(
      input + pos, size - pos, utf16_output);
  return utf16_output - start;
}

} // namespace utf8_to_utf16
} // unnamed namespace
} // namespace arm64
} // namespace simdutf
/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
// transcoding from UTF-8 to UTF-32
/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
namespace simdutf {
namespace arm64 {
namespace {
namespace utf8_to_utf32 {
using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1.
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              CARRY | TOO_LARGE,
              // ____0101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____011_ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,

              // ____1___ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____1101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
simdutf_really_inline simd8<uint8_t>
check_multibyte_lengths(const simd8<uint8_t> input,
                        const simd8<uint8_t> prev_input,
                        const simd8<uint8_t> sc) {
  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
  simd8<uint8_t> must23 =
      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
  return must23_80 ^ sc;
}

struct validating_transcoder {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;

  validating_transcoder() : error(uint8_t(0)) {}
  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    simd8<uint8_t> sc = check_special_cases(input, prev1);
    this->error |= check_multibyte_lengths(input, prev_input, sc);
  }

  simdutf_really_inline size_t convert(const char *in, size_t size,
                                       char32_t *utf32_output) {
    size_t pos = 0;
    char32_t *start{utf32_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then margin - 1 is the position of the
    // last leading byte we counted (the eighth-last leading byte).
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf32(utf32_output);
        utf32_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (utf8_continuation_mask & 1) {
          return 0; // we have an error
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 and 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf32(
              in + pos, utf8_end_of_code_point_mask, utf32_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      return 0;
    }
    if (pos < size) {
      size_t howmany =
          scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
      if (howmany == 0) {
        return 0;
      }
      utf32_output += howmany;
    }
    return utf32_output - start;
  }

  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
                                                   char32_t *utf32_output) {
    size_t pos = 0;
    char32_t *start{utf32_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then margin - 1 is the position of the
    // last leading byte we counted (the eighth-last leading byte).
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf32(utf32_output);
        utf32_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (errors() || (utf8_continuation_mask & 1)) {
          result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
              pos, in + pos, size - pos, utf32_output);
          res.count += pos;
          return res;
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 and 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf32(
              in + pos, utf8_end_of_code_point_mask, utf32_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, utf32_output);
      res.count += pos;
      return res;
    }
    if (pos < size) {
      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, utf32_output);
      if (res.error) { // In case of error, we want the error position.
        res.count += pos;
        return res;
      } else { // In case of success, we want the number of words written.
        utf32_output += res.count;
      }
    }
    return result(error_code::SUCCESS, utf32_output - start);
  }

  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct validating_transcoder
} // namespace utf8_to_utf32
} // unnamed namespace
} // namespace arm64
} // namespace simdutf
/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
namespace simdutf {
namespace arm64 {
namespace {
namespace utf8_to_utf32 {

using namespace simd;

simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
                                         char32_t *utf32_output) noexcept {
  size_t pos = 0;
  char32_t *start{utf32_output};
  const size_t safety_margin = 16; // to avoid overruns!
  while (pos + 64 + safety_margin <= size) {
    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
    if (in.is_ascii()) {
      in.store_ascii_as_utf32(utf32_output);
      utf32_output += 64;
      pos += 64;
    } else {
      // -65 is 0b10111111 in two's complement, so it is the largest possible
      // continuation byte.
      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
      size_t max_starting_point = (pos + 64) - 12;
      while (pos < max_starting_point) {
        size_t consumed = convert_masked_utf8_to_utf32(
            input + pos, utf8_end_of_code_point_mask, utf32_output);
        pos += consumed;
        utf8_end_of_code_point_mask >>= consumed;
      }
    }
  }
  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos,
                                                       utf32_output);
  return utf32_output - start;
}

} // namespace utf8_to_utf32
} // unnamed namespace
} // namespace arm64
} // namespace simdutf
/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
// other functions
#if SIMDUTF_FEATURE_UTF16
/* begin file src/generic/utf16.h */
namespace simdutf {
namespace arm64 {
namespace {
namespace utf16 {

template <endianness big_endian>
simdutf_really_inline size_t count_code_points(const char16_t *in,
                                               size_t size) {
  size_t pos = 0;
  size_t count = 0;
  for (; pos < size / 32 * 32; pos += 32) {
    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
    if (!match_system(big_endian)) {
      input.swap_bytes();
    }
    uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
    count += count_ones(not_pair) / 2;
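    // not_in_range(0xDC00, 0xDFFF) excludes low surrogates, so each code
    // point is counted exactly once: BMP words count themselves and each
    // surrogate pair is counted via its high surrogate. The popcount is
    // halved because each 16-bit lane contributes two bits to the 64-bit
    // mask.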
  }
  return count +
         scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
}

template <endianness big_endian>
simdutf_really_inline size_t utf8_length_from_utf16(const char16_t *in,
                                                    size_t size) {
  size_t pos = 0;
  size_t count = 0;
  // This algorithm could no doubt be improved!
  for (; pos < size / 32 * 32; pos += 32) {
    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
    if (!match_system(big_endian)) {
      input.swap_bytes();
    }
    uint64_t ascii_mask = input.lteq(0x7F);
    uint64_t twobyte_mask = input.lteq(0x7FF);
    uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);

    size_t ascii_count = count_ones(ascii_mask) / 2;
    size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2;
    size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2;
    size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
    count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count +
             ascii_count;
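    // Accounting: ASCII needs 1 UTF-8 byte, two-byte code points need 2,
    // other BMP code points need 3, and each surrogate word accounts for 2
    // of the 4 bytes of its pair, hence the 2 * fourbyte_count term. Every
    // popcount is halved because each 16-bit lane yields two mask bits.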
  }
  return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos,
                                                                   size - pos);
}

template <endianness big_endian>
simdutf_really_inline size_t utf32_length_from_utf16(const char16_t *in,
                                                     size_t size) {
  return count_code_points<big_endian>(in, size);
}

simdutf_really_inline void
change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) {
  size_t pos = 0;

  while (pos < size / 32 * 32) {
    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
    input.swap_bytes();
    input.store(reinterpret_cast<uint16_t *>(output));
    pos += 32;
    output += 32;
  }

  scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
}

} // namespace utf16
} // unnamed namespace
} // namespace arm64
} // namespace simdutf
/* end file src/generic/utf16.h */
#endif // SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF8
/* begin file src/generic/utf8.h */
namespace simdutf {
namespace arm64 {
namespace {
namespace utf8 {

using namespace simd;

simdutf_really_inline size_t count_code_points(const char *in, size_t size) {
  size_t pos = 0;
  size_t count = 0;
  for (; pos + 64 <= size; pos += 64) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    uint64_t utf8_continuation_mask = input.gt(-65);
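    // Despite its name, this mask marks bytes that are *not* continuation
    // bytes (input.gt(-65) selects ASCII and lead bytes), so its population
    // count equals the number of code points in the block.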
    count += count_ones(utf8_continuation_mask);
  }
  return count + scalar::utf8::count_code_points(in + pos, size - pos);
}

#ifdef SIMDUTF_SIMD_HAS_BYTEMASK
simdutf_really_inline size_t count_code_points_bytemask(const char *in,
                                                        size_t size) {
  using vector_i8 = simd8<int8_t>;
  using vector_u8 = simd8<uint8_t>;
  using vector_u64 = simd64<uint64_t>;

  constexpr size_t N = vector_i8::SIZE;
  constexpr size_t max_iterations = 255 / 4;
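  // Each of the four comparison masks in the loop below adds at most 1 to
  // every uint8_t lane per iteration, i.e., at most 4 per iteration, so the
  // 8-bit lane counters must be flushed into the 64-bit counters at least
  // every 255 / 4 = 63 iterations to avoid overflow.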

  size_t pos = 0;
  size_t count = 0;

  auto counters = vector_u64::zero();
  auto local = vector_u8::zero();
  size_t iterations = 0;
  for (; pos + 4 * N <= size; pos += 4 * N) {
    const auto input0 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 0 * N));
    const auto input1 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 1 * N));
    const auto input2 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 2 * N));
    const auto input3 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 3 * N));
    const auto mask0 = input0 > int8_t(-65);
    const auto mask1 = input1 > int8_t(-65);
    const auto mask2 = input2 > int8_t(-65);
    const auto mask3 = input3 > int8_t(-65);

    local -= vector_u8(mask0);
    local -= vector_u8(mask1);
    local -= vector_u8(mask2);
    local -= vector_u8(mask3);

    iterations += 1;
    if (iterations == max_iterations) {
      counters += sum_8bytes(local);
      local = vector_u8::zero();
      iterations = 0;
    }
  }

  if (iterations > 0) {
    count += local.sum_bytes();
  }

  count += counters.sum();

  return count + scalar::utf8::count_code_points(in + pos, size - pos);
}
#endif // SIMDUTF_SIMD_HAS_BYTEMASK

simdutf_really_inline size_t utf16_length_from_utf8(const char *in,
                                                    size_t size) {
  size_t pos = 0;
  size_t count = 0;
  // This algorithm could no doubt be improved!
  for (; pos + 64 <= size; pos += 64) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    uint64_t utf8_continuation_mask = input.lt(-65 + 1);
    // We count one word for anything that is not a continuation (so
    // leading bytes).
    count += 64 - count_ones(utf8_continuation_mask);
    int64_t utf8_4byte = input.gteq_unsigned(240);
    count += count_ones(utf8_4byte);
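    // Lead bytes of value 240 (0xF0) or more start four-byte sequences,
    // which encode supplementary code points and thus require a surrogate
    // pair -- two UTF-16 words instead of one -- so we add one extra word
    // per such byte.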
  }
  return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
}

} // namespace utf8
} // unnamed namespace
} // namespace arm64
} // namespace simdutf
/* end file src/generic/utf8.h */
#endif // SIMDUTF_FEATURE_UTF8
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
// transcoding from UTF-8 to Latin 1
/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */
namespace simdutf {
namespace arm64 {
namespace {
namespace utf8_to_latin1 {
using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // For UTF-8 to Latin 1, we can allow any ASCII character, and any
  // continuation byte, but the non-ASCII leading bytes must be 0b11000011 or
  // 0b11000010 and nothing else.
  //
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
  constexpr const uint8_t FORBIDDEN = 0xff;

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      FORBIDDEN,
      // 1110____ ________ <three byte lead in byte 1>
      FORBIDDEN,
      // 1111____ ________ <four+ byte lead in byte 1>
      FORBIDDEN);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1.
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              FORBIDDEN,
              // ____0101 ________
              FORBIDDEN,
              // ____011_ ________
              FORBIDDEN, FORBIDDEN,

              // ____1___ ________
              FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN,
              // ____1101 ________
              FORBIDDEN, FORBIDDEN, FORBIDDEN);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
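  // Beyond ASCII, only code points U+0080 through U+00FF are representable
  // in Latin 1, i.e., only the two-byte leads 0xC2 and 0xC3. The tables
  // above therefore map every other lead (0xC4-0xCF via the low-nibble
  // table, 0xD0-0xFF via the high-nibble table) to FORBIDDEN.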
  return (byte_1_high & byte_1_low & byte_2_high);
}

struct validating_transcoder {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;

  validating_transcoder() : error(uint8_t(0)) {}
  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    this->error |= check_special_cases(input, prev1);
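    // Unlike the UTF-16/UTF-32 transcoders, no check_multibyte_lengths()
    // call is needed here: every 3-byte and 4-byte lead is already flagged
    // FORBIDDEN by check_special_cases, and stray continuation bytes are
    // caught by the TOO_LONG/TWO_CONTS bits, so continuation-length checks
    // would be redundant.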
  }

  simdutf_really_inline size_t convert(const char *in, size_t size,
                                       char *latin1_output) {
    size_t pos = 0;
    char *start{latin1_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 16 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 16; margin--) {
      leading_byte += (int8_t(in[margin - 1]) >
                       -65); // two's complement of -65 is 1011 1111 ...
    }
    // If the input is long enough, then margin - 1 is the position of the
    // last leading byte we counted (the sixteenth-last leading byte).
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store((int8_t *)latin1_output);
        latin1_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask =
            input.lt(-65 + 1); // -64 is 1100 0000 in two's complement. Note:
                               // in this case, we also have ASCII to account
                               // for.
        if (utf8_continuation_mask & 1) {
          return 0; // error
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 and 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_latin1(
              in + pos, utf8_end_of_code_point_mask, latin1_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      return 0;
    }
    if (pos < size) {
      size_t howmany =
          scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output);
      if (howmany == 0) {
        return 0;
      }
      latin1_output += howmany;
    }
    return latin1_output - start;
  }

  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
                                                   char *latin1_output) {
    size_t pos = 0;
    char *start{latin1_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then margin - 1 is the position of the
    // last leading byte we counted (the eighth-last leading byte).
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store((int8_t *)latin1_output);
        latin1_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        if (errors()) {
          // rewind_and_convert_with_errors will seek a potential error from
          // in+pos onward, with the ability to go back up to pos bytes, and
          // read size-pos bytes forward.
          result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
              pos, in + pos, size - pos, latin1_output);
          res.count += pos;
          return res;
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 and 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_latin1(
              in + pos, utf8_end_of_code_point_mask, latin1_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      // rewind_and_convert_with_errors will seek a potential error from in+pos
      // onward, with the ability to go back up to pos bytes, and read size-pos
      // bytes forward.
      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, latin1_output);
      res.count += pos;
      return res;
    }
    if (pos < size) {
      // rewind_and_convert_with_errors will seek a potential error from in+pos
      // onward, with the ability to go back up to pos bytes, and read size-pos
      // bytes forward.
      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, latin1_output);
      if (res.error) { // In case of error, we want the error position.
        res.count += pos;
        return res;
      } else { // In case of success, we want the number of characters written.
        latin1_output += res.count;
      }
    }
    return result(error_code::SUCCESS, latin1_output - start);
  }

  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct validating_transcoder
} // namespace utf8_to_latin1
} // unnamed namespace
} // namespace arm64
} // namespace simdutf
/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */
/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
namespace simdutf {
namespace arm64 {
namespace {
namespace utf8_to_latin1 {
using namespace simd;

simdutf_really_inline size_t convert_valid(const char *in, size_t size,
                                           char *latin1_output) {
  size_t pos = 0;
  char *start{latin1_output};
  // In the worst case, we have the haswell kernel which can cause an overflow
  // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last
  // 16 bytes, and if the data is valid, then it is entirely safe because 16
  // UTF-8 bytes generate much more than 8 bytes. However, you cannot generally
  // assume that you have valid UTF-8 input, so we are going to go back from the
  // end counting 8 leading bytes, to give us a good margin.
  size_t leading_byte = 0;
  size_t margin = size;
  for (; margin > 0 && leading_byte < 8; margin--) {
    leading_byte += (int8_t(in[margin - 1]) >
                     -65); // two's complement of -65 is 1011 1111 ...
  }
  // If the input is long enough, then margin - 1 is the position of the
  // last leading byte we counted (the eighth-last leading byte).
  const size_t safety_margin = size - margin + 1; // to avoid overruns!
  while (pos + 64 + safety_margin <= size) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    if (input.is_ascii()) {
      input.store((int8_t *)latin1_output);
      latin1_output += 64;
      pos += 64;
    } else {
      // you might think that a for-loop would work, but under Visual Studio, it
      // is not good enough.
      uint64_t utf8_continuation_mask =
          input.lt(-65 + 1); // -64 is 1100 0000 in two's complement. Note: in
                             // this case, we also have ASCII to account for.
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
      // We process in blocks of up to 12 bytes except possibly
      // for fast paths which may process up to 16 bytes. For the
      // slow path to work, we should have at least 12 input bytes left.
      size_t max_starting_point = (pos + 64) - 12;
      // Next loop is going to run at least five times.
      while (pos < max_starting_point) {
        // Performance note: our ability to compute 'consumed' and
        // then shift and recompute is critical. If there is a
        // latency of, say, 4 cycles on getting 'consumed', then
        // the inner loop might have a total latency of about 6 cycles.
        // Yet we process between 6 and 12 input bytes, thus we get
        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
        // for this section of the code. Hence, there is a limit
        // to how much we can further increase this latency before
        // it seriously harms performance.
        size_t consumed = convert_masked_utf8_to_latin1(
            in + pos, utf8_end_of_code_point_mask, latin1_output);
        pos += consumed;
        utf8_end_of_code_point_mask >>= consumed;
      }
      // At this point there may remain between 0 and 12 bytes in the
      // 64-byte block. These bytes will be processed again. So we have an
      // 80% efficiency (in the worst case). In practice we expect an
      // 85% to 90% efficiency.
    }
  }
  if (pos < size) {
    size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos,
                                                           latin1_output);
    latin1_output += howmany;
  }
  return latin1_output - start;
}

} // namespace utf8_to_latin1
} // unnamed namespace
} // namespace arm64
} // namespace simdutf
/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

//
// Implementation-specific overrides
//
namespace simdutf {
namespace arm64 {

#if SIMDUTF_FEATURE_DETECT_ENCODING
simdutf_warn_unused int
implementation::detect_encodings(const char *input,
                                 size_t length) const noexcept {
  // If there is a BOM, then we trust it.
  auto bom_encoding = simdutf::BOM::check_bom(input, length);
  if (bom_encoding != encoding_type::unspecified) {
    return bom_encoding;
  }
  // todo: reimplement as a one-pass algorithm.
  int out = 0;
  if (validate_utf8(input, length)) {
    out |= encoding_type::UTF8;
  }
  if ((length % 2) == 0) {
    if (validate_utf16le(reinterpret_cast<const char16_t *>(input),
                         length / 2)) {
      out |= encoding_type::UTF16_LE;
    }
  }
  if ((length % 4) == 0) {
    if (validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4)) {
      out |= encoding_type::UTF32_LE;
    }
  }
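  // The return value is a bitmask of encoding_type values: more than one
  // encoding can validate for the same buffer (for instance, plain ASCII of
  // even length is valid UTF-8 and also valid UTF-16LE), so the caller is
  // left to disambiguate.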
  return out;
}
#endif // SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
simdutf_warn_unused bool
implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  return arm64::utf8_validation::generic_validate_utf8(buf, len);
}
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF8
simdutf_warn_unused result implementation::validate_utf8_with_errors(
    const char *buf, size_t len) const noexcept {
  return arm64::utf8_validation::generic_validate_utf8_with_errors(buf, len);
}
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_ASCII
simdutf_warn_unused bool
implementation::validate_ascii(const char *buf, size_t len) const noexcept {
  return arm64::ascii_validation::generic_validate_ascii(buf, len);
}

simdutf_warn_unused result implementation::validate_ascii_with_errors(
    const char *buf, size_t len) const noexcept {
  return arm64::ascii_validation::generic_validate_ascii_with_errors(buf, len);
}
#endif // SIMDUTF_FEATURE_ASCII
|
|
|
|
#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
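// The vectorized validators below return a pointer just past the last fully
// validated code unit (the scalar routine then finishes the tail), or nullptr
// when an error has already been detected.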
simdutf_warn_unused bool
implementation::validate_utf16le(const char16_t *buf,
                                 size_t len) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    // Empty input is valid. This also protects the implementation from
    // nullptr.
    return true;
  }
  const char16_t *tail = arm_validate_utf16<endianness::LITTLE>(buf, len);
  if (tail) {
    return scalar::utf16::validate<endianness::LITTLE>(tail,
                                                       len - (tail - buf));
  } else {
    return false;
  }
}
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF16
simdutf_warn_unused bool
implementation::validate_utf16be(const char16_t *buf,
                                 size_t len) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    // Empty input is valid. This also protects the implementation from
    // nullptr.
    return true;
  }
  const char16_t *tail = arm_validate_utf16<endianness::BIG>(buf, len);
  if (tail) {
    return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
  } else {
    return false;
  }
}
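// In the *_with_errors variants, res.count is the number of code units the
// vectorized kernel fully processed; when it stops early, the scalar
// validator resumes at that offset and the two counts are added.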

simdutf_warn_unused result implementation::validate_utf16le_with_errors(
    const char16_t *buf, size_t len) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    return result(error_code::SUCCESS, 0);
  }
  result res = arm_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
  if (res.count != len) {
    result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(
        buf + res.count, len - res.count);
    return result(scalar_res.error, res.count + scalar_res.count);
  } else {
    return res;
  }
}

simdutf_warn_unused result implementation::validate_utf16be_with_errors(
    const char16_t *buf, size_t len) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    return result(error_code::SUCCESS, 0);
  }
  result res = arm_validate_utf16_with_errors<endianness::BIG>(buf, len);
  if (res.count != len) {
    result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(
        buf + res.count, len - res.count);
    return result(scalar_res.error, res.count + scalar_res.count);
  } else {
    return res;
  }
}
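
// to_well_formed_utf16{le,be} rewrite the input so it is valid UTF-16; per
// the simdutf API, mismatched (unpaired) surrogates are replaced with the
// U+FFFD replacement character, here via a NEON kernel.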
void implementation::to_well_formed_utf16le(const char16_t *input, size_t len,
                                            char16_t *output) const noexcept {
  return utf16fix_neon_64bits<endianness::LITTLE>(input, len, output);
}

void implementation::to_well_formed_utf16be(const char16_t *input, size_t len,
                                            char16_t *output) const noexcept {
  return utf16fix_neon_64bits<endianness::BIG>(input, len, output);
}
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
simdutf_warn_unused bool
implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    // Empty input is valid. This also protects the implementation from
    // nullptr.
    return true;
  }
  const char32_t *tail = arm_validate_utf32le(buf, len);
  if (tail) {
    return scalar::utf32::validate(tail, len - (tail - buf));
  } else {
    return false;
  }
}
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF32
simdutf_warn_unused result implementation::validate_utf32_with_errors(
    const char32_t *buf, size_t len) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    return result(error_code::SUCCESS, 0);
  }
  result res = arm_validate_utf32le_with_errors(buf, len);
  if (res.count != len) {
    result scalar_res =
        scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
    return result(scalar_res.error, res.count + scalar_res.count);
  } else {
    return res;
  }
}
#endif // SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
    const char *buf, size_t len, char *utf8_output) const noexcept {
  std::pair<const char *, char *> ret =
      arm_convert_latin1_to_utf8(buf, len, utf8_output);
  size_t converted_chars = ret.second - utf8_output;

  if (ret.first != buf + len) {
    const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert(
        ret.first, len - (ret.first - buf), ret.second);
    converted_chars += scalar_converted_chars;
  }
  return converted_chars;
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
  std::pair<const char *, char16_t *> ret =
      arm_convert_latin1_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
  size_t converted_chars = ret.second - utf16_output;
  if (ret.first != buf + len) {
    const size_t scalar_converted_chars =
        scalar::latin1_to_utf16::convert<endianness::LITTLE>(
            ret.first, len - (ret.first - buf), ret.second);
    converted_chars += scalar_converted_chars;
  }
  return converted_chars;
}

simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
  std::pair<const char *, char16_t *> ret =
      arm_convert_latin1_to_utf16<endianness::BIG>(buf, len, utf16_output);
  size_t converted_chars = ret.second - utf16_output;
  if (ret.first != buf + len) {
    const size_t scalar_converted_chars =
        scalar::latin1_to_utf16::convert<endianness::BIG>(
            ret.first, len - (ret.first - buf), ret.second);
    converted_chars += scalar_converted_chars;
  }
  return converted_chars;
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  std::pair<const char *, char32_t *> ret =
      arm_convert_latin1_to_utf32(buf, len, utf32_output);
  size_t converted_chars = ret.second - utf32_output;
  if (ret.first != buf + len) {
    const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert(
        ret.first, len - (ret.first - buf), ret.second);
    converted_chars += scalar_converted_chars;
  }
  return converted_chars;
}
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  utf8_to_latin1::validating_transcoder converter;
  return converter.convert(buf, len, latin1_output);
}

simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  utf8_to_latin1::validating_transcoder converter;
  return converter.convert_with_errors(buf, len, latin1_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  return arm64::utf8_to_latin1::convert_valid(buf, len, latin1_output);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
  utf8_to_utf16::validating_transcoder converter;
  return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
  utf8_to_utf16::validating_transcoder converter;
  return converter.convert<endianness::BIG>(buf, len, utf16_output);
}

simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
  utf8_to_utf16::validating_transcoder converter;
  return converter.convert_with_errors<endianness::LITTLE>(buf, len,
                                                           utf16_output);
}

simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
  utf8_to_utf16::validating_transcoder converter;
  return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
    const char *input, size_t size, char16_t *utf16_output) const noexcept {
  return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size,
                                                          utf16_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
    const char *input, size_t size, char16_t *utf16_output) const noexcept {
  return utf8_to_utf16::convert_valid<endianness::BIG>(input, size,
                                                       utf16_output);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  utf8_to_utf32::validating_transcoder converter;
  return converter.convert(buf, len, utf32_output);
}

simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  utf8_to_utf32::validating_transcoder converter;
  return converter.convert_with_errors(buf, len, utf32_output);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
    const char *input, size_t size, char32_t *utf32_output) const noexcept {
  return utf8_to_utf32::convert_valid(input, size, utf32_output);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
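// In the conversion routines below, a nullptr in ret.first signals that the
// vectorized kernel detected invalid input, in which case 0 is returned.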
simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<const char16_t *, char *> ret =
      arm_convert_utf16_to_latin1<endianness::LITTLE>(buf, len, latin1_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - latin1_output;

  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf16_to_latin1::convert<endianness::LITTLE>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<const char16_t *, char *> ret =
      arm_convert_utf16_to_latin1<endianness::BIG>(buf, len, latin1_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - latin1_output;

  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf16_to_latin1::convert<endianness::BIG>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result
implementation::convert_utf16le_to_latin1_with_errors(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<result, char *> ret =
      arm_convert_utf16_to_latin1_with_errors<endianness::LITTLE>(
          buf, len, latin1_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly since scalar fallback already found correct
    // ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res =
        scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      latin1_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused result
implementation::convert_utf16be_to_latin1_with_errors(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<result, char *> ret =
      arm_convert_utf16_to_latin1_with_errors<endianness::BIG>(buf, len,
                                                               latin1_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly since scalar fallback already found correct
    // ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res =
        scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      latin1_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
  // optimization opportunity: implement a custom function.
  return convert_utf16be_to_latin1(buf, len, latin1_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
  // optimization opportunity: implement a custom function.
  return convert_utf16le_to_latin1(buf, len, latin1_output);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  std::pair<const char16_t *, char *> ret =
      arm_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf8_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf16_to_utf8::convert<endianness::LITTLE>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  std::pair<const char16_t *, char *> ret =
      arm_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf8_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf16_to_utf8::convert<endianness::BIG>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char *> ret =
      arm_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len,
                                                                utf8_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly since scalar fallback already found correct
    // ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res =
        scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf8_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char *> ret =
      arm_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len,
                                                             utf8_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly since scalar fallback already found correct
    // ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res =
        scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf8_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  return convert_utf16le_to_utf8(buf, len, utf8_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  return convert_utf16be_to_utf8(buf, len, utf8_output);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    return 0;
  }
  std::pair<const char32_t *, char *> ret =
      arm_convert_utf32_to_utf8(buf, len, utf8_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf8_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
        ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    return result(error_code::SUCCESS, 0);
  }
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char *> ret =
      arm_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
  if (ret.first.count != len) {
    result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
        buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf8_output; // Set count to the number of 8-bit code units written
  return ret.first;
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  std::pair<const char16_t *, char32_t *> ret =
      arm_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf32_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf16_to_utf32::convert<endianness::LITTLE>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  std::pair<const char16_t *, char32_t *> ret =
      arm_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf32_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf16_to_utf32::convert<endianness::BIG>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char32_t *> ret =
      arm_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len,
                                                                 utf32_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly since scalar fallback already found correct
    // ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res =
        scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf32_output; // Set count to the number of 32-bit code units written
  return ret.first;
}

simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char32_t *> ret =
      arm_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len,
                                                              utf32_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly since scalar fallback already found correct
    // ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res =
        scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf32_output; // Set count to the number of 32-bit code units written
  return ret.first;
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<const char32_t *, char *> ret =
      arm_convert_utf32_to_latin1(buf, len, latin1_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - latin1_output;

  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert(
        ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<result, char *> ret =
      arm_convert_utf32_to_latin1_with_errors(buf, len, latin1_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly since scalar fallback already found correct
    // ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res = scalar::utf32_to_latin1::convert_with_errors(
        buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      latin1_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<const char32_t *, char *> ret =
      arm_convert_utf32_to_latin1(buf, len, latin1_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - latin1_output;

  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert_valid(
        ret.first, len - (ret.first - buf), ret.second);
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  // optimization opportunity: implement a custom function.
  return convert_utf32_to_utf8(buf, len, utf8_output);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  std::pair<const char32_t *, char16_t *> ret =
      arm_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf16_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf32_to_utf16::convert<endianness::LITTLE>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  std::pair<const char32_t *, char16_t *> ret =
      arm_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf16_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf32_to_utf16::convert<endianness::BIG>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char16_t *> ret =
      arm_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len,
                                                                 utf16_output);
  if (ret.first.count != len) {
    result scalar_res =
        scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf16_output; // Set count to the number of 16-bit code units written
  return ret.first;
}

simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char16_t *> ret =
      arm_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len,
                                                              utf16_output);
  if (ret.first.count != len) {
    result scalar_res =
        scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf16_output; // Set count to the number of 16-bit code units written
  return ret.first;
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  return convert_utf32_to_utf16le(buf, len, utf16_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  return convert_utf32_to_utf16be(buf, len, utf16_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  return convert_utf16le_to_utf32(buf, len, utf32_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  return convert_utf16be_to_utf32(buf, len, utf32_output);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16
void implementation::change_endianness_utf16(const char16_t *input,
                                             size_t length,
                                             char16_t *output) const noexcept {
  utf16::change_endianness_utf16(input, length, output);
}

simdutf_warn_unused size_t implementation::count_utf16le(
    const char16_t *input, size_t length) const noexcept {
  return utf16::count_code_points<endianness::LITTLE>(input, length);
}

simdutf_warn_unused size_t implementation::count_utf16be(
    const char16_t *input, size_t length) const noexcept {
  return utf16::count_code_points<endianness::BIG>(input, length);
}
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8
simdutf_warn_unused size_t
implementation::count_utf8(const char *input, size_t length) const noexcept {
  return utf8::count_code_points(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
    const char *buf, size_t len) const noexcept {
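  // Each Latin-1-representable code point becomes exactly one byte, so the
  // Latin-1 length equals the number of code points in the UTF-8 input.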
  return count_utf8(buf, len);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
    const char *input, size_t length) const noexcept {
  // See
  // https://lemire.me/blog/2023/05/15/computing-the-utf-8-size-of-a-latin-1-string-quickly-arm-neon-edition/
  // credit to Pete Cawley
  const uint8_t *data = reinterpret_cast<const uint8_t *>(input);
  uint64_t result = 0;
  const int lanes = sizeof(uint8x16_t);
  uint8_t rem = length % lanes;
  const uint8_t *simd_end = data + (length / lanes) * lanes;
  const uint8x16_t threshold = vdupq_n_u8(0x80);
  for (; data < simd_end; data += lanes) {
    // load 16 bytes
    uint8x16_t input_vec = vld1q_u8(data);
    // compare to threshold (0x80)
    uint8x16_t withhighbit = vcgeq_u8(input_vec, threshold);
    // vertical addition
    result -= vaddvq_s8(vreinterpretq_s8_u8(withhighbit));
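    // Each lane of 'withhighbit' is 0xFF (i.e. -1 as int8) for bytes >= 0x80
    // and 0 otherwise, so subtracting the signed horizontal sum adds one per
    // non-ASCII byte, each of which needs two UTF-8 bytes instead of one.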
  }
  return result + (length / lanes) * lanes +
         scalar::latin1::utf8_length_from_latin1((const char *)simd_end, rem);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
    const char16_t *input, size_t length) const noexcept {
  return arm64_utf8_length_from_utf16_bytemask<endianness::LITTLE>(input,
                                                                   length);
}

simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
    const char16_t *input, size_t length) const noexcept {
  return arm64_utf8_length_from_utf16_bytemask<endianness::BIG>(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
    const char16_t *input, size_t length) const noexcept {
  return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
}

simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
    const char16_t *input, size_t length) const noexcept {
  return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
    const char *input, size_t length) const noexcept {
  return utf8::utf16_length_from_utf8(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
    const char32_t *input, size_t length) const noexcept {
  const uint32x4_t v_7f = vmovq_n_u32((uint32_t)0x7f);
  const uint32x4_t v_7ff = vmovq_n_u32((uint32_t)0x7ff);
  const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff);
  const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1);
  size_t pos = 0;
  size_t count = 0;
  for (; pos + 4 <= length; pos += 4) {
    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input + pos));
    const uint32x4_t ascii_bytes_bytemask = vcleq_u32(in, v_7f);
    const uint32x4_t one_two_bytes_bytemask = vcleq_u32(in, v_7ff);
    const uint32x4_t two_bytes_bytemask =
        veorq_u32(one_two_bytes_bytemask, ascii_bytes_bytemask);
    const uint32x4_t three_bytes_bytemask =
        veorq_u32(vcleq_u32(in, v_ffff), one_two_bytes_bytemask);

    const uint16x8_t reduced_ascii_bytes_bytemask =
        vreinterpretq_u16_u32(vandq_u32(ascii_bytes_bytemask, v_1));
    const uint16x8_t reduced_two_bytes_bytemask =
        vreinterpretq_u16_u32(vandq_u32(two_bytes_bytemask, v_1));
    const uint16x8_t reduced_three_bytes_bytemask =
        vreinterpretq_u16_u32(vandq_u32(three_bytes_bytemask, v_1));

    const uint16x8_t compressed_bytemask0 =
        vpaddq_u16(reduced_ascii_bytes_bytemask, reduced_two_bytes_bytemask);
    const uint16x8_t compressed_bytemask1 =
        vpaddq_u16(reduced_three_bytes_bytemask, reduced_three_bytes_bytemask);

    size_t ascii_count = count_ones(
        vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 0));
    size_t two_bytes_count = count_ones(
        vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 1));
    size_t three_bytes_count = count_ones(
        vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask1), 0));

    count += 16 - 3 * ascii_count - 2 * two_bytes_count - three_bytes_count;
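    // Start from the worst case of 4 UTF-8 bytes per code point (16 bytes for
    // the 4 lanes), then subtract 3 per ASCII code point, 2 per 2-byte code
    // point and 1 per 3-byte code point.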
  }
  return count +
         scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
    const char32_t *input, size_t length) const noexcept {
  const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff);
  const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1);
  size_t pos = 0;
  size_t count = 0;
  for (; pos + 4 <= length; pos += 4) {
    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input + pos));
    const uint32x4_t surrogate_bytemask = vcgtq_u32(in, v_ffff);
    const uint16x8_t reduced_bytemask =
        vreinterpretq_u16_u32(vandq_u32(surrogate_bytemask, v_1));
    const uint16x8_t compressed_bytemask =
        vpaddq_u16(reduced_bytemask, reduced_bytemask);
    size_t surrogate_count = count_ones(
        vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask), 0));
    count += 4 + surrogate_count;
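    // Each of the 4 code points needs one UTF-16 code unit, plus one extra
    // unit for every code point above 0xFFFF (encoded as a surrogate pair).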
  }
  return count +
         scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
    const char *input, size_t length) const noexcept {
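  // UTF-32 uses exactly one code unit per code point, so this is simply the
  // code-point count of the UTF-8 input.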
  return utf8::count_code_points(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_BASE64
simdutf_warn_unused result implementation::base64_to_binary(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
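  // Dispatch to a kernel specialized at compile time on two booleans: whether
  // the base64url alphabet is used and whether garbage characters are
  // accepted.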
  if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true>(output, input, length, options,
                                                last_chunk_options);
    } else {
      return compress_decode_base64<true, false>(output, input, length, options,
                                                 last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true>(output, input, length, options,
                                                 last_chunk_options);
    } else {
      return compress_decode_base64<false, false>(output, input, length,
                                                  options, last_chunk_options);
    }
  }
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true>(output, input, length, options,
                                                last_chunk_options);
    } else {
      return compress_decode_base64<true, false>(output, input, length, options,
                                                 last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true>(output, input, length, options,
                                                 last_chunk_options);
    } else {
      return compress_decode_base64<false, false>(output, input, length,
                                                  options, last_chunk_options);
    }
  }
}

simdutf_warn_unused result implementation::base64_to_binary(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true>(output, input, length, options,
                                                last_chunk_options);
    } else {
      return compress_decode_base64<true, false>(output, input, length, options,
                                                 last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true>(output, input, length, options,
                                                 last_chunk_options);
    } else {
      return compress_decode_base64<false, false>(output, input, length,
                                                  options, last_chunk_options);
    }
  }
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true>(output, input, length, options,
                                                last_chunk_options);
    } else {
      return compress_decode_base64<true, false>(output, input, length, options,
                                                 last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true>(output, input, length, options,
                                                 last_chunk_options);
    } else {
      return compress_decode_base64<false, false>(output, input, length,
                                                  options, last_chunk_options);
    }
  }
}

size_t implementation::binary_to_base64(const char *input, size_t length,
                                        char *output,
                                        base64_options options) const noexcept {
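  // Note: the caller is expected to provide an output buffer large enough for
  // the encoded data (see simdutf::base64_length_from_binary).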
|
|
return encode_base64(output, input, length, options);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_BASE64
|
|
|
|
} // namespace arm64
|
|
} // namespace simdutf
|
|
|
|
/* begin file src/simdutf/arm64/end.h */
|
|
#undef SIMDUTF_SIMD_HAS_BYTEMASK
|
|
/* end file src/simdutf/arm64/end.h */
|
|
/* end file src/arm64/implementation.cpp */
|
|
#endif
|
|
#if SIMDUTF_IMPLEMENTATION_FALLBACK
|
|
/* begin file src/fallback/implementation.cpp */
|
|
/* begin file src/simdutf/fallback/begin.h */
|
|
// redefining SIMDUTF_IMPLEMENTATION to "fallback"
|
|
// #define SIMDUTF_IMPLEMENTATION fallback
|
|
/* end file src/simdutf/fallback/begin.h */
|
|
|
|
namespace simdutf {
|
|
namespace fallback {
|
|
|
|
#if SIMDUTF_FEATURE_DETECT_ENCODING
|
|
simdutf_warn_unused int
|
|
implementation::detect_encodings(const char *input,
|
|
size_t length) const noexcept {
|
|
// If there is a BOM, then we trust it.
|
|
auto bom_encoding = simdutf::BOM::check_bom(input, length);
|
|
if (bom_encoding != encoding_type::unspecified) {
|
|
return bom_encoding;
|
|
}
|
|
int out = 0;
|
|
// todo: reimplement as a one-pass algorithm.
|
|
if (validate_utf8(input, length)) {
|
|
out |= encoding_type::UTF8;
|
|
}
|
|
if ((length % 2) == 0) {
|
|
if (validate_utf16le(reinterpret_cast<const char16_t *>(input),
|
|
length / 2)) {
|
|
out |= encoding_type::UTF16_LE;
|
|
}
|
|
}
|
|
if ((length % 4) == 0) {
|
|
if (validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4)) {
|
|
out |= encoding_type::UTF32_LE;
|
|
}
|
|
}
|
|
return out;
|
|
}
|
|
#endif // SIMDUTF_FEATURE_DETECT_ENCODING
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
|
|
simdutf_warn_unused bool
|
|
implementation::validate_utf8(const char *buf, size_t len) const noexcept {
|
|
return scalar::utf8::validate(buf, len);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
|
|
|
|
#if SIMDUTF_FEATURE_UTF8
|
|
simdutf_warn_unused result implementation::validate_utf8_with_errors(
|
|
const char *buf, size_t len) const noexcept {
|
|
return scalar::utf8::validate_with_errors(buf, len);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8
|
|
|
|
#if SIMDUTF_FEATURE_ASCII
|
|
simdutf_warn_unused bool
|
|
implementation::validate_ascii(const char *buf, size_t len) const noexcept {
|
|
return scalar::ascii::validate(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::validate_ascii_with_errors(
|
|
const char *buf, size_t len) const noexcept {
|
|
return scalar::ascii::validate_with_errors(buf, len);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_ASCII
|
|
|
|
#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
|
|
simdutf_warn_unused bool
|
|
implementation::validate_utf16le(const char16_t *buf,
|
|
size_t len) const noexcept {
|
|
return scalar::utf16::validate<endianness::LITTLE>(buf, len);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
|
|
|
|
#if SIMDUTF_FEATURE_UTF16
|
|
simdutf_warn_unused bool
|
|
implementation::validate_utf16be(const char16_t *buf,
|
|
size_t len) const noexcept {
|
|
return scalar::utf16::validate<endianness::BIG>(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::validate_utf16le_with_errors(
|
|
const char16_t *buf, size_t len) const noexcept {
|
|
return scalar::utf16::validate_with_errors<endianness::LITTLE>(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::validate_utf16be_with_errors(
|
|
const char16_t *buf, size_t len) const noexcept {
|
|
return scalar::utf16::validate_with_errors<endianness::BIG>(buf, len);
|
|
}
|
|
|
|
void implementation::to_well_formed_utf16le(const char16_t *input, size_t len,
|
|
char16_t *output) const noexcept {
|
|
return scalar::utf16::to_well_formed_utf16<endianness::LITTLE>(input, len,
|
|
output);
|
|
}
|
|
|
|
void implementation::to_well_formed_utf16be(const char16_t *input, size_t len,
|
|
char16_t *output) const noexcept {
|
|
return scalar::utf16::to_well_formed_utf16<endianness::BIG>(input, len,
|
|
output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF16
|
|
|
|
#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
|
|
simdutf_warn_unused bool
|
|
implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
|
|
return scalar::utf32::validate(buf, len);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
|
|
|
|
#if SIMDUTF_FEATURE_UTF32
|
|
simdutf_warn_unused result implementation::validate_utf32_with_errors(
|
|
const char32_t *buf, size_t len) const noexcept {
|
|
return scalar::utf32::validate_with_errors(buf, len);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF32
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
|
|
simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
|
|
const char *buf, size_t len, char *utf8_output) const noexcept {
|
|
return scalar::latin1_to_utf8::convert(buf, len, utf8_output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
|
|
|
|
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
|
|
simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
|
|
const char *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
return scalar::latin1_to_utf16::convert<endianness::LITTLE>(buf, len,
|
|
utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
|
|
const char *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
return scalar::latin1_to_utf16::convert<endianness::BIG>(buf, len,
|
|
utf16_output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
|
|
|
|
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
|
|
simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
|
|
const char *buf, size_t len, char32_t *utf32_output) const noexcept {
|
|
return scalar::latin1_to_utf32::convert(buf, len, utf32_output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
|
|
simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
|
|
const char *buf, size_t len, char *latin1_output) const noexcept {
|
|
return scalar::utf8_to_latin1::convert(buf, len, latin1_output);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
|
|
const char *buf, size_t len, char *latin1_output) const noexcept {
|
|
return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
|
|
const char *buf, size_t len, char *latin1_output) const noexcept {
|
|
return scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
|
|
simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
|
|
const char *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
return scalar::utf8_to_utf16::convert<endianness::LITTLE>(buf, len,
|
|
utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
|
|
const char *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
return scalar::utf8_to_utf16::convert<endianness::BIG>(buf, len,
|
|
utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
|
|
const char *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
return scalar::utf8_to_utf16::convert_with_errors<endianness::LITTLE>(
|
|
buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
|
|
const char *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
return scalar::utf8_to_utf16::convert_with_errors<endianness::BIG>(
|
|
buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
|
|
const char *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
return scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(buf, len,
|
|
utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
|
|
const char *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
return scalar::utf8_to_utf16::convert_valid<endianness::BIG>(buf, len,
|
|
utf16_output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
|
|
simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
|
|
const char *buf, size_t len, char32_t *utf32_output) const noexcept {
|
|
return scalar::utf8_to_utf32::convert(buf, len, utf32_output);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
|
|
const char *buf, size_t len, char32_t *utf32_output) const noexcept {
|
|
return scalar::utf8_to_utf32::convert_with_errors(buf, len, utf32_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
|
|
const char *input, size_t size, char32_t *utf32_output) const noexcept {
|
|
return scalar::utf8_to_utf32::convert_valid(input, size, utf32_output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
|
|
|
|
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
|
|
simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
|
|
const char16_t *buf, size_t len, char *latin1_output) const noexcept {
|
|
return scalar::utf16_to_latin1::convert<endianness::LITTLE>(buf, len,
|
|
latin1_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(
|
|
const char16_t *buf, size_t len, char *latin1_output) const noexcept {
|
|
return scalar::utf16_to_latin1::convert<endianness::BIG>(buf, len,
|
|
latin1_output);
|
|
}
|
|
|
|
simdutf_warn_unused result
|
|
implementation::convert_utf16le_to_latin1_with_errors(
|
|
const char16_t *buf, size_t len, char *latin1_output) const noexcept {
|
|
return scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(
|
|
buf, len, latin1_output);
|
|
}
|
|
|
|
simdutf_warn_unused result
|
|
implementation::convert_utf16be_to_latin1_with_errors(
|
|
const char16_t *buf, size_t len, char *latin1_output) const noexcept {
|
|
return scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(
|
|
buf, len, latin1_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(
|
|
const char16_t *buf, size_t len, char *latin1_output) const noexcept {
|
|
return scalar::utf16_to_latin1::convert_valid<endianness::LITTLE>(
|
|
buf, len, latin1_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(
|
|
const char16_t *buf, size_t len, char *latin1_output) const noexcept {
|
|
return scalar::utf16_to_latin1::convert_valid<endianness::BIG>(buf, len,
|
|
latin1_output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
|
|
simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
|
|
const char16_t *buf, size_t len, char *utf8_output) const noexcept {
|
|
return scalar::utf16_to_utf8::convert<endianness::LITTLE>(buf, len,
|
|
utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
|
|
const char16_t *buf, size_t len, char *utf8_output) const noexcept {
|
|
return scalar::utf16_to_utf8::convert<endianness::BIG>(buf, len, utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
|
|
const char16_t *buf, size_t len, char *utf8_output) const noexcept {
|
|
return scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
|
|
buf, len, utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
|
|
const char16_t *buf, size_t len, char *utf8_output) const noexcept {
|
|
return scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
|
|
buf, len, utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
|
|
const char16_t *buf, size_t len, char *utf8_output) const noexcept {
|
|
return scalar::utf16_to_utf8::convert_valid<endianness::LITTLE>(buf, len,
|
|
utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
|
|
const char16_t *buf, size_t len, char *utf8_output) const noexcept {
|
|
return scalar::utf16_to_utf8::convert_valid<endianness::BIG>(buf, len,
|
|
utf8_output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
|
|
|
|
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
|
|
simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
|
|
const char32_t *buf, size_t len, char *latin1_output) const noexcept {
|
|
return scalar::utf32_to_latin1::convert(buf, len, latin1_output);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
|
|
const char32_t *buf, size_t len, char *latin1_output) const noexcept {
|
|
return scalar::utf32_to_latin1::convert_with_errors(buf, len, latin1_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
|
|
const char32_t *buf, size_t len, char *latin1_output) const noexcept {
|
|
return scalar::utf32_to_latin1::convert_valid(buf, len, latin1_output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
|
|
simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
|
|
const char32_t *buf, size_t len, char *utf8_output) const noexcept {
|
|
return scalar::utf32_to_utf8::convert(buf, len, utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
|
|
const char32_t *buf, size_t len, char *utf8_output) const noexcept {
|
|
return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
|
|
const char32_t *buf, size_t len, char *utf8_output) const noexcept {
|
|
return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
|
|
|
|
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  return scalar::utf32_to_utf16::convert<endianness::LITTLE>(buf, len,
                                                             utf16_output);
}

simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  return scalar::utf32_to_utf16::convert<endianness::BIG>(buf, len,
                                                          utf16_output);
}

simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  return scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
      buf, len, utf16_output);
}

simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  return scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
      buf, len, utf16_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  return scalar::utf32_to_utf16::convert_valid<endianness::LITTLE>(
      buf, len, utf16_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  return scalar::utf32_to_utf16::convert_valid<endianness::BIG>(buf, len,
                                                                utf16_output);
}

simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  return scalar::utf16_to_utf32::convert<endianness::LITTLE>(buf, len,
                                                             utf32_output);
}

simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  return scalar::utf16_to_utf32::convert<endianness::BIG>(buf, len,
                                                          utf32_output);
}

simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  return scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
      buf, len, utf32_output);
}

simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  return scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
      buf, len, utf32_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  return scalar::utf16_to_utf32::convert_valid<endianness::LITTLE>(
      buf, len, utf32_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  return scalar::utf16_to_utf32::convert_valid<endianness::BIG>(buf, len,
                                                                utf32_output);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16
void implementation::change_endianness_utf16(const char16_t *input,
                                             size_t length,
                                             char16_t *output) const noexcept {
  scalar::utf16::change_endianness_utf16(input, length, output);
}

simdutf_warn_unused size_t implementation::count_utf16le(
    const char16_t *input, size_t length) const noexcept {
  return scalar::utf16::count_code_points<endianness::LITTLE>(input, length);
}

simdutf_warn_unused size_t implementation::count_utf16be(
    const char16_t *input, size_t length) const noexcept {
  return scalar::utf16::count_code_points<endianness::BIG>(input, length);
}
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8
simdutf_warn_unused size_t
implementation::count_utf8(const char *input, size_t length) const noexcept {
  return scalar::utf8::count_code_points(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
    const char *buf, size_t len) const noexcept {
  return scalar::utf8::count_code_points(buf, len);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
    const char *input, size_t length) const noexcept {
  size_t answer = length;
  size_t i = 0;
  auto pop = [](uint64_t v) {
    return (size_t)(((v >> 7) & UINT64_C(0x0101010101010101)) *
                        UINT64_C(0x0101010101010101) >>
                    56);
  };
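  // The `pop` lambda is a SWAR (SIMD-within-a-register) byte popcount:
  // (v >> 7) & 0x0101...01 keeps one bit per byte, set exactly when that byte
  // is >= 0x80, i.e. a Latin-1 code point that needs two bytes in UTF-8.
  // Multiplying by 0x0101...01 sums the eight per-byte flags into the top
  // byte, and >> 56 extracts the sum. For example (illustrative values):
  //   v                  = 0x0080008000000080  (three bytes >= 0x80)
  //   (v >> 7) & 0x01... = 0x0001000100000001
  //   * 0x01...   >> 56  = 3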
  for (; i + 32 <= length; i += 32) {
    uint64_t v;
    memcpy(&v, input + i, sizeof(v));
    answer += pop(v);
    memcpy(&v, input + i + 8, sizeof(v));
    answer += pop(v);
    memcpy(&v, input + i + 16, sizeof(v));
    answer += pop(v);
    memcpy(&v, input + i + 24, sizeof(v));
    answer += pop(v);
  }
  for (; i + 8 <= length; i += 8) {
    uint64_t v;
    memcpy(&v, input + i, sizeof(v));
    answer += pop(v);
  }
  for (; i + 1 <= length; i += 1) {
    answer += static_cast<uint8_t>(input[i]) >> 7;
  }
  return answer;
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
    const char16_t *input, size_t length) const noexcept {
  return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(input,
                                                                   length);
}

simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
    const char16_t *input, size_t length) const noexcept {
  return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
    const char16_t *input, size_t length) const noexcept {
  return scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>(input,
                                                                    length);
}

simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
    const char16_t *input, size_t length) const noexcept {
  return scalar::utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
    const char *input, size_t length) const noexcept {
  return scalar::utf8::utf16_length_from_utf8(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
    const char32_t *input, size_t length) const noexcept {
  return scalar::utf32::utf8_length_from_utf32(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
    const char32_t *input, size_t length) const noexcept {
  return scalar::utf32::utf16_length_from_utf32(input, length);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
    const char *input, size_t length) const noexcept {
  return scalar::utf8::count_code_points(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_BASE64
simdutf_warn_unused result implementation::base64_to_binary(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  const bool ignore_garbage =
      (options == base64_options::base64_url_accept_garbage) ||
      (options == base64_options::base64_default_accept_garbage);
  while (length > 0 &&
         scalar::base64::is_ascii_white_space(input[length - 1])) {
    length--;
  }
  size_t equallocation =
      length; // location of the first padding character if any
  size_t equalsigns = 0;
  if (length > 0 && input[length - 1] == '=') {
    equallocation = length - 1;
    length -= 1;
    equalsigns++;
    while (length > 0 &&
           scalar::base64::is_ascii_white_space(input[length - 1])) {
      length--;
    }
    if (length > 0 && input[length - 1] == '=') {
      equallocation = length - 1;
      equalsigns++;
      length -= 1;
    }
  }
  if (length == 0) {
    if (!ignore_garbage && equalsigns > 0) {
      if (last_chunk_options == last_chunk_handling_options::strict) {
        return {BASE64_INPUT_REMAINDER, 0};
      } else if (last_chunk_options ==
                 last_chunk_handling_options::stop_before_partial) {
        return {SUCCESS, 0};
      }
      return {INVALID_BASE64_CHARACTER, equallocation};
    }
    return {SUCCESS, 0};
  }
  result r = scalar::base64::base64_tail_decode(
      output, input, length, equalsigns, options, last_chunk_options);
  if (last_chunk_options != stop_before_partial &&
      r.error == error_code::SUCCESS && equalsigns > 0 && !ignore_garbage) {
    // additional checks
    if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) {
      return {INVALID_BASE64_CHARACTER, equallocation};
    }
  }
  return r;
}
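// Note on the padding check above: a padded base64 block decodes to 1 or 2
// bytes, so on success we require (decoded_bytes % 3) + 1 + equalsigns == 4.
// For example, "QUI=" decodes to 2 bytes ("AB"): 2 + 1 + 1 == 4, accepted;
// "QUJD=" decodes to 3 bytes with a stray '=' and is rejected.
// Caller-side sketch (illustrative only, not part of this file):
//   size_t cap = simdutf::maximal_binary_length_from_base64(src, len);
//   std::vector<char> out(cap);
//   simdutf::result r = simdutf::base64_to_binary(src, len, out.data());
//   if (r.error == simdutf::error_code::SUCCESS) {
//     // r.count bytes of binary data were written to out
//   }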

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  const bool ignore_garbage =
      (options == base64_options::base64_url_accept_garbage) ||
      (options == base64_options::base64_default_accept_garbage);
  while (length > 0 &&
         scalar::base64::is_ascii_white_space(input[length - 1])) {
    length--;
  }
  size_t equallocation =
      length; // location of the first padding character if any
  size_t equalsigns = 0;
  if (length > 0 && input[length - 1] == '=') {
    equallocation = length - 1;
    length -= 1;
    equalsigns++;
    while (length > 0 &&
           scalar::base64::is_ascii_white_space(input[length - 1])) {
      length--;
    }
    if (length > 0 && input[length - 1] == '=') {
      equallocation = length - 1;
      equalsigns++;
      length -= 1;
    }
  }
  if (length == 0) {
    if (!ignore_garbage && equalsigns > 0) {
      if (last_chunk_options == last_chunk_handling_options::strict) {
        return {BASE64_INPUT_REMAINDER, 0, 0};
      } else if (last_chunk_options ==
                 last_chunk_handling_options::stop_before_partial) {
        return {SUCCESS, 0, 0};
      }
      return {INVALID_BASE64_CHARACTER, equallocation, 0};
    }
    return {SUCCESS, 0, 0};
  }
  full_result r = scalar::base64::base64_tail_decode(
      output, input, length, equalsigns, options, last_chunk_options);
  if (last_chunk_options != stop_before_partial &&
      r.error == error_code::SUCCESS && equalsigns > 0 && !ignore_garbage) {
    // additional checks
    if ((r.output_count % 3 == 0) ||
        ((r.output_count % 3) + 1 + equalsigns != 4)) {
      return {INVALID_BASE64_CHARACTER, equallocation, r.output_count};
    }
  }
  return r;
}

simdutf_warn_unused result implementation::base64_to_binary(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  const bool ignore_garbage =
      (options == base64_options::base64_url_accept_garbage) ||
      (options == base64_options::base64_default_accept_garbage);
  while (length > 0 &&
         scalar::base64::is_ascii_white_space(input[length - 1])) {
    length--;
  }
  size_t equallocation =
      length; // location of the first padding character if any
  size_t equalsigns = 0;
  if (length > 0 && input[length - 1] == '=') {
    equallocation = length - 1;
    length -= 1;
    equalsigns++;
    while (length > 0 &&
           scalar::base64::is_ascii_white_space(input[length - 1])) {
      length--;
    }
    if (length > 0 && input[length - 1] == '=') {
      equallocation = length - 1;
      equalsigns++;
      length -= 1;
    }
  }
  if (length == 0) {
    if (!ignore_garbage && equalsigns > 0) {
      if (last_chunk_options == last_chunk_handling_options::strict) {
        return {BASE64_INPUT_REMAINDER, 0};
      } else if (last_chunk_options ==
                 last_chunk_handling_options::stop_before_partial) {
        return {SUCCESS, 0};
      }
      return {INVALID_BASE64_CHARACTER, equallocation};
    }
    return {SUCCESS, 0};
  }
  result r = scalar::base64::base64_tail_decode(
      output, input, length, equalsigns, options, last_chunk_options);
  if (last_chunk_options != stop_before_partial &&
      r.error == error_code::SUCCESS && equalsigns > 0 && !ignore_garbage) {
    // additional checks
    if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) {
      return {INVALID_BASE64_CHARACTER, equallocation};
    }
  }
  return r;
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  const bool ignore_garbage =
      (options == base64_options::base64_url_accept_garbage) ||
      (options == base64_options::base64_default_accept_garbage);
  while (length > 0 &&
         scalar::base64::is_ascii_white_space(input[length - 1])) {
    length--;
  }
  size_t equallocation =
      length; // location of the first padding character if any
  size_t equalsigns = 0;
  if (length > 0 && input[length - 1] == '=') {
    equallocation = length - 1;
    length -= 1;
    equalsigns++;
    while (length > 0 &&
           scalar::base64::is_ascii_white_space(input[length - 1])) {
      length--;
    }
    if (length > 0 && input[length - 1] == '=') {
      equallocation = length - 1;
      equalsigns++;
      length -= 1;
    }
  }
  if (length == 0) {
    if (!ignore_garbage && equalsigns > 0) {
      if (last_chunk_options == last_chunk_handling_options::strict) {
        return {BASE64_INPUT_REMAINDER, 0, 0};
      } else if (last_chunk_options ==
                 last_chunk_handling_options::stop_before_partial) {
        return {SUCCESS, 0, 0};
      }
      return {INVALID_BASE64_CHARACTER, equallocation, 0};
    }
    return {SUCCESS, 0, 0};
  }
  full_result r = scalar::base64::base64_tail_decode(
      output, input, length, equalsigns, options, last_chunk_options);
  if (last_chunk_options != stop_before_partial &&
      r.error == error_code::SUCCESS && equalsigns > 0 && !ignore_garbage) {
    // additional checks
    if ((r.output_count % 3 == 0) ||
        ((r.output_count % 3) + 1 + equalsigns != 4)) {
      return {INVALID_BASE64_CHARACTER, equallocation, r.output_count};
    }
  }
  return r;
}

size_t implementation::binary_to_base64(const char *input, size_t length,
                                        char *output,
                                        base64_options options) const noexcept {
  return scalar::base64::tail_encode_base64(output, input, length, options);
}
#endif // SIMDUTF_FEATURE_BASE64

} // namespace fallback
} // namespace simdutf

/* begin file src/simdutf/fallback/end.h */
/* end file src/simdutf/fallback/end.h */
/* end file src/fallback/implementation.cpp */
#endif
#if SIMDUTF_IMPLEMENTATION_ICELAKE
/* begin file src/icelake/implementation.cpp */
#include <tuple>
#include <utility>

/* begin file src/simdutf/icelake/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "icelake"
// #define SIMDUTF_IMPLEMENTATION icelake

#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
// nothing needed.
#else
SIMDUTF_TARGET_ICELAKE
#endif

#if SIMDUTF_GCC11ORMORE // workaround for
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
// clang-format off
SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
// clang-format on
#endif // end of workaround
/* end file src/simdutf/icelake/begin.h */
namespace simdutf {
namespace icelake {
namespace {
#ifndef SIMDUTF_ICELAKE_H
#error "icelake.h must be included"
#endif
using namespace simd;

/* begin file src/icelake/icelake_macros.inl.cpp */

/*
  This upcoming macro (SIMDUTF_ICELAKE_TRANSCODE16) takes 16 + 4 bytes (of a
  UTF-8 string) and loads all possible 4-byte substrings into an AVX512
  register.

  For example if we have bytes abcdefgh... we create following 32-bit lanes

  [abcd|bcde|cdef|defg|efgh|...]
   ^                          ^
   byte 0 of reg              byte 63 of reg
*/
/** pshufb
    # lane{0,1,2} have got bytes: [  0,  1,  2,  3,  4,  5,  6,  8,  9, 10,
                                    11, 12, 13, 14, 15]
    # lane3 has got bytes:        [ 16, 17, 18, 19,  4,  5,  6,  8,  9, 10,
                                    11, 12, 13, 14, 15]

    expand_ver2 = [
        # lane 0:
        0, 1, 2, 3,
        1, 2, 3, 4,
        2, 3, 4, 5,
        3, 4, 5, 6,

        # lane 1:
        4, 5, 6, 7,
        5, 6, 7, 8,
        6, 7, 8, 9,
        7, 8, 9, 10,

        # lane 2:
        8, 9, 10, 11,
        9, 10, 11, 12,
        10, 11, 12, 13,
        11, 12, 13, 14,

        # lane 3 order: 13, 14, 15, 16  14, 15, 16, 17,  15, 16, 17, 18,
        #               16, 17, 18, 19
        12, 13, 14, 15,
        13, 14, 15, 0,
        14, 15, 0, 1,
        15, 0, 1, 2,
    ]
*/

#define SIMDUTF_ICELAKE_TRANSCODE16(LANE0, LANE1, MASKED)                      \
  {                                                                            \
    const __m512i merged = _mm512_mask_mov_epi32(LANE0, 0x1000, LANE1);        \
    const __m512i expand_ver2 = _mm512_setr_epi64(                             \
        0x0403020103020100, 0x0605040305040302, 0x0807060507060504,            \
        0x0a09080709080706, 0x0c0b0a090b0a0908, 0x0e0d0c0b0d0c0b0a,            \
        0x000f0e0d0f0e0d0c, 0x0201000f01000f0e);                               \
    const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2);            \
                                                                               \
    __mmask16 leading_bytes;                                                   \
    const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0);                       \
    const __m512i t0 = _mm512_and_si512(input, v_0000_00c0);                   \
    const __m512i v_0000_0080 = _mm512_set1_epi32(0x80);                       \
    leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080);                 \
                                                                               \
    __m512i char_class;                                                        \
    char_class = _mm512_srli_epi32(input, 4);                                  \
    /* char_class = ((input >> 4) & 0x0f) | 0x80808000 */                      \
    const __m512i v_0000_000f = _mm512_set1_epi32(0x0f);                       \
    const __m512i v_8080_8000 = _mm512_set1_epi32(0x80808000);                 \
    char_class =                                                               \
        _mm512_ternarylogic_epi32(char_class, v_0000_000f, v_8080_8000, 0xea); \
                                                                               \
    const int valid_count = static_cast<int>(count_ones(leading_bytes));       \
    const __m512i utf32 = expanded_utf8_to_utf32(char_class, input);           \
                                                                               \
    const __m512i out = _mm512_mask_compress_epi32(_mm512_setzero_si512(),     \
                                                   leading_bytes, utf32);      \
                                                                               \
    if (UTF32) {                                                               \
      if (MASKED) {                                                            \
        const __mmask16 valid = uint16_t((1 << valid_count) - 1);              \
        _mm512_mask_storeu_epi32((__m512i *)output, valid, out);               \
      } else {                                                                 \
        _mm512_storeu_si512((__m512i *)output, out);                           \
      }                                                                        \
      output += valid_count;                                                   \
    } else {                                                                   \
      if (MASKED) {                                                            \
        output += utf32_to_utf16_masked<big_endian>(                           \
            byteflip, out, valid_count, reinterpret_cast<char16_t *>(output)); \
      } else {                                                                 \
        output += utf32_to_utf16<big_endian>(                                  \
            byteflip, out, valid_count, reinterpret_cast<char16_t *>(output)); \
      }                                                                        \
    }                                                                          \
  }

#define SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(INPUT, VALID_COUNT, MASKED)       \
  {                                                                            \
    if (UTF32) {                                                               \
      if (MASKED) {                                                            \
        const __mmask16 valid_mask = uint16_t((1 << VALID_COUNT) - 1);         \
        _mm512_mask_storeu_epi32((__m512i *)output, valid_mask, INPUT);        \
      } else {                                                                 \
        _mm512_storeu_si512((__m512i *)output, INPUT);                         \
      }                                                                        \
      output += VALID_COUNT;                                                   \
    } else {                                                                   \
      if (MASKED) {                                                            \
        output += utf32_to_utf16_masked<big_endian>(                           \
            byteflip, INPUT, VALID_COUNT,                                      \
            reinterpret_cast<char16_t *>(output));                             \
      } else {                                                                 \
        output +=                                                              \
            utf32_to_utf16<big_endian>(byteflip, INPUT, VALID_COUNT,           \
                                       reinterpret_cast<char16_t *>(output));  \
      }                                                                        \
    }                                                                          \
  }

#define SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)                       \
  if (UTF32) {                                                                 \
    const __m128i t0 = _mm512_castsi512_si128(utf8);                           \
    const __m128i t1 = _mm512_extracti32x4_epi32(utf8, 1);                     \
    const __m128i t2 = _mm512_extracti32x4_epi32(utf8, 2);                     \
    const __m128i t3 = _mm512_extracti32x4_epi32(utf8, 3);                     \
    _mm512_storeu_si512((__m512i *)(output + 0 * 16),                          \
                        _mm512_cvtepu8_epi32(t0));                             \
    _mm512_storeu_si512((__m512i *)(output + 1 * 16),                          \
                        _mm512_cvtepu8_epi32(t1));                             \
    _mm512_storeu_si512((__m512i *)(output + 2 * 16),                          \
                        _mm512_cvtepu8_epi32(t2));                             \
    _mm512_storeu_si512((__m512i *)(output + 3 * 16),                          \
                        _mm512_cvtepu8_epi32(t3));                             \
  } else {                                                                     \
    const __m256i h0 = _mm512_castsi512_si256(utf8);                           \
    const __m256i h1 = _mm512_extracti64x4_epi64(utf8, 1);                     \
    if (big_endian) {                                                          \
      _mm512_storeu_si512(                                                     \
          (__m512i *)(output + 0 * 16),                                        \
          _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h0), byteflip));            \
      _mm512_storeu_si512(                                                     \
          (__m512i *)(output + 2 * 16),                                        \
          _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h1), byteflip));            \
    } else {                                                                   \
      _mm512_storeu_si512((__m512i *)(output + 0 * 16),                        \
                          _mm512_cvtepu8_epi16(h0));                           \
      _mm512_storeu_si512((__m512i *)(output + 2 * 16),                        \
                          _mm512_cvtepu8_epi16(h1));                           \
    }                                                                          \
  }
/* end file src/icelake/icelake_macros.inl.cpp */
/* begin file src/icelake/icelake_common.inl.cpp */
// file included directly
/**
 * Store the last N bytes of `previous` followed by the first 64 - N bytes of
 * `input`, i.e. the input stream as it looked N bytes earlier.
 */
template <int N> __m512i prev(__m512i input, __m512i previous) {
  static_assert(N <= 32, "N must be no larger than 32");
  const __m512i movemask =
      _mm512_setr_epi32(28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);
  const __m512i rotated = _mm512_permutex2var_epi32(input, movemask, previous);
#if SIMDUTF_GCC8 || SIMDUTF_GCC9
  constexpr int shift = 16 - N; // workaround for GCC8,9
  return _mm512_alignr_epi8(input, rotated, shift);
#else
  return _mm512_alignr_epi8(input, rotated, 16 - N);
#endif // SIMDUTF_GCC8 || SIMDUTF_GCC9
}
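// Illustration (not part of the library): writing the 64-byte registers as
// byte sequences, prev<2>(input, previous) yields
//   previous[62], previous[63], input[0], input[1], ..., input[61]
// so that byte i of the result is the stream byte two positions before
// input[i] -- exactly what the UTF-8 checker needs to examine a byte
// together with its predecessors.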

template <unsigned idx0, unsigned idx1, unsigned idx2, unsigned idx3>
__m512i shuffle_epi128(__m512i v) {
  static_assert((idx0 >= 0 && idx0 <= 3), "idx0 must be in range 0..3");
  static_assert((idx1 >= 0 && idx1 <= 3), "idx1 must be in range 0..3");
  static_assert((idx2 >= 0 && idx2 <= 3), "idx2 must be in range 0..3");
  static_assert((idx3 >= 0 && idx3 <= 3), "idx3 must be in range 0..3");

  constexpr unsigned shuffle = idx0 | (idx1 << 2) | (idx2 << 4) | (idx3 << 6);
  return _mm512_shuffle_i32x4(v, v, shuffle);
}
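// Illustration (not part of the library): the immediate packs four 2-bit
// 128-bit-lane selectors. For instance, shuffle_epi128<1, 0, 3, 2>(v)
// computes shuffle = 1 | (0 << 2) | (3 << 4) | (2 << 6) = 0xb1 and swaps
// the two 128-bit lanes within each 256-bit half of v.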

template <unsigned idx> constexpr __m512i broadcast_epi128(__m512i v) {
  return shuffle_epi128<idx, idx, idx, idx>(v);
}

simdutf_really_inline __m512i broadcast_128bit_lane(__m128i lane) {
  const __m512i tmp = _mm512_castsi128_si512(lane);

  return broadcast_epi128<0>(tmp);
}
/* end file src/icelake/icelake_common.inl.cpp */
#if SIMDUTF_FEATURE_UTF8
/* begin file src/icelake/icelake_utf8_common.inl.cpp */
// Common procedures for both validating and non-validating conversions from
// UTF-8.
enum block_processing_mode { SIMDUTF_FULL, SIMDUTF_TAIL };

using utf8_to_utf16_result = std::pair<const char *, char16_t *>;
using utf8_to_utf32_result = std::pair<const char *, uint32_t *>;

/*
  process_block_utf8_to_utf16 converts up to 64 bytes from 'in' from UTF-8
  to UTF-16. When tail = SIMDUTF_FULL, then the full input buffer (64 bytes)
  might be used. When tail = SIMDUTF_TAIL, we take into account 'gap' which
  indicates how many input bytes are relevant.

  Returns true when the result is correct, otherwise it returns false.

  The provided in and out pointers are advanced according to how many input
  bytes have been processed, upon success.
*/
template <block_processing_mode tail, endianness big_endian>
simdutf_really_inline bool
process_block_utf8_to_utf16(const char *&in, char16_t *&out, size_t gap) {
  // constants
  __m512i mask_identity = _mm512_set_epi8(
      63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46,
      45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28,
      27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9,
      8, 7, 6, 5, 4, 3, 2, 1, 0);
  __m512i mask_c0c0c0c0 = _mm512_set1_epi32(0xc0c0c0c0);
  __m512i mask_80808080 = _mm512_set1_epi32(0x80808080);
  __m512i mask_f0f0f0f0 = _mm512_set1_epi32(0xf0f0f0f0);
  __m512i mask_dfdfdfdf_tail = _mm512_set_epi64(
      0xffffdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf,
      0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf,
      0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf);
  __m512i mask_c2c2c2c2 = _mm512_set1_epi32(0xc2c2c2c2);
  __m512i mask_ffffffff = _mm512_set1_epi32(0xffffffff);
  __m512i mask_d7c0d7c0 = _mm512_set1_epi32(0xd7c0d7c0);
  __m512i mask_dc00dc00 = _mm512_set1_epi32(0xdc00dc00);
  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
  // Note that 'tail' is a compile-time constant!
  __mmask64 b =
      (tail == SIMDUTF_FULL) ? 0xFFFFFFFFFFFFFFFF : (uint64_t(1) << gap) - 1;
  __m512i input = (tail == SIMDUTF_FULL) ? _mm512_loadu_si512(in)
                                         : _mm512_maskz_loadu_epi8(b, in);
  __mmask64 m1 = (tail == SIMDUTF_FULL)
                     ? _mm512_cmplt_epu8_mask(input, mask_80808080)
                     : _mm512_mask_cmplt_epu8_mask(b, input, mask_80808080);
  if (_ktestc_mask64_u8(m1,
                        b)) { // NOT(m1) AND b -- if all zeroes, then all ASCII
    // alternatively, we could do 'if (m1 == b) {'
    if (tail == SIMDUTF_FULL) {
      in += 64; // consumed 64 bytes
      // we convert a full 64-byte block, writing 128 bytes.
      __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
      if (big_endian) {
        input1 = _mm512_shuffle_epi8(input1, byteflip);
      }
      _mm512_storeu_si512(out, input1);
      out += 32;
      __m512i input2 =
          _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1));
      if (big_endian) {
        input2 = _mm512_shuffle_epi8(input2, byteflip);
      }
      _mm512_storeu_si512(out, input2);
      out += 32;
      return true; // we are done
    } else {
      in += gap;
      if (gap <= 32) {
        __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
        if (big_endian) {
          input1 = _mm512_shuffle_epi8(input1, byteflip);
        }
        _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << (gap)) - 1),
                                 input1);
        out += gap;
      } else {
        __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
        if (big_endian) {
          input1 = _mm512_shuffle_epi8(input1, byteflip);
        }
        _mm512_storeu_si512(out, input1);
        out += 32;
        __m512i input2 =
            _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1));
        if (big_endian) {
          input2 = _mm512_shuffle_epi8(input2, byteflip);
        }
        _mm512_mask_storeu_epi16(
            out, __mmask32((uint32_t(1) << (gap - 32)) - 1), input2);
        out += gap - 32;
      }
      return true; // we are done
    }
  }
  // classify characters further
  __mmask64 m234 = _mm512_cmp_epu8_mask(
      mask_c0c0c0c0, input,
      _MM_CMPINT_LE); // 0xc0 <= input, 2, 3, or 4 leading byte
  __mmask64 m34 =
      _mm512_cmp_epu8_mask(mask_dfdfdfdf_tail, input,
                           _MM_CMPINT_LT); // 0xdf < input, 3 or 4 leading byte

  __mmask64 milltwobytes = _mm512_mask_cmp_epu8_mask(
      m234, input, mask_c2c2c2c2,
      _MM_CMPINT_LT); // 0xc0 <= input < 0xc2 (illegal two byte sequence)
  // Overlong 2-byte sequence
  if (_ktestz_mask64_u8(milltwobytes, milltwobytes) == 0) {
    // Overlong 2-byte sequence
    return false;
  }
  if (_ktestz_mask64_u8(m34, m34) == 0) {
    // We have a 3-byte sequence and/or a 2-byte sequence, or possibly even a
    // 4-byte sequence!
    __mmask64 m4 = _mm512_cmp_epu8_mask(
        input, mask_f0f0f0f0,
        _MM_CMPINT_NLT); // 0xf0 <= zmm0 (4 byte start bytes)

    __mmask64 mask_not_ascii = (tail == SIMDUTF_FULL)
                                   ? _knot_mask64(m1)
                                   : _kand_mask64(_knot_mask64(m1), b);

    __mmask64 mp1 = _kshiftli_mask64(m234, 1);
    __mmask64 mp2 = _kshiftli_mask64(m34, 2);
    // We could do it as follows...
    // if (_kortestz_mask64_u8(m4,m4)) { // compute the bitwise OR of the 64-bit
    // masks a and b and return 1 if all zeroes but GCC generates better code
    // when we do:
    if (m4 == 0) { // compute the bitwise OR of the 64-bit masks a and b and
                   // return 1 if all zeroes
      // Fast path with 1,2,3 bytes
      __mmask64 mc = _kor_mask64(mp1, mp2); // expected continuation bytes
      __mmask64 m1234 = _kor_mask64(m1, m234);
      // mismatched continuation bytes:
      if (tail == SIMDUTF_FULL) {
        __mmask64 xnormcm1234 = _kxnor_mask64(
            mc,
            m1234); // XNOR of mc and m1234 should be all zero if they differ
        // the presence of a 1 bit indicates that they overlap.
        // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masks and
        // return 1 if all zeroes.
        if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) {
          return false;
        }
      } else {
        __mmask64 bxorm1234 = _kxor_mask64(b, m1234);
        if (mc != bxorm1234) {
          return false;
        }
      }
      // mend: identifying the last bytes of each sequence to be decoded
      __mmask64 mend = _kshiftri_mask64(m1234, 1);
      if (tail != SIMDUTF_FULL) {
        mend = _kor_mask64(mend, (uint64_t(1) << (gap - 1)));
      }

      __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity);
      __m512i last_and_thirdu16 =
          _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third));

      __m512i nonasciitags = _mm512_maskz_mov_epi8(
          mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000 other: 11000000
      __m512i clearedbytes = _mm512_andnot_si512(
          nonasciitags, input); // high two bits cleared where not ASCII
      __m512i lastbytes = _mm512_maskz_permutexvar_epi8(
          0x5555555555555555, last_and_thirdu16,
          clearedbytes); // the last byte of each character

      __mmask64 mask_before_non_ascii = _kshiftri_mask64(
          mask_not_ascii, 1); // bytes that precede non-ASCII bytes
      __m512i indexofsecondlastbytes = _mm512_add_epi16(
          mask_ffffffff, last_and_thirdu16); // indices of the second last bytes
      __m512i beforeasciibytes =
          _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes);
      __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(
          0x5555555555555555, indexofsecondlastbytes,
          beforeasciibytes); // the second last bytes (of two, three byte seq,
                             // surrogates)
      secondlastbytes =
          _mm512_slli_epi16(secondlastbytes, 6); // shifted into position

      __m512i indexofthirdlastbytes = _mm512_add_epi16(
          mask_ffffffff,
          indexofsecondlastbytes); // indices of the third last bytes
      __m512i thirdlastbyte =
          _mm512_maskz_mov_epi8(m34,
                                clearedbytes); // only those that are the third
                                               // last byte of a sequence
      __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(
          0x5555555555555555, indexofthirdlastbytes,
          thirdlastbyte); // the third last bytes (of three byte sequences, hi
                          // surrogate)
      thirdlastbytes =
          _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position
      __m512i Wout = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes,
                                               thirdlastbytes, 254);
      // the elements of Wout excluding the last element if it happens to be a
      // high surrogate:

      __mmask64 mprocessed =
          (tail == SIMDUTF_FULL)
              ? _pdep_u64(0xFFFFFFFF, mend)
              : _pdep_u64(
                    0xFFFFFFFF,
                    _kand_mask64(
                        mend, b)); // we adjust mend at the end of the output.

      // Encodings out of range...
      {
        // the location of 3-byte sequence start bytes in the input
        __mmask64 m3 = m34 & (b ^ m4);
        // code units in Wout corresponding to 3-byte sequences.
        __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend));
        __m512i mask_08000800 = _mm512_set1_epi32(0x08000800);
        __mmask32 Msmall800 =
            _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800);
        __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800);
        __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800);
        __mmask32 M3s =
            _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800);
        if (_kor_mask32(Msmall800, M3s)) {
          return false;
        }
      }
      int64_t nout = _mm_popcnt_u64(mprocessed);
      in += 64 - _lzcnt_u64(mprocessed);
      if (big_endian) {
        Wout = _mm512_shuffle_epi8(Wout, byteflip);
      }
      _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout);
      out += nout;
      return true; // ok
    }
    //
    // We have a 4-byte sequence, this is the general case.
    // Slow!
    __mmask64 mp3 = _kshiftli_mask64(m4, 3);
    __mmask64 mc =
        _kor_mask64(_kor_mask64(mp1, mp2), mp3); // expected continuation bytes
    __mmask64 m1234 = _kor_mask64(m1, m234);

    // mend: identifying the last bytes of each sequence to be decoded
    __mmask64 mend =
        _kor_mask64(_kshiftri_mask64(_kor_mask64(mp3, m1234), 1), mp3);
    if (tail != SIMDUTF_FULL) {
      mend = _kor_mask64(mend, __mmask64(uint64_t(1) << (gap - 1)));
    }
    __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity);
    __m512i last_and_thirdu16 =
        _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third));

    __m512i nonasciitags = _mm512_maskz_mov_epi8(
        mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000 other: 11000000
    __m512i clearedbytes = _mm512_andnot_si512(
        nonasciitags, input); // high two bits cleared where not ASCII
    __m512i lastbytes = _mm512_maskz_permutexvar_epi8(
        0x5555555555555555, last_and_thirdu16,
        clearedbytes); // the last byte of each character

    __mmask64 mask_before_non_ascii = _kshiftri_mask64(
        mask_not_ascii, 1); // bytes that precede non-ASCII bytes
    __m512i indexofsecondlastbytes = _mm512_add_epi16(
        mask_ffffffff, last_and_thirdu16); // indices of the second last bytes
    __m512i beforeasciibytes =
        _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes);
    __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(
        0x5555555555555555, indexofsecondlastbytes,
        beforeasciibytes); // the second last bytes (of two, three byte seq,
                           // surrogates)
    secondlastbytes =
        _mm512_slli_epi16(secondlastbytes, 6); // shifted into position

    __m512i indexofthirdlastbytes = _mm512_add_epi16(
        mask_ffffffff,
        indexofsecondlastbytes); // indices of the third last bytes
    __m512i thirdlastbyte = _mm512_maskz_mov_epi8(
        m34,
        clearedbytes); // only those that are the third last byte of a sequence
    __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(
        0x5555555555555555, indexofthirdlastbytes,
        thirdlastbyte); // the third last bytes (of three byte sequences, hi
                        // surrogate)
    thirdlastbytes =
        _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position
    __m512i thirdsecondandlastbytes = _mm512_ternarylogic_epi32(
        lastbytes, secondlastbytes, thirdlastbytes, 254);
    uint64_t Mlo_uint64 = _pext_u64(mp3, mend);
    __mmask32 Mlo = __mmask32(Mlo_uint64);
    __mmask32 Mhi = __mmask32(Mlo_uint64 >> 1);
    __m512i lo_surr_mask = _mm512_maskz_mov_epi16(
        Mlo,
        mask_dc00dc00); // lo surr: 1101110000000000, other: 0000000000000000
    __m512i shifted4_thirdsecondandlastbytes =
        _mm512_srli_epi16(thirdsecondandlastbytes,
                          4); // hi surr: 00000WVUTSRQPNML vuts = WVUTS - 1
    __m512i tagged_lo_surrogates = _mm512_or_si512(
        thirdsecondandlastbytes,
        lo_surr_mask); // lo surr: 110111KJHGFEDCBA, other: unchanged
    __m512i Wout = _mm512_mask_add_epi16(
        tagged_lo_surrogates, Mhi, shifted4_thirdsecondandlastbytes,
        mask_d7c0d7c0); // hi sur: 110110vutsRQPNML, other: unchanged
    // the elements of Wout excluding the last element if it happens to be a
    // high surrogate:
    __mmask32 Mout = ~(Mhi & 0x80000000);
    __mmask64 mprocessed =
        (tail == SIMDUTF_FULL)
            ? _pdep_u64(Mout, mend)
            : _pdep_u64(
                  Mout,
                  _kand_mask64(mend,
                               b)); // we adjust mend at the end of the output.

    // mismatched continuation bytes:
    if (tail == SIMDUTF_FULL) {
      __mmask64 xnormcm1234 = _kxnor_mask64(
          mc, m1234); // XNOR of mc and m1234 should be all zero if they differ
      // the presence of a 1 bit indicates that they overlap.
      // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masks and return
      // 1 if all zeroes.
      if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) {
        return false;
      }
    } else {
      __mmask64 bxorm1234 = _kxor_mask64(b, m1234);
      if (mc != bxorm1234) {
        return false;
      }
    }
    // Encodings out of range...
    {
      // the location of 3-byte sequence start bytes in the input
      __mmask64 m3 = m34 & (b ^ m4);
      // code units in Wout corresponding to 3-byte sequences.
      __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend));
      __m512i mask_08000800 = _mm512_set1_epi32(0x08000800);
      __mmask32 Msmall800 =
          _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800);
      __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800);
      __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800);
      __mmask32 M3s =
          _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800);
      __m512i mask_04000400 = _mm512_set1_epi32(0x04000400);
      __mmask32 M4s =
          _mm512_mask_cmpge_epu16_mask(Mhi, Moutminusd800, mask_04000400);
      if (!_kortestz_mask32_u8(M4s, _kor_mask32(Msmall800, M3s))) {
        return false;
      }
    }
    in += 64 - _lzcnt_u64(mprocessed);
    int64_t nout = _mm_popcnt_u64(mprocessed);
    if (big_endian) {
      Wout = _mm512_shuffle_epi8(Wout, byteflip);
    }
    _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout);
    out += nout;
    return true; // ok
  }
  // Fast path 2: all ASCII or 2 byte
  __mmask64 continuation_or_ascii = (tail == SIMDUTF_FULL)
                                        ? _knot_mask64(m234)
                                        : _kand_mask64(_knot_mask64(m234), b);
  // on top of -0xc0 we subtract -2, which we get back later from the
  // continuation byte tags
  __m512i leading2byte = _mm512_maskz_sub_epi8(m234, input, mask_c2c2c2c2);
  __mmask64 leading = (tail == SIMDUTF_FULL)
                          ? _kor_mask64(m1, m234)
                          : _kand_mask64(_kor_mask64(m1, m234),
                                         b); // first bytes of each sequence
  if (tail == SIMDUTF_FULL) {
    __mmask64 xnor234leading =
        _kxnor_mask64(_kshiftli_mask64(m234, 1), leading);
    if (!_kortestz_mask64_u8(xnor234leading, xnor234leading)) {
      return false;
    }
  } else {
    __mmask64 bxorleading = _kxor_mask64(b, leading);
    if (_kshiftli_mask64(m234, 1) != bxorleading) {
      return false;
    }
  }
  //
  if (tail == SIMDUTF_FULL) {
    // In the two-byte/ASCII scenario, we are easily latency bound, so we want
    // to increment the input buffer as quickly as possible.
    // We process 32 bytes unless the byte at index 32 is a continuation byte,
    // in which case we include it as well for a total of 33 bytes.
    // Note that if x is an ASCII byte, then the following is false:
    // int8_t(x) <= int8_t(0xc0) under two's complement.
    in += 32;
    if (int8_t(*in) <= int8_t(0xc0))
      in++;
    // The alternative is to do
    // in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii));
    // but it requires loading the input, doing the mask computation, and
    // converting back the mask to a general register. It just takes too long,
    // leaving the processor likely to be idle.
  } else {
    in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii));
  }
  __m512i lead = _mm512_maskz_compress_epi8(
      leading, leading2byte); // will contain zero for ascii, and the data
  lead = _mm512_cvtepu8_epi16(
      _mm512_castsi512_si256(lead)); // ... zero extended into code units
  __m512i follow = _mm512_maskz_compress_epi8(
      continuation_or_ascii, input); // the last bytes of each sequence
  follow = _mm512_cvtepu8_epi16(
      _mm512_castsi512_si256(follow)); // ... zero extended into code units
  lead = _mm512_slli_epi16(lead, 6); // shifted into position
  __m512i final = _mm512_add_epi16(follow, lead); // combining lead and follow

  if (big_endian) {
    final = _mm512_shuffle_epi8(final, byteflip);
  }
  if (tail == SIMDUTF_FULL) {
    // Next part is UTF-16 specific and can be generalized to UTF-32.
    int nout = _mm_popcnt_u32(uint32_t(leading));
    _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final);
    out += nout; // UTF-8 to UTF-16 is only expansionary in this case.
  } else {
    int nout = int(_mm_popcnt_u64(_pdep_u64(0xFFFFFFFF, leading)));
    _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final);
    out += nout; // UTF-8 to UTF-16 is only expansionary in this case.
  }

  return true; // we are fine.
}
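// Driver sketch (illustrative only; the actual conversion routines in this
// file add validation and error handling around this helper):
//   const char *in = source;
//   char16_t *out = destination;
//   while (source_end - in >= 64) {
//     if (!process_block_utf8_to_utf16<SIMDUTF_FULL, endianness::LITTLE>(
//             in, out, 64)) { /* invalid UTF-8 */ }
//   }
//   if (size_t gap = size_t(source_end - in)) {
//     process_block_utf8_to_utf16<SIMDUTF_TAIL, endianness::LITTLE>(in, out,
//                                                                   gap);
//   }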

/*
  utf32_to_utf16_masked converts `count` lower UTF-32 code units
  from input `utf32` into UTF-16. It differs from utf32_to_utf16
  in that it 'masks' the writes.

  Returns how many 16-bit code units were stored.

  byteflip is used for flipping 16-bit code units, and it should be
    __m512i byteflip = _mm512_setr_epi64(
            0x0607040502030001,
            0x0e0f0c0d0a0b0809,
            0x0607040502030001,
            0x0e0f0c0d0a0b0809,
            0x0607040502030001,
            0x0e0f0c0d0a0b0809,
            0x0607040502030001,
            0x0e0f0c0d0a0b0809
    );
  We pass it to the (always inlined) function to encourage the compiler to
  keep the value in a (constant) register.
*/
template <endianness big_endian>
simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip,
                                                   __m512i utf32,
                                                   unsigned int count,
                                                   char16_t *output) {

  const __mmask16 valid = uint16_t((1 << count) - 1);
  // 1. check if we have any surrogate pairs
  const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff);
  const __mmask16 sp_mask =
      _mm512_mask_cmpgt_epu32_mask(valid, utf32, v_0000_ffff);

  if (sp_mask == 0) {
    if (big_endian) {
      _mm256_mask_storeu_epi16(
          (__m256i *)output, valid,
          _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32),
                              _mm512_castsi512_si256(byteflip)));

    } else {
      _mm256_mask_storeu_epi16((__m256i *)output, valid,
                               _mm512_cvtepi32_epi16(utf32));
    }
    return count;
  }

  {
    // build surrogate pair code units in 32-bit lanes

    // t0 = 8 x [000000000000aaaa|aaaaaabbbbbbbbbb]
    const __m512i v_0001_0000 = _mm512_set1_epi32(0x00010000);
    const __m512i t0 = _mm512_sub_epi32(utf32, v_0001_0000);

    // t1 = 8 x [000000aaaaaaaaaa|bbbbbbbbbb000000]
    const __m512i t1 = _mm512_slli_epi32(t0, 6);

    // t2 = 8 x [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from t1
    // to t0
    // 0xe4 = (t1 and v_ffff_0000) or (t0 and not v_ffff_0000)
    const __m512i v_ffff_0000 = _mm512_set1_epi32(0xffff0000);
    const __m512i t2 = _mm512_ternarylogic_epi32(t1, t0, v_ffff_0000, 0xe4);

    // t3 = 8 x [110110aaaaaaaaaa|110111bbbbbbbbbb] -- tag the surrogates
    // 0xba = (t2 and not v_fc00_fc00) or v_d800_dc00
    const __m512i v_fc00_fc00 = _mm512_set1_epi32(0xfc00fc00);
    const __m512i v_d800_dc00 = _mm512_set1_epi32(0xd800dc00);
    const __m512i t3 =
        _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba);
    const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3);
    __m512i t5 = _mm512_ror_epi32(t4, 16);
    // Here we want to trim all of the upper 16-bit code units from the 2-byte
    // characters represented as 4-byte values. We can compute it from
    // sp_mask or the following... It can be more optimized!
    const __mmask32 nonzero = _kor_mask32(
        0xaaaaaaaa, _mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512()));
    const __mmask32 nonzero_masked =
        _kand_mask32(nonzero, __mmask32((uint64_t(1) << (2 * count)) - 1));
    if (big_endian) {
      t5 = _mm512_shuffle_epi8(t5, byteflip);
    }
    // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability
    // (AMD Zen4 has terrible performance with it, it is effectively broken)
    __m512i compressed = _mm512_maskz_compress_epi16(nonzero_masked, t5);
    _mm512_mask_storeu_epi16(
        output, _bzhi_u32(0xFFFFFFFF, count + _mm_popcnt_u32(sp_mask)),
        compressed);
    //_mm512_mask_compressstoreu_epi16(output, nonzero_masked, t5);
  }

  return count + static_cast<unsigned int>(count_ones(sp_mask));
}
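// Worked example (illustrative): for U+1F600 the code above computes
//   t0 = 0x1F600 - 0x10000 = 0xF600
//   high ten bits: 0xF600 >> 10  = 0x3D  -> 0xD800 + 0x3D  = 0xD83D
//   low ten bits:  0xF600 & 0x3FF = 0x200 -> 0xDC00 + 0x200 = 0xDE00
// so the stored surrogate pair is 0xD83D, 0xDE00, the standard UTF-16
// encoding of U+1F600.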

/*
  utf32_to_utf16 converts `count` lower UTF-32 code units
  from input `utf32` into UTF-16. It may overflow.

  Returns how many 16-bit code units were stored.

  byteflip is used for flipping 16-bit code units, and it should be
    __m512i byteflip = _mm512_setr_epi64(
            0x0607040502030001,
            0x0e0f0c0d0a0b0809,
            0x0607040502030001,
            0x0e0f0c0d0a0b0809,
            0x0607040502030001,
            0x0e0f0c0d0a0b0809,
            0x0607040502030001,
            0x0e0f0c0d0a0b0809
    );
  We pass it to the (always inlined) function to encourage the compiler to
  keep the value in a (constant) register.
*/
template <endianness big_endian>
simdutf_really_inline size_t utf32_to_utf16(const __m512i byteflip,
                                            __m512i utf32, unsigned int count,
                                            char16_t *output) {
  // check if we have any surrogate pairs
  const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff);
  const __mmask16 sp_mask = _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff);

  if (sp_mask == 0) {
    // technically, it should be _mm256_storeu_epi16
    if (big_endian) {
      _mm256_storeu_si256(
          (__m256i *)output,
          _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32),
                              _mm512_castsi512_si256(byteflip)));
    } else {
      _mm256_storeu_si256((__m256i *)output, _mm512_cvtepi32_epi16(utf32));
    }
    return count;
  }

  {
    // build surrogate pair code units in 32-bit lanes

    // t0 = 8 x [000000000000aaaa|aaaaaabbbbbbbbbb]
    const __m512i v_0001_0000 = _mm512_set1_epi32(0x00010000);
    const __m512i t0 = _mm512_sub_epi32(utf32, v_0001_0000);

    // t1 = 8 x [000000aaaaaaaaaa|bbbbbbbbbb000000]
    const __m512i t1 = _mm512_slli_epi32(t0, 6);

    // t2 = 8 x [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from t1
    // to t0
    // 0xe4 = (t1 and v_ffff_0000) or (t0 and not v_ffff_0000)
    const __m512i v_ffff_0000 = _mm512_set1_epi32(0xffff0000);
    const __m512i t2 = _mm512_ternarylogic_epi32(t1, t0, v_ffff_0000, 0xe4);

    // t3 = 8 x [110110aaaaaaaaaa|110111bbbbbbbbbb] -- tag the surrogates
    // 0xba = (t2 and not v_fc00_fc00) or v_d800_dc00
    const __m512i v_fc00_fc00 = _mm512_set1_epi32(0xfc00fc00);
    const __m512i v_d800_dc00 = _mm512_set1_epi32(0xd800dc00);
    const __m512i t3 =
        _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba);
    const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3);
    __m512i t5 = _mm512_ror_epi32(t4, 16);
    const __mmask32 nonzero = _kor_mask32(
        0xaaaaaaaa, _mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512()));
    if (big_endian) {
      t5 = _mm512_shuffle_epi8(t5, byteflip);
    }
    // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability
    // (zen4)
    __m512i compressed = _mm512_maskz_compress_epi16(nonzero, t5);
    _mm512_mask_storeu_epi16(
        output,
        (1 << (count + static_cast<unsigned int>(count_ones(sp_mask)))) - 1,
        compressed);
    //_mm512_mask_compressstoreu_epi16(output, nonzero, t5);
  }

  return count + static_cast<unsigned int>(count_ones(sp_mask));
}

/*
  expanded_utf8_to_utf32 converts expanded UTF-8 characters (`utf8`)
  stored at separate 32-bit lanes.

  For each lane we have also a character class (`char_class`), given in form
  0x8080800N, where N is 4 highest bits from the leading byte; 0x80 resets
  corresponding bytes during pshufb.
*/
simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class,
                                                     __m512i utf8) {
  /*
      Input:
      - utf8: bytes stored at separate 32-bit code units
      - valid: which code units have valid UTF-8 characters

      Bit layout of single word. We show 4 cases for each possible
      UTF-8 character encoding. The `?` denotes bits we must not
      assume their value.

      |10dd.dddd|10cc.cccc|10bb.bbbb|1111.0aaa| 4-byte char
      |????.????|10cc.cccc|10bb.bbbb|1110.aaaa| 3-byte char
      |????.????|????.????|10bb.bbbb|110a.aaaa| 2-byte char
      |????.????|????.????|????.????|0aaa.aaaa| ASCII char
        byte 3    byte 2    byte 1     byte 0
  */

  /* 1. Reset control bits of continuation bytes and the MSB
        of the leading byte; this makes all bytes unsigned (and
        does not alter ASCII char).

        |00dd.dddd|00cc.cccc|00bb.bbbb|0111.0aaa| 4-byte char
        |00??.????|00cc.cccc|00bb.bbbb|0110.aaaa| 3-byte char
        |00??.????|00??.????|00bb.bbbb|010a.aaaa| 2-byte char
        |00??.????|00??.????|00??.????|0aaa.aaaa| ASCII char
         ^^        ^^        ^^        ^
  */
  __m512i values;
  const __m512i v_3f3f_3f7f = _mm512_set1_epi32(0x3f3f3f7f);
  values = _mm512_and_si512(utf8, v_3f3f_3f7f);

  /* 2. Swap and join fields A-B and C-D

        |0000.cccc|ccdd.dddd|0001.110a|aabb.bbbb| 4-byte char
        |0000.cccc|cc??.????|0001.10aa|aabb.bbbb| 3-byte char
        |0000.????|????.????|0001.0aaa|aabb.bbbb| 2-byte char
        |0000.????|????.????|000a.aaaa|aa??.????| ASCII char */
  const __m512i v_0140_0140 = _mm512_set1_epi32(0x01400140);
  values = _mm512_maddubs_epi16(values, v_0140_0140);

  /* 3. Swap and join fields AB & CD

        |0000.0001|110a.aabb|bbbb.cccc|ccdd.dddd| 4-byte char
        |0000.0001|10aa.aabb|bbbb.cccc|cc??.????| 3-byte char
        |0000.0001|0aaa.aabb|bbbb.????|????.????| 2-byte char
        |0000.000a|aaaa.aa??|????.????|????.????| ASCII char */
  const __m512i v_0001_1000 = _mm512_set1_epi32(0x00011000);
  values = _mm512_madd_epi16(values, v_0001_1000);

  /* 4. Shift left the values by variable amounts to reset highest UTF-8 bits
        |aaab.bbbb|bccc.cccd|dddd.d000|0000.0000| 4-byte char -- by 11
        |aaaa.bbbb|bbcc.cccc|????.??00|0000.0000| 3-byte char -- by 10
        |aaaa.abbb|bbb?.????|????.???0|0000.0000| 2-byte char -- by 9
        |aaaa.aaa?|????.????|????.????|?000.0000| ASCII char -- by 7 */
  {
    /** pshufb

        continuation = 0
        ascii    = 7
        _2_bytes = 9
        _3_bytes = 10
        _4_bytes = 11

        shift_left_v3 = 4 * [
            ascii, # 0000
            ascii, # 0001
            ascii, # 0010
            ascii, # 0011
            ascii, # 0100
            ascii, # 0101
            ascii, # 0110
            ascii, # 0111
            continuation, # 1000
            continuation, # 1001
            continuation, # 1010
            continuation, # 1011
            _2_bytes, # 1100
            _2_bytes, # 1101
            _3_bytes, # 1110
            _4_bytes, # 1111
        ] */
    const __m512i shift_left_v3 = _mm512_setr_epi64(
        0x0707070707070707, 0x0b0a090900000000, 0x0707070707070707,
        0x0b0a090900000000, 0x0707070707070707, 0x0b0a090900000000,
        0x0707070707070707, 0x0b0a090900000000);

    const __m512i shift = _mm512_shuffle_epi8(shift_left_v3, char_class);
    values = _mm512_sllv_epi32(values, shift);
  }

  /* 5. Shift right the values by variable amounts to reset lowest bits
        |0000.0000|000a.aabb|bbbb.cccc|ccdd.dddd| 4-byte char -- by 11
        |0000.0000|0000.0000|aaaa.bbbb|bbcc.cccc| 3-byte char -- by 16
        |0000.0000|0000.0000|0000.0aaa|aabb.bbbb| 2-byte char -- by 21
        |0000.0000|0000.0000|0000.0000|0aaa.aaaa| ASCII char -- by 25 */
  {
    // 4 * [25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 21, 21, 16, 11]
    const __m512i shift_right = _mm512_setr_epi64(
        0x1919191919191919, 0x0b10151500000000, 0x1919191919191919,
        0x0b10151500000000, 0x1919191919191919, 0x0b10151500000000,
        0x1919191919191919, 0x0b10151500000000);

    const __m512i shift = _mm512_shuffle_epi8(shift_right, char_class);
    values = _mm512_srlv_epi32(values, shift);
  }

  return values;
}
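// Worked example (illustrative): the Euro sign U+20AC is the 3-byte sequence
// E2 82 AC, loaded into one lane as |????.????|AC|82|E2|.
//   step 1 masks it to |00??.????|2C|02|62| (control bits cleared),
//   steps 2-3 gather the payload bits into the low bits of the lane,
//   steps 4-5 apply the class-dependent shifts for a 3-byte char
//   (left by 10, then right by 16),
// leaving the code point 0x20AC in the lane.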

simdutf_really_inline __m512i expand_and_identify(__m512i lane0, __m512i lane1,
                                                  int &count) {
  const __m512i merged = _mm512_mask_mov_epi32(lane0, 0x1000, lane1);
  const __m512i expand_ver2 = _mm512_setr_epi64(
      0x0403020103020100, 0x0605040305040302, 0x0807060507060504,
      0x0a09080709080706, 0x0c0b0a090b0a0908, 0x0e0d0c0b0d0c0b0a,
      0x000f0e0d0f0e0d0c, 0x0201000f01000f0e);
  const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2);
  const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0);
  const __m512i t0 = _mm512_and_si512(input, v_0000_00c0);
  const __m512i v_0000_0080 = _mm512_set1_epi32(0x80);
  const __mmask16 leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080);
  count = static_cast<int>(count_ones(leading_bytes));
  return _mm512_mask_compress_epi32(_mm512_setzero_si512(), leading_bytes,
                                    input);
}
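// Illustration (not part of the library): each 32-bit lane begins with one
// input byte, and a UTF-8 continuation byte has the form 10xx.xxxx, so
// (byte & 0xc0) == 0x80 holds exactly for continuations. The mask above thus
// selects the lanes whose first byte starts a new character, and the
// compress packs those characters to the front, with `count` of them valid.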

simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input) {
  __m512i char_class = _mm512_srli_epi32(input, 4);
  /* char_class = ((input >> 4) & 0x0f) | 0x80808000 */
  const __m512i v_0000_000f = _mm512_set1_epi32(0x0f);
  const __m512i v_8080_8000 = _mm512_set1_epi32(0x80808000);
  char_class =
      _mm512_ternarylogic_epi32(char_class, v_0000_000f, v_8080_8000, 0xea);
  return expanded_utf8_to_utf32(char_class, input);
}
/* end file src/icelake/icelake_utf8_common.inl.cpp */
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
/* begin file src/icelake/icelake_utf8_validation.inl.cpp */
// file included directly

simdutf_really_inline __m512i check_special_cases(__m512i input,
                                                  const __m512i prev1) {
  __m512i mask1 = _mm512_setr_epi64(0x0202020202020202, 0x4915012180808080,
                                    0x0202020202020202, 0x4915012180808080,
                                    0x0202020202020202, 0x4915012180808080,
                                    0x0202020202020202, 0x4915012180808080);
  const __m512i v_0f = _mm512_set1_epi8(0x0f);
  __m512i index1 = _mm512_and_si512(_mm512_srli_epi16(prev1, 4), v_0f);

  __m512i byte_1_high = _mm512_shuffle_epi8(mask1, index1);
  __m512i mask2 = _mm512_setr_epi64(0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb,
                                    0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb,
                                    0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb,
                                    0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb);
  __m512i index2 = _mm512_and_si512(prev1, v_0f);

  __m512i byte_1_low = _mm512_shuffle_epi8(mask2, index2);
  __m512i mask3 =
      _mm512_setr_epi64(0x101010101010101, 0x1010101babaaee6, 0x101010101010101,
                        0x1010101babaaee6, 0x101010101010101, 0x1010101babaaee6,
                        0x101010101010101, 0x1010101babaaee6);
  __m512i index3 = _mm512_and_si512(_mm512_srli_epi16(input, 4), v_0f);
  __m512i byte_2_high = _mm512_shuffle_epi8(mask3, index3);
  return _mm512_ternarylogic_epi64(byte_1_high, byte_1_low, byte_2_high, 128);
}

simdutf_really_inline __m512i check_multibyte_lengths(const __m512i input,
                                                      const __m512i prev_input,
                                                      const __m512i sc) {
  __m512i prev2 = prev<2>(input, prev_input);
  __m512i prev3 = prev<3>(input, prev_input);
  __m512i is_third_byte = _mm512_subs_epu8(
      prev2, _mm512_set1_epi8(0b11100000u - 1)); // Only 111_____ will be > 0
  __m512i is_fourth_byte = _mm512_subs_epu8(
      prev3, _mm512_set1_epi8(0b11110000u - 1)); // Only 1111____ will be > 0
  __m512i is_third_or_fourth_byte =
      _mm512_or_si512(is_third_byte, is_fourth_byte);
  const __m512i v_7f = _mm512_set1_epi8(char(0x7f));
  is_third_or_fourth_byte = _mm512_adds_epu8(v_7f, is_third_or_fourth_byte);
  // We want to compute (is_third_or_fourth_byte AND v80) XOR sc.
  const __m512i v_80 = _mm512_set1_epi8(char(0x80));
  return _mm512_ternarylogic_epi32(is_third_or_fourth_byte, v_80, sc,
                                   0b1101010);
  //__m512i is_third_or_fourth_byte_mask =
  //_mm512_and_si512(is_third_or_fourth_byte, v_80); return
  // _mm512_xor_si512(is_third_or_fourth_byte_mask, sc);
}
//
// Return nonzero if there are incomplete multibyte characters at the end of
// the block: e.g. if there is a 4-byte character, but it is 3 bytes from the
// end.
//
simdutf_really_inline __m512i is_incomplete(const __m512i input) {
  // If the previous input's last 3 bytes match this, they're too short (they
  // ended at EOF):
  // ... 1111____ 111_____ 11______
  __m512i max_value = _mm512_setr_epi64(0xffffffffffffffff, 0xffffffffffffffff,
                                        0xffffffffffffffff, 0xffffffffffffffff,
                                        0xffffffffffffffff, 0xffffffffffffffff,
                                        0xffffffffffffffff, 0xbfdfefffffffffff);
  return _mm512_subs_epu8(input, max_value);
}
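// Illustration (not part of the library): the last three bytes of max_value
// are 0xef, 0xdf, 0xbf (stored little-endian inside 0xbfdfefffffffffff).
// The saturating subtraction leaves a nonzero byte exactly where the input
// exceeds them: a 4-byte lead (>= 0xf0) in the third-to-last byte, a 3- or
// 4-byte lead (>= 0xe0) in the second-to-last byte, or any multibyte lead
// (>= 0xc0) in the last byte -- precisely the sequences that cannot be
// completed within this 64-byte block.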

struct avx512_utf8_checker {
  // If this is nonzero, there has been a UTF-8 error.
  __m512i error{};

  // The last input we received
  __m512i prev_input_block{};
  // Whether the last input we received was incomplete (used for ASCII fast
  // path)
  __m512i prev_incomplete{};

  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const __m512i input,
                                              const __m512i prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    __m512i prev1 = prev<1>(input, prev_input);
    __m512i sc = check_special_cases(input, prev1);
    this->error = _mm512_or_si512(
        check_multibyte_lengths(input, prev_input, sc), this->error);
  }

  // The only problem that can happen at EOF is that a multibyte character is
  // too short or a byte value too large in the last bytes: check_special_cases
  // only checks for bytes too large in the first of two bytes.
  simdutf_really_inline void check_eof() {
    // If the previous block had incomplete UTF-8 characters at the end, an
    // ASCII block can't possibly finish them.
    this->error = _mm512_or_si512(this->error, this->prev_incomplete);
  }

  // returns true if ASCII.
  simdutf_really_inline bool check_next_input(const __m512i input) {
    const __m512i v_80 = _mm512_set1_epi8(char(0x80));
    const __mmask64 ascii = _mm512_test_epi8_mask(input, v_80);
    if (ascii == 0) {
      this->error = _mm512_or_si512(this->error, this->prev_incomplete);
      return true;
    } else {
      this->check_utf8_bytes(input, this->prev_input_block);
      this->prev_incomplete = is_incomplete(input);
      this->prev_input_block = input;
      return false;
    }
  }
  // do not forget to call check_eof!
  simdutf_really_inline bool errors() const {
    return _mm512_test_epi8_mask(this->error, this->error) != 0;
  }
}; // struct avx512_utf8_checker
|
|
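
// A minimal sketch of how a caller typically drives the checker, assuming
// for simplicity that the length is a multiple of 64 (the real entry points
// below handle tails with masked loads); illustrative only, kept out of the
// build, and the function name is ours:
#if 0
bool validate_utf8_blocks(const char *buf, size_t len) { // len % 64 == 0
  avx512_utf8_checker checker{};
  for (size_t i = 0; i < len; i += 64) {
    const __m512i block = _mm512_loadu_si512((const __m512i *)(buf + i));
    checker.check_next_input(block);
  }
  checker.check_eof(); // flags a truncated multibyte sequence at the end
  return !checker.errors();
}
#endif
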
/* end file src/icelake/icelake_utf8_validation.inl.cpp */
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF8 &&                                                    \
    (SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_LATIN1)
/* begin file src/icelake/icelake_from_valid_utf8.inl.cpp */
// file included directly

// File contains conversion procedure from VALID UTF-8 strings.

/*
    valid_utf8_to_fixed_length converts a valid UTF-8 string into UTF-32.

    The `OUTPUT` template type decides what to do with UTF-32: store
    it directly or convert into UTF-16 (with AVX512).

    Input:
    - str - valid UTF-8 string
    - len - string length
    - out_buffer - output buffer

    Result:
    - pair.first - the first unprocessed input byte
    - pair.second - the first unprocessed output word
*/
template <endianness big_endian, typename OUTPUT>
std::pair<const char *, OUTPUT *>
valid_utf8_to_fixed_length(const char *str, size_t len, OUTPUT *dwords) {
  constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
  constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
  static_assert(
      UTF32 or UTF16,
      "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
  static_assert(!(UTF32 and big_endian),
                "we do not currently support big-endian UTF-32");

  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
  const char *ptr = str;
  const char *end = ptr + len;

  OUTPUT *output = dwords;
  /**
   * In the main loop, we consume 64 bytes per iteration,
   * but we access 64 + 4 bytes.
   * We check for end - ptr >= 64 + 4 because
   * we want to do maskless loads without overruns.
   */
  while (end - ptr >= 64 + 4) {
    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
    const __m512i v_80 = _mm512_set1_epi8(char(0x80));
    const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80);
    if (ascii == 0) {
      SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
      output += 64;
      ptr += 64;
      continue;
    }

    const __m512i lane0 = broadcast_epi128<0>(utf8);
    const __m512i lane1 = broadcast_epi128<1>(utf8);
    int valid_count0;
    __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
    const __m512i lane2 = broadcast_epi128<2>(utf8);
    int valid_count1;
    __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
    if (valid_count0 + valid_count1 <= 16) {
      vec0 = _mm512_mask_expand_epi32(
          vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
      valid_count0 += valid_count1;
      vec0 = expand_utf8_to_utf32(vec0);
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
    } else {
      vec0 = expand_utf8_to_utf32(vec0);
      vec1 = expand_utf8_to_utf32(vec1);
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
    }
    const __m512i lane3 = broadcast_epi128<3>(utf8);
    int valid_count2;
    __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2);
    uint32_t tmp1;
    ::memcpy(&tmp1, ptr + 64, sizeof(tmp1));
    const __m512i lane4 = _mm512_set1_epi32(tmp1);
    int valid_count3;
    __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
    if (valid_count2 + valid_count3 <= 16) {
      vec2 = _mm512_mask_expand_epi32(
          vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3);
      valid_count2 += valid_count3;
      vec2 = expand_utf8_to_utf32(vec2);
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
    } else {
      vec2 = expand_utf8_to_utf32(vec2);
      vec3 = expand_utf8_to_utf32(vec3);
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, true)
    }
    ptr += 4 * 16;
  }

  if (end - ptr >= 64) {
    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
    const __m512i v_80 = _mm512_set1_epi8(char(0x80));
    const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80);
    if (ascii == 0) {
      SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
      output += 64;
      ptr += 64;
    } else {
      const __m512i lane0 = broadcast_epi128<0>(utf8);
      const __m512i lane1 = broadcast_epi128<1>(utf8);
      int valid_count0;
      __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
      const __m512i lane2 = broadcast_epi128<2>(utf8);
      int valid_count1;
      __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
      if (valid_count0 + valid_count1 <= 16) {
        vec0 = _mm512_mask_expand_epi32(
            vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
        valid_count0 += valid_count1;
        vec0 = expand_utf8_to_utf32(vec0);
        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
      } else {
        vec0 = expand_utf8_to_utf32(vec0);
        vec1 = expand_utf8_to_utf32(vec1);
        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
      }

      const __m512i lane3 = broadcast_epi128<3>(utf8);
      SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)

      ptr += 3 * 16;
    }
  }
  return {ptr, output};
}
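
// The `_mm512_mask_expand_epi32` calls above stitch two partially filled
// vectors together: `((1 << valid_count1) - 1) << valid_count0` builds a
// mask of valid_count1 ones starting at bit valid_count0, so the second
// vector's code points expand into the lanes right after the first
// vector's. A scalar sketch of the mask arithmetic (illustrative only,
// kept out of the build; the helper name is ours):
#if 0
uint16_t concat_mask(int valid_count0, int valid_count1) {
  // e.g. valid_count0 = 5, valid_count1 = 4 -> 0b0000000111100000
  return uint16_t(((1u << valid_count1) - 1) << valid_count0);
}
#endif
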
using utf8_to_utf16_result = std::pair<const char *, char16_t *>;
/* end file src/icelake/icelake_from_valid_utf8.inl.cpp */
/* begin file src/icelake/icelake_from_utf8.inl.cpp */
// file included directly

// File contains conversion procedure from possibly invalid UTF-8 strings.

// todo: replace with the utf-8 to utf-16 routine adapted to utf-32. This code
// is legacy.
template <endianness big_endian, typename OUTPUT>
std::pair<const char *, OUTPUT *>
validating_utf8_to_fixed_length(const char *str, size_t len, OUTPUT *dwords) {
  constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
  constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
  static_assert(
      UTF32 or UTF16,
      "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
  static_assert(!(UTF32 and big_endian),
                "we do not currently support big-endian UTF-32");

  const char *ptr = str;
  const char *end = ptr + len;
  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
  OUTPUT *output = dwords;
  avx512_utf8_checker checker{};
  /**
   * In the main loop, we consume 64 bytes per iteration,
   * but we access 64 + 4 bytes.
   * We use masked writes to avoid overruns, see
   * https://github.com/simdutf/simdutf/issues/471
   */
  while (end - ptr >= 64 + 4) {
    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
    if (checker.check_next_input(utf8)) {
      SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
      output += 64;
      ptr += 64;
      continue;
    }
    const __m512i lane0 = broadcast_epi128<0>(utf8);
    const __m512i lane1 = broadcast_epi128<1>(utf8);
    int valid_count0;
    __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
    const __m512i lane2 = broadcast_epi128<2>(utf8);
    int valid_count1;
    __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
    if (valid_count0 + valid_count1 <= 16) {
      vec0 = _mm512_mask_expand_epi32(
          vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
      valid_count0 += valid_count1;
      vec0 = expand_utf8_to_utf32(vec0);
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
    } else {
      vec0 = expand_utf8_to_utf32(vec0);
      vec1 = expand_utf8_to_utf32(vec1);
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
    }
    const __m512i lane3 = broadcast_epi128<3>(utf8);
    int valid_count2;
    __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2);
    uint32_t tmp1;
    ::memcpy(&tmp1, ptr + 64, sizeof(tmp1));
    const __m512i lane4 = _mm512_set1_epi32(tmp1);
    int valid_count3;
    __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
    if (valid_count2 + valid_count3 <= 16) {
      vec2 = _mm512_mask_expand_epi32(
          vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3);
      valid_count2 += valid_count3;
      vec2 = expand_utf8_to_utf32(vec2);
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
    } else {
      vec2 = expand_utf8_to_utf32(vec2);
      vec3 = expand_utf8_to_utf32(vec3);
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, true)
    }
    ptr += 4 * 16;
  }
  const char *validatedptr = ptr; // validated up to ptr

  // For the final pass, we validate 64 bytes, but we only transcode
  // 3*16 bytes, so we may end up double-validating 16 bytes.
  if (end - ptr >= 64) {
    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
    if (checker.check_next_input(utf8)) {
      SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
      output += 64;
      ptr += 64;
    } else {
      const __m512i lane0 = broadcast_epi128<0>(utf8);
      const __m512i lane1 = broadcast_epi128<1>(utf8);
      int valid_count0;
      __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
      const __m512i lane2 = broadcast_epi128<2>(utf8);
      int valid_count1;
      __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
      if (valid_count0 + valid_count1 <= 16) {
        vec0 = _mm512_mask_expand_epi32(
            vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
        valid_count0 += valid_count1;
        vec0 = expand_utf8_to_utf32(vec0);
        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
      } else {
        vec0 = expand_utf8_to_utf32(vec0);
        vec1 = expand_utf8_to_utf32(vec1);
        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
      }

      const __m512i lane3 = broadcast_epi128<3>(utf8);
      SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)

      ptr += 3 * 16;
    }
    validatedptr += 4 * 16;
  }
  if (end != validatedptr) {
    const __m512i utf8 =
        _mm512_maskz_loadu_epi8(~UINT64_C(0) >> (64 - (end - validatedptr)),
                                (const __m512i *)validatedptr);
    checker.check_next_input(utf8);
  }
  checker.check_eof();
  if (checker.errors()) {
    return {ptr, nullptr}; // We found an error.
  }
  return {ptr, output};
}
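
// The tail above loads the last (end - validatedptr) bytes with a masked
// load: `~UINT64_C(0) >> (64 - k)` is a mask of k ones, so only k bytes are
// read and the remaining lanes stay zero (zero bytes are ASCII and thus
// harmless to the validator). A sketch of the mask arithmetic (illustrative
// only, kept out of the build; the helper name is ours):
#if 0
__mmask64 low_bytes_mask(size_t k) { // requires 1 <= k <= 64
  return ~UINT64_C(0) >> (64 - k);   // e.g. k = 3 -> 0b111
}
#endif
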
// Like validating_utf8_to_fixed_length, but returns as soon as an error is
// identified. todo: replace with the utf-8 to utf-16 routine adapted to
// utf-32. This code is legacy.
template <endianness big_endian, typename OUTPUT>
std::tuple<const char *, OUTPUT *, bool>
validating_utf8_to_fixed_length_with_constant_checks(const char *str,
                                                     size_t len,
                                                     OUTPUT *dwords) {
  constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
  constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
  static_assert(
      UTF32 or UTF16,
      "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
  static_assert(!(UTF32 and big_endian),
                "we do not currently support big-endian UTF-32");

  const char *ptr = str;
  const char *end = ptr + len;
  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
  OUTPUT *output = dwords;
  avx512_utf8_checker checker{};
  /**
   * In the main loop, we consume 64 bytes per iteration,
   * but we access 64 + 4 bytes.
   */
  while (end - ptr >= 4 + 64) {
    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
    bool ascii = checker.check_next_input(utf8);
    if (checker.errors()) {
      return {ptr, output, false}; // We found an error.
    }
    if (ascii) {
      SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
      output += 64;
      ptr += 64;
      continue;
    }
    const __m512i lane0 = broadcast_epi128<0>(utf8);
    const __m512i lane1 = broadcast_epi128<1>(utf8);
    int valid_count0;
    __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
    const __m512i lane2 = broadcast_epi128<2>(utf8);
    int valid_count1;
    __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
    if (valid_count0 + valid_count1 <= 16) {
      vec0 = _mm512_mask_expand_epi32(
          vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
      valid_count0 += valid_count1;
      vec0 = expand_utf8_to_utf32(vec0);
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
    } else {
      vec0 = expand_utf8_to_utf32(vec0);
      vec1 = expand_utf8_to_utf32(vec1);
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
    }
    const __m512i lane3 = broadcast_epi128<3>(utf8);
    int valid_count2;
    __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2);
    uint32_t tmp1;
    ::memcpy(&tmp1, ptr + 64, sizeof(tmp1));
    const __m512i lane4 = _mm512_set1_epi32(tmp1);
    int valid_count3;
    __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
    if (valid_count2 + valid_count3 <= 16) {
      vec2 = _mm512_mask_expand_epi32(
          vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3);
      valid_count2 += valid_count3;
      vec2 = expand_utf8_to_utf32(vec2);
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
    } else {
      vec2 = expand_utf8_to_utf32(vec2);
      vec3 = expand_utf8_to_utf32(vec3);
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, true)
    }
    ptr += 4 * 16;
  }
  const char *validatedptr = ptr; // validated up to ptr

  // For the final pass, we validate 64 bytes, but we only transcode
  // 3*16 bytes, so we may end up double-validating 16 bytes.
  if (end - ptr >= 64) {
    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
    bool ascii = checker.check_next_input(utf8);
    if (checker.errors()) {
      return {ptr, output, false}; // We found an error.
    }
    if (ascii) {
      SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
      output += 64;
      ptr += 64;
    } else {
      const __m512i lane0 = broadcast_epi128<0>(utf8);
      const __m512i lane1 = broadcast_epi128<1>(utf8);
      int valid_count0;
      __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
      const __m512i lane2 = broadcast_epi128<2>(utf8);
      int valid_count1;
      __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
      if (valid_count0 + valid_count1 <= 16) {
        vec0 = _mm512_mask_expand_epi32(
            vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
        valid_count0 += valid_count1;
        vec0 = expand_utf8_to_utf32(vec0);
        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
      } else {
        vec0 = expand_utf8_to_utf32(vec0);
        vec1 = expand_utf8_to_utf32(vec1);
        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
      }

      const __m512i lane3 = broadcast_epi128<3>(utf8);
      SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)

      ptr += 3 * 16;
    }
    validatedptr += 4 * 16;
  }
  if (end != validatedptr) {
    const __m512i utf8 =
        _mm512_maskz_loadu_epi8(~UINT64_C(0) >> (64 - (end - validatedptr)),
                                (const __m512i *)validatedptr);
    checker.check_next_input(utf8);
  }
  checker.check_eof();
  if (checker.errors()) {
    return {ptr, output, false}; // We found an error.
  }
  return {ptr, output, true};
}
/* end file src/icelake/icelake_from_utf8.inl.cpp */
#endif // SIMDUTF_FEATURE_UTF8 && (SIMDUTF_FEATURE_UTF16 ||
       // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_LATIN1)

#if SIMDUTF_FEATURE_UTF16
/* begin file src/icelake/icelake_utf16fix.cpp */
/*
 * Process one block of 32 characters. If in_place is false,
 * copy the block from in to out. If there is a sequencing
 * error in the block, overwrite the ill-sequenced characters
 * with the replacement character. This function reads one
 * character before the beginning of the buffer as a lookback.
 * If that character is ill-sequenced, it too is overwritten.
 */
template <endianness big_endian, bool in_place>
simdutf_really_inline void utf16fix_block(char16_t *out, const char16_t *in) {
  const char16_t replacement = scalar::utf16::replacement<big_endian>();
  auto swap_if_needed = [](uint16_t c) -> uint16_t {
    return !simdutf::match_system(big_endian) ? scalar::u16_swap_bytes(c) : c;
  };

  __m512i lookback, block, lb_masked, block_masked;
  __mmask32 lb_is_high, block_is_low, illseq;

  lookback = _mm512_loadu_si512((const __m512i *)(in - 1));
  block = _mm512_loadu_si512((const __m512i *)in);
  lb_masked =
      _mm512_and_epi32(lookback, _mm512_set1_epi16(swap_if_needed(0xfc00U)));
  block_masked =
      _mm512_and_epi32(block, _mm512_set1_epi16(swap_if_needed(0xfc00U)));

  lb_is_high = _mm512_cmpeq_epi16_mask(
      lb_masked, _mm512_set1_epi16(swap_if_needed(0xd800U)));
  block_is_low = _mm512_cmpeq_epi16_mask(
      block_masked, _mm512_set1_epi16(swap_if_needed(0xdc00U)));
  illseq = _kxor_mask32(lb_is_high, block_is_low);
  if (!_ktestz_mask32_u8(illseq, illseq)) {
    __mmask32 lb_illseq, block_illseq;

    /* compute the cause of the illegal sequencing */
    lb_illseq = _kandn_mask32(block_is_low, lb_is_high);
    block_illseq = _kor_mask32(_kandn_mask32(lb_is_high, block_is_low),
                               _kshiftri_mask32(lb_illseq, 1));

    /* fix illegal sequencing in the lookback */
    lb_illseq = _kand_mask32(lb_illseq, _cvtu32_mask32(1));
    _mm512_mask_storeu_epi16(out - 1, lb_illseq,
                             _mm512_set1_epi16(replacement));

    /* fix illegal sequencing in the main block */
    if (in_place) {
      _mm512_mask_storeu_epi16(out, block_illseq,
                               _mm512_set1_epi16(replacement));
    } else {
      _mm512_storeu_epi32(
          out, _mm512_mask_blend_epi16(block_illseq, block,
                                       _mm512_set1_epi16(replacement)));
    }
  } else if (!in_place) {
    _mm512_storeu_si512((__m512i *)out, block);
  }
}
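
// The pairing test above works on whole masks: lb_is_high marks high (lead)
// surrogates among the 32 words just before each position, block_is_low
// marks low (trail) surrogates in the current 32 words, and since the
// lookback is the block shifted by one word, XOR-ing the two masks leaves
// a 1 exactly where a high surrogate is not followed by a low one, or a low
// one is not preceded by a high one. A scalar sketch of the same predicate
// (illustrative only, kept out of the build; the helper name is ours):
#if 0
bool is_illsequenced(uint16_t prev, uint16_t curr) {
  const bool prev_is_high = (prev & 0xfc00) == 0xd800;
  const bool curr_is_low = (curr & 0xfc00) == 0xdc00;
  return prev_is_high != curr_is_low; // high without low, or low without high
}
#endif
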
/*
 * Special case for inputs of 1--32 code units. Works for both in-place and
 * out-of-place operation.
 */
template <endianness big_endian>
void utf16fix_runt(const char16_t *in, size_t n, char16_t *out) {
  const char16_t replacement = scalar::utf16::replacement<big_endian>();
  auto swap_if_needed = [](uint16_t c) -> uint16_t {
    return !simdutf::match_system(big_endian) ? scalar::u16_swap_bytes(c) : c;
  };
  __m512i lookback, block, lb_masked, block_masked;
  __mmask32 lb_is_high, block_is_low, illseq;
  uint32_t mask = 0xFFFFFFFF >> (32 - n);
  lookback = _mm512_maskz_loadu_epi16(_cvtmask32_u32(mask << 1),
                                      (const uint16_t *)(in - 1));
  block = _mm512_maskz_loadu_epi16(_cvtmask32_u32(mask), (const uint16_t *)in);
  lb_masked =
      _mm512_and_epi32(lookback, _mm512_set1_epi16(swap_if_needed(0xfc00u)));
  block_masked =
      _mm512_and_epi32(block, _mm512_set1_epi16(swap_if_needed(0xfc00u)));

  lb_is_high = _mm512_cmpeq_epi16_mask(
      lb_masked, _mm512_set1_epi16(swap_if_needed(0xd800u)));
  block_is_low = _mm512_cmpeq_epi16_mask(
      block_masked, _mm512_set1_epi16(swap_if_needed(0xdc00u)));
  illseq = _kxor_mask32(lb_is_high, block_is_low);
  if (!_ktestz_mask32_u8(illseq, illseq)) {
    __mmask32 lb_illseq, block_illseq;

    /* compute the cause of the illegal sequencing */
    lb_illseq = _kandn_mask32(block_is_low, lb_is_high);
    block_illseq = _kor_mask32(_kandn_mask32(lb_is_high, block_is_low),
                               _kshiftri_mask32(lb_illseq, 1));

    /* fix illegal sequencing in the main block */
    _mm512_mask_storeu_epi16(
        (uint16_t *)out, _cvtmask32_u32(mask),
        _mm512_mask_blend_epi16(block_illseq, block,
                                _mm512_set1_epi16(replacement)));
  } else {
    _mm512_mask_storeu_epi16((uint16_t *)out, _cvtmask32_u32(mask), block);
  }
  out[n - 1] = scalar::utf16::is_high_surrogate<big_endian>(out[n - 1])
                   ? replacement
                   : out[n - 1];
}

template <endianness big_endian>
void utf16fix_avx512(const char16_t *in, size_t n, char16_t *out) {
  const char16_t replacement = scalar::utf16::replacement<big_endian>();
  size_t i;

  if (n == 0)
    return;
  else if (n < 33) {
    utf16fix_runt<big_endian>(in, n, out);
    return;
  }
  out[0] =
      scalar::utf16::is_low_surrogate<big_endian>(in[0]) ? replacement : in[0];

  /* duplicate code to have the compiler specialise utf16fix_block() */
  if (in == out) {
    for (i = 1; i + 32 < n; i += 32) {
      utf16fix_block<big_endian, true>(out + i, in + i);
    }

    utf16fix_block<big_endian, true>(out + n - 32, in + n - 32);
  } else {
    for (i = 1; i + 32 < n; i += 32) {
      utf16fix_block<big_endian, false>(out + i, in + i);
    }

    utf16fix_block<big_endian, false>(out + n - 32, in + n - 32);
  }

  out[n - 1] = scalar::utf16::is_high_surrogate<big_endian>(out[n - 1])
                   ? replacement
                   : out[n - 1];
}
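
// A minimal usage sketch (illustrative only, kept out of the build; the
// wrapper name is ours, not part of the library API):
#if 0
void fix_utf16le_in_place(char16_t *data, size_t n) {
  // Passing the same pointer for in and out selects the in-place
  // specialization of utf16fix_block inside utf16fix_avx512.
  utf16fix_avx512<endianness::LITTLE>(data, n, data);
}
#endif
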
/* end file src/icelake/icelake_utf16fix.cpp */
#endif // SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
/* begin file src/icelake/icelake_convert_utf8_to_latin1.inl.cpp */
// file included directly

// File contains conversion procedure from possibly invalid UTF-8 strings.

template <bool is_remaining>
simdutf_really_inline size_t process_block_from_utf8_to_latin1(
    const char *buf, size_t len, char *latin_output, __m512i minus64,
    __m512i one, __mmask64 *next_leading_ptr, __mmask64 *next_bit6_ptr) {
  __mmask64 load_mask =
      is_remaining ? _bzhi_u64(~0ULL, (unsigned int)len) : ~0ULL;
  __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)buf);
  __mmask64 nonascii = _mm512_movepi8_mask(input);
  if (nonascii == 0) {
    if (*next_leading_ptr) { // If we ended with a leading byte, it is an error.
      return 0;              // Indicates error
    }
    is_remaining
        ? _mm512_mask_storeu_epi8((__m512i *)latin_output, load_mask, input)
        : _mm512_storeu_si512((__m512i *)latin_output, input);
    return len;
  }

  const __mmask64 leading = _mm512_cmpge_epu8_mask(input, minus64);

  __m512i highbits = _mm512_xor_si512(input, _mm512_set1_epi8(-62));
  __mmask64 invalid_leading_bytes =
      _mm512_mask_cmpgt_epu8_mask(leading, highbits, one);

  if (invalid_leading_bytes) {
    return 0; // Indicates error
  }

  __mmask64 leading_shift = (leading << 1) | *next_leading_ptr;

  if ((nonascii ^ leading) != leading_shift) {
    return 0; // Indicates error
  }

  const __mmask64 bit6 = _mm512_cmpeq_epi8_mask(highbits, one);
  input =
      _mm512_mask_sub_epi8(input, (bit6 << 1) | *next_bit6_ptr, input, minus64);

  __mmask64 retain = ~leading & load_mask;
  __m512i output = _mm512_maskz_compress_epi8(retain, input);
  int64_t written_out = count_ones(retain);
  if (written_out == 0) {
    return 0; // Indicates error
  }
  *next_bit6_ptr = bit6 >> 63;
  *next_leading_ptr = leading >> 63;

  __mmask64 store_mask = ~UINT64_C(0) >> (64 - written_out);

  _mm512_mask_storeu_epi8((__m512i *)latin_output, store_mask, output);

  return written_out;
}
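
// Scalar model of the trick above (illustrative only, kept out of the
// build; the helper name is ours). In the Latin-1 subset of UTF-8 the only
// lead bytes are 0xC2 and 0xC3, so XOR-ing with 0xC2 (the -62 constant)
// maps valid lead bytes to 0 or 1, `bit6` marks 0xC3 leads, and adding 64
// (the subtraction of minus64) to the byte that follows a 0xC3 lead turns
// the continuation byte directly into the Latin-1 code point; the lead
// bytes are then compressed away.
#if 0
uint8_t latin1_from_two_bytes(uint8_t lead, uint8_t cont) {
  // lead is 0xC2 or 0xC3, cont is in [0x80, 0xBF]
  return (lead == 0xC3) ? uint8_t(cont + 64) : cont;
}
#endif
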
size_t utf8_to_latin1_avx512(const char *&inbuf, size_t len,
                             char *&inlatin_output) {
  const char *buf = inbuf;
  char *latin_output = inlatin_output;
  char *start = latin_output;
  size_t pos = 0;
  __m512i minus64 = _mm512_set1_epi8(-64); // 11111111111 ... 1100 0000
  __m512i one = _mm512_set1_epi8(1);
  __mmask64 next_leading = 0;
  __mmask64 next_bit6 = 0;

  while (pos + 64 <= len) {
    size_t written = process_block_from_utf8_to_latin1<false>(
        buf + pos, 64, latin_output, minus64, one, &next_leading, &next_bit6);
    if (written == 0) {
      inlatin_output = latin_output;
      inbuf = buf + pos - next_leading;
      return 0; // Indicates error at pos or after, or just before pos (too
                // short error)
    }
    latin_output += written;
    pos += 64;
  }

  if (pos < len) {
    size_t remaining = len - pos;
    size_t written = process_block_from_utf8_to_latin1<true>(
        buf + pos, remaining, latin_output, minus64, one, &next_leading,
        &next_bit6);
    if (written == 0) {
      inbuf = buf + pos - next_leading;
      inlatin_output = latin_output;
      return 0; // Indicates error at pos or after, or just before pos (too
                // short error)
    }
    latin_output += written;
  }
  if (next_leading) {
    inbuf = buf + len - next_leading;
    inlatin_output = latin_output;
    return 0; // Indicates error at end of buffer
  }
  inlatin_output = latin_output;
  inbuf += len;
  return size_t(latin_output - start);
}
/* end file src/icelake/icelake_convert_utf8_to_latin1.inl.cpp */
/* begin file src/icelake/icelake_convert_valid_utf8_to_latin1.inl.cpp */
// file included directly

// File contains conversion procedure from valid UTF-8 strings.

template <bool is_remaining>
simdutf_really_inline size_t process_valid_block_from_utf8_to_latin1(
    const char *buf, size_t len, char *latin_output, __m512i minus64,
    __m512i one, __mmask64 *next_leading_ptr, __mmask64 *next_bit6_ptr) {
  __mmask64 load_mask =
      is_remaining ? _bzhi_u64(~0ULL, (unsigned int)len) : ~0ULL;
  __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)buf);
  __mmask64 nonascii = _mm512_movepi8_mask(input);

  if (nonascii == 0) {
    is_remaining
        ? _mm512_mask_storeu_epi8((__m512i *)latin_output, load_mask, input)
        : _mm512_storeu_si512((__m512i *)latin_output, input);
    return len;
  }

  __mmask64 leading = _mm512_cmpge_epu8_mask(input, minus64);

  __m512i highbits = _mm512_xor_si512(input, _mm512_set1_epi8(-62));

  *next_leading_ptr = leading >> 63;

  __mmask64 bit6 = _mm512_cmpeq_epi8_mask(highbits, one);
  input =
      _mm512_mask_sub_epi8(input, (bit6 << 1) | *next_bit6_ptr, input, minus64);
  *next_bit6_ptr = bit6 >> 63;

  __mmask64 retain = ~leading & load_mask;
  __m512i output = _mm512_maskz_compress_epi8(retain, input);
  int64_t written_out = count_ones(retain);
  if (written_out == 0) {
    return 0; // Indicates error
  }
  __mmask64 store_mask = ~UINT64_C(0) >> (64 - written_out);
  // Optimization opportunity: sometimes, masked writes are not needed.
  _mm512_mask_storeu_epi8((__m512i *)latin_output, store_mask, output);
  return written_out;
}

size_t valid_utf8_to_latin1_avx512(const char *buf, size_t len,
                                   char *latin_output) {
  char *start = latin_output;
  size_t pos = 0;
  __m512i minus64 = _mm512_set1_epi8(-64); // 11111111111 ... 1100 0000
  __m512i one = _mm512_set1_epi8(1);
  __mmask64 next_leading = 0;
  __mmask64 next_bit6 = 0;

  while (pos + 64 <= len) {
    size_t written = process_valid_block_from_utf8_to_latin1<false>(
        buf + pos, 64, latin_output, minus64, one, &next_leading, &next_bit6);
    latin_output += written;
    pos += 64;
  }

  if (pos < len) {
    size_t remaining = len - pos;
    size_t written = process_valid_block_from_utf8_to_latin1<true>(
        buf + pos, remaining, latin_output, minus64, one, &next_leading,
        &next_bit6);
    latin_output += written;
  }

  return (size_t)(latin_output - start);
}
/* end file src/icelake/icelake_convert_valid_utf8_to_latin1.inl.cpp */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF16
/* begin file src/icelake/icelake_convert_utf16_to_latin1.inl.cpp */
// file included directly
template <endianness big_endian>
size_t icelake_convert_utf16_to_latin1(const char16_t *buf, size_t len,
                                       char *latin1_output) {
  const char16_t *end = buf + len;
  __m512i v_0xFF = _mm512_set1_epi16(0xff);
  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
  __m512i shufmask = _mm512_set_epi8(
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38,
      36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
  while (end - buf >= 32) {
    __m512i in = _mm512_loadu_si512((__m512i *)buf);
    if (big_endian) {
      in = _mm512_shuffle_epi8(in, byteflip);
    }
    if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) {
      return 0;
    }
    _mm256_storeu_si256(
        (__m256i *)latin1_output,
        _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in)));
    latin1_output += 32;
    buf += 32;
  }
  if (buf < end) {
    uint32_t mask(uint32_t(1 << (end - buf)) - 1);
    __m512i in = _mm512_maskz_loadu_epi16(mask, buf);
    if (big_endian) {
      in = _mm512_shuffle_epi8(in, byteflip);
    }
    if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) {
      return 0;
    }
    _mm256_mask_storeu_epi8(
        latin1_output, mask,
        _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in)));
  }
  return len;
}
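
// The shufmask above gathers byte indices 0, 2, 4, ..., 62 -- the low
// (little-endian) byte of each of the 32 words -- into the lower 256 bits,
// which is what makes the single 256-bit narrowing store work. A scalar
// model (illustrative only, kept out of the build; the helper name is ours):
#if 0
void narrow_utf16le_to_latin1(const uint8_t in[64], uint8_t out[32]) {
  for (int i = 0; i < 32; i++) {
    out[i] = in[2 * i]; // keep the low byte of each 16-bit code unit
  }
}
#endif
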
template <endianness big_endian>
std::pair<result, char *>
icelake_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
                                            char *latin1_output) {
  const char16_t *end = buf + len;
  const char16_t *start = buf;
  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
  __m512i v_0xFF = _mm512_set1_epi16(0xff);
  __m512i shufmask = _mm512_set_epi8(
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38,
      36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
  while (end - buf >= 32) {
    __m512i in = _mm512_loadu_si512((__m512i *)buf);
    if (big_endian) {
      in = _mm512_shuffle_epi8(in, byteflip);
    }
    if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) {
      uint16_t word;
      while ((word = (big_endian ? scalar::u16_swap_bytes(uint16_t(*buf))
                                 : uint16_t(*buf))) <= 0xff) {
        *latin1_output++ = uint8_t(word);
        buf++;
      }
      return std::make_pair(result(error_code::TOO_LARGE, buf - start),
                            latin1_output);
    }
    _mm256_storeu_si256(
        (__m256i *)latin1_output,
        _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in)));
    latin1_output += 32;
    buf += 32;
  }
  if (buf < end) {
    uint32_t mask(uint32_t(1 << (end - buf)) - 1);
    __m512i in = _mm512_maskz_loadu_epi16(mask, buf);
    if (big_endian) {
      in = _mm512_shuffle_epi8(in, byteflip);
    }
    if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) {
      uint16_t word;
      while ((word = (big_endian ? scalar::u16_swap_bytes(uint16_t(*buf))
                                 : uint16_t(*buf))) <= 0xff) {
        *latin1_output++ = uint8_t(word);
        buf++;
      }
      return std::make_pair(result(error_code::TOO_LARGE, buf - start),
                            latin1_output);
    }
    _mm256_mask_storeu_epi8(
        latin1_output, mask,
        _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in)));
  }
  return std::make_pair(result(error_code::SUCCESS, len), latin1_output);
}
/* end file src/icelake/icelake_convert_utf16_to_latin1.inl.cpp */
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
/* begin file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */
// file included directly

/**
 * This function converts the input (inbuf, inlen), assumed to be valid
 * UTF-16, into UTF-8 (to outbuf). The number of code units written is
 * stored in 'outlen' and the function returns the number of input words
 * consumed.
 */
template <endianness big_endian>
size_t utf16_to_utf8_avx512i(const char16_t *inbuf, size_t inlen,
                             unsigned char *outbuf, size_t *outlen) {
  __m512i in;
  __mmask32 inmask = _cvtu32_mask32(0x7fffffff);
  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
  const char16_t *const inbuf_orig = inbuf;
  const unsigned char *const outbuf_orig = outbuf;
  int adjust = 0;
  int carry = 0;

  while (inlen >= 32) {
    in = _mm512_loadu_si512(inbuf);
    if (big_endian) {
      in = _mm512_shuffle_epi8(in, byteflip);
    }
    inlen -= 31;
  lastiteration:
    inbuf += 31;

  failiteration:
    const __mmask32 is234byte = _mm512_mask_cmp_epu16_mask(
        inmask, in, _mm512_set1_epi16(0x0080), _MM_CMPINT_NLT);

    if (_ktestz_mask32_u8(inmask, is234byte)) {
      // fast path for ASCII only
      _mm512_mask_cvtepi16_storeu_epi8(outbuf, inmask, in);
      outbuf += 31;
      carry = 0;

      if (inlen < 32) {
        goto tail;
      } else {
        continue;
      }
    }

    const __mmask32 is12byte =
        _mm512_cmp_epu16_mask(in, _mm512_set1_epi16(0x0800), _MM_CMPINT_LT);

    if (_ktestc_mask32_u8(is12byte, inmask)) {
      // fast path for 1 and 2 byte only

      const __m512i twobytes = _mm512_ternarylogic_epi32(
          _mm512_slli_epi16(in, 8), _mm512_srli_epi16(in, 6),
          _mm512_set1_epi16(0x3f3f), 0xa8); // (A|B)&C
      in = _mm512_mask_add_epi16(in, is234byte, twobytes,
                                 _mm512_set1_epi16(int16_t(0x80c0)));
      const __m512i cmpmask =
          _mm512_mask_blend_epi16(inmask, _mm512_set1_epi16(int16_t(0xffff)),
                                  _mm512_set1_epi16(0x0800));
      const __mmask64 smoosh =
          _mm512_cmp_epu8_mask(in, cmpmask, _MM_CMPINT_NLT);
      const __m512i out = _mm512_maskz_compress_epi8(smoosh, in);
      _mm512_mask_storeu_epi8(outbuf,
                              _cvtu64_mask64(_pext_u64(_cvtmask64_u64(smoosh),
                                                       _cvtmask64_u64(smoosh))),
                              out);
      outbuf += 31 + _mm_popcnt_u32(_cvtmask32_u32(is234byte));
      carry = 0;

      if (inlen < 32) {
        goto tail;
      } else {
        continue;
      }
    }
    __m512i lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in));
    __m512i hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1));

    __m512i taglo = _mm512_set1_epi32(0x8080e000);
    __m512i taghi = taglo;

    const __m512i fc00masked =
        _mm512_and_epi32(in, _mm512_set1_epi16(int16_t(0xfc00)));
    const __mmask32 hisurr = _mm512_mask_cmp_epu16_mask(
        inmask, fc00masked, _mm512_set1_epi16(int16_t(0xd800)), _MM_CMPINT_EQ);
    const __mmask32 losurr = _mm512_cmp_epu16_mask(
        fc00masked, _mm512_set1_epi16(int16_t(0xdc00)), _MM_CMPINT_EQ);

    int carryout = 0;
    if (!_kortestz_mask32_u8(hisurr, losurr)) {
      // handle surrogates

      __m512i los = _mm512_alignr_epi32(hi, lo, 1);
      __m512i his = _mm512_alignr_epi32(lo, hi, 1);

      const __mmask32 hisurrhi = _kshiftri_mask32(hisurr, 16);
      taglo = _mm512_mask_mov_epi32(taglo, __mmask16(hisurr),
                                    _mm512_set1_epi32(0x808080f0));
      taghi = _mm512_mask_mov_epi32(taghi, __mmask16(hisurrhi),
                                    _mm512_set1_epi32(0x808080f0));

      lo = _mm512_mask_slli_epi32(lo, __mmask16(hisurr), lo, 10);
      hi = _mm512_mask_slli_epi32(hi, __mmask16(hisurrhi), hi, 10);
      los = _mm512_add_epi32(los, _mm512_set1_epi32(0xfca02400));
      his = _mm512_add_epi32(his, _mm512_set1_epi32(0xfca02400));
      lo = _mm512_mask_add_epi32(lo, __mmask16(hisurr), lo, los);
      hi = _mm512_mask_add_epi32(hi, __mmask16(hisurrhi), hi, his);

      carryout = _cvtu32_mask32(_kshiftri_mask32(hisurr, 30));

      const uint32_t h = _cvtmask32_u32(hisurr);
      const uint32_t l = _cvtmask32_u32(losurr);
      // check for mismatched surrogates
      if ((h + h + carry) ^ l) {
        const uint32_t lonohi = l & ~(h + h + carry);
        const uint32_t hinolo = h & ~(l >> 1);
        inlen = _tzcnt_u32(hinolo | lonohi);
        inmask = __mmask32(0x7fffffff & ((1U << inlen) - 1));
        in = _mm512_maskz_mov_epi16(inmask, in);
        adjust = (int)inlen - 31;
        inlen = 0;
        goto failiteration;
      }
    }

    hi = _mm512_maskz_mov_epi32(_cvtu32_mask16(0x7fff), hi);
    carry = carryout;

    __m512i mslo =
        _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), lo);

    __m512i mshi =
        _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), hi);

    const __mmask32 outmask = __mmask32(_kandn_mask64(losurr, inmask));
    const __mmask64 outmhi = _kshiftri_mask64(outmask, 16);

    const __mmask32 is1byte = __mmask32(_knot_mask64(is234byte));
    const __mmask64 is1bhi = _kshiftri_mask64(is1byte, 16);
    const __mmask64 is12bhi = _kshiftri_mask64(is12byte, 16);

    taglo = _mm512_mask_mov_epi32(taglo, __mmask16(is12byte),
                                  _mm512_set1_epi32(0x80c00000));
    taghi = _mm512_mask_mov_epi32(taghi, __mmask16(is12bhi),
                                  _mm512_set1_epi32(0x80c00000));
    __m512i magiclo = _mm512_mask_blend_epi32(__mmask16(outmask),
                                              _mm512_set1_epi32(0xffffffff),
                                              _mm512_set1_epi32(0x00010101));
    __m512i magichi = _mm512_mask_blend_epi32(__mmask16(outmhi),
                                              _mm512_set1_epi32(0xffffffff),
                                              _mm512_set1_epi32(0x00010101));

    magiclo = _mm512_mask_blend_epi32(__mmask16(outmask),
                                      _mm512_set1_epi32(0xffffffff),
                                      _mm512_set1_epi32(0x00010101));
    magichi = _mm512_mask_blend_epi32(__mmask16(outmhi),
                                      _mm512_set1_epi32(0xffffffff),
                                      _mm512_set1_epi32(0x00010101));

    mslo = _mm512_ternarylogic_epi32(mslo, _mm512_set1_epi32(0x3f3f3f3f), taglo,
                                     0xea); // A&B|C
    mshi = _mm512_ternarylogic_epi32(mshi, _mm512_set1_epi32(0x3f3f3f3f), taghi,
                                     0xea);
    mslo = _mm512_mask_slli_epi32(mslo, __mmask16(is1byte), lo, 24);

    mshi = _mm512_mask_slli_epi32(mshi, __mmask16(is1bhi), hi, 24);

    const __mmask64 wantlo =
        _mm512_cmp_epu8_mask(mslo, magiclo, _MM_CMPINT_NLT);
    const __mmask64 wanthi =
        _mm512_cmp_epu8_mask(mshi, magichi, _MM_CMPINT_NLT);
    const __m512i outlo = _mm512_maskz_compress_epi8(wantlo, mslo);
    const __m512i outhi = _mm512_maskz_compress_epi8(wanthi, mshi);
    const uint64_t wantlo_uint64 = _cvtmask64_u64(wantlo);
    const uint64_t wanthi_uint64 = _cvtmask64_u64(wanthi);

    uint64_t advlo = _mm_popcnt_u64(wantlo_uint64);
    uint64_t advhi = _mm_popcnt_u64(wanthi_uint64);

    _mm512_mask_storeu_epi8(
        outbuf, _cvtu64_mask64(_pext_u64(wantlo_uint64, wantlo_uint64)), outlo);
    _mm512_mask_storeu_epi8(
        outbuf + advlo, _cvtu64_mask64(_pext_u64(wanthi_uint64, wanthi_uint64)),
        outhi);
    outbuf += advlo + advhi;
  }
  outbuf += -adjust;

tail:
  if (inlen != 0) {
    // We must have inlen <= 31.
    inmask = _cvtu32_mask32((1U << inlen) - 1);
    in = _mm512_maskz_loadu_epi16(inmask, inbuf);
    if (big_endian) {
      in = _mm512_shuffle_epi8(in, byteflip);
    }
    adjust = (int)inlen - 31;
    inlen = 0;
    goto lastiteration;
  }
  *outlen = (outbuf - outbuf_orig) + adjust;
  return ((inbuf - inbuf_orig) + adjust);
}
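
// `_pext_u64(w, w)` used above compacts the mask w onto itself: it always
// yields popcount(w) ones in the low bits, which is exactly the store mask
// needed once _mm512_maskz_compress_epi8 has packed the wanted bytes to the
// front of the register. A sketch of the identity (illustrative only, kept
// out of the build; the helper name is ours):
#if 0
uint64_t contiguous_store_mask(uint64_t want) {
  return _pext_u64(want, want); // e.g. 0b101001 -> 0b000111
}
#endif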
/* end file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */
/* begin file src/icelake/icelake_convert_utf8_to_utf16.inl.cpp */
// file included directly

// File contains conversion procedure from possibly invalid UTF-8 strings.

/**
 * Attempts to convert up to len bytes from in (in UTF-8 format) to out.
 * Returns the position of the input and output after the processing is
 * completed. Upon error, the output is set to null.
 */

template <endianness big_endian>
utf8_to_utf16_result
fast_avx512_convert_utf8_to_utf16(const char *in, size_t len, char16_t *out) {
  const char *const final_in = in + len;
  bool result = true;
  while (result) {
    if (final_in - in >= 64) {
      result = process_block_utf8_to_utf16<SIMDUTF_FULL, big_endian>(
          in, out, final_in - in);
    } else if (in < final_in) {
      result = process_block_utf8_to_utf16<SIMDUTF_TAIL, big_endian>(
          in, out, final_in - in);
    } else {
      break;
    }
  }
  if (!result) {
    out = nullptr;
  }
  return std::make_pair(in, out);
}

template <endianness big_endian>
simdutf::result fast_avx512_convert_utf8_to_utf16_with_errors(const char *in,
                                                              size_t len,
                                                              char16_t *out) {
  const char *const init_in = in;
  const char16_t *const init_out = out;
  const char *const final_in = in + len;
  bool result = true;
  while (result) {
    if (final_in - in >= 64) {
      result = process_block_utf8_to_utf16<SIMDUTF_FULL, big_endian>(
          in, out, final_in - in);
    } else if (in < final_in) {
      result = process_block_utf8_to_utf16<SIMDUTF_TAIL, big_endian>(
          in, out, final_in - in);
    } else {
      break;
    }
  }
  if (!result) {
    size_t pos = size_t(in - init_in);
    if (pos < len && (init_in[pos] & 0xc0) == 0x80 && pos >= 64) {
      // We must check whether this is the fourth consecutive continuation byte
      bool c1 = (init_in[pos - 1] & 0xc0) == 0x80;
      bool c2 = (init_in[pos - 2] & 0xc0) == 0x80;
      bool c3 = (init_in[pos - 3] & 0xc0) == 0x80;
      if (c1 && c2 && c3) {
        return {simdutf::TOO_LONG, pos};
      }
    }
    // rewind_and_convert_with_errors will seek a potential error from in
    // onward, with the ability to go back up to in - init_in bytes, and read
    // final_in - in bytes forward.
    simdutf::result res =
        scalar::utf8_to_utf16::rewind_and_convert_with_errors<big_endian>(
            in - init_in, in, final_in - in, out);
    res.count += (in - init_in);
    return res;
  } else {
    return simdutf::result(error_code::SUCCESS, out - init_out);
  }
}
/* end file src/icelake/icelake_convert_utf8_to_utf16.inl.cpp */
/* begin file src/icelake/icelake_utf8_length_from_utf16.inl.cpp */
// This is a translation of `utf8_length_from_utf16_bytemask` from
// `generic/utf16.h`
template <endianness big_endian>
simdutf_really_inline size_t icelake_utf8_length_from_utf16(const char16_t *in,
                                                            size_t size) {
  size_t pos = 0;

  using vector_u16 = simd16<uint16_t>;
  constexpr size_t N = vector_u16::ELEMENTS;

  const auto one = vector_u16::splat(1);

  auto v_count = vector_u16::zero();

  // each char16 yields at least one byte
  size_t count = size / N * N;

  // in a single iteration the increment is 0, 1 or 2, even though we
  // perform three additions
  constexpr size_t max_iterations = 65535 / 2;
  size_t iteration = max_iterations;

  for (; pos < size / N * N; pos += N) {
    auto input = vector_u16::load(reinterpret_cast<const uint16_t *>(in + pos));
    if (!match_system(big_endian)) {
      input = input.swap_bytes();
    }

    // not_surrogate[i] = non-zero if the i-th element is not a surrogate word
    const auto not_surrogate = (input & uint16_t(0xf800)) ^ uint16_t(0xd800);

    // is_surrogate[i] = 1 if the i-th element is a surrogate word, 0 otherwise
    const auto is_surrogate = min(not_surrogate, one) ^ one;

    // c0 - chars that yield 2- or 3-byte UTF-8 codes
    const auto c0 = min(input & uint16_t(0xff80), one);

    // c1 - chars that yield 3-byte UTF-8 codes (including surrogates)
    const auto c1 = min(input & uint16_t(0xf800), one);

    /*
        Explanation of how the counting works.

        In the case of a non-surrogate character, we count:
        * always 1 -- see how `count` is initialized above;
        * c0 = 1 if the current char yields 2 or 3 bytes;
        * c1 = 1 if the current char yields 3 bytes.

        Thus, we always have the correct count for the current char:
        1, 2 or 3 bytes.

        A trickier part is how we count surrogate pairs. Whenever we
        encounter a surrogate (low or high), we count it as 3 bytes and then
        subtract 1 (`is_surrogate` is 1 for surrogate words, 0 otherwise).
        Each surrogate char thus yields 2, so a surrogate pair, that is a
        high surrogate followed by a low one, yields the expected 4 bytes.

        This also correctly handles cases where the high surrogate is
        processed by this loop, but the low surrogate is counted by the
        scalar procedure. The scalar procedure uses exactly the same
        approach, so for valid UTF-16 strings the total is always correct.
    */
    v_count += c0;
    v_count += c1;
    v_count -= is_surrogate;

    iteration -= 1;
    if (iteration == 0) {
      count += v_count.sum();
      v_count = vector_u16::zero();

      iteration = max_iterations;
    }
  }

  if (iteration > 0) {
    count += v_count.sum();
  }

  return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos,
                                                                   size - pos);
}
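
// A scalar model of the per-word byte counting above (illustrative only,
// kept out of the build; the helper name is ours): every word contributes
// 1 + c0 + c1 - is_surrogate bytes.
#if 0
size_t utf8_bytes_for_word(uint16_t w) {
  const int c0 = (w & 0xff80) ? 1 : 0; // needs 2+ bytes
  const int c1 = (w & 0xf800) ? 1 : 0; // needs 3 bytes (or is a surrogate)
  const int is_surrogate = ((w & 0xf800) == 0xd800) ? 1 : 0;
  return 1 + c0 + c1 - is_surrogate;   // each surrogate word yields 2
}
#endif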
/* end file src/icelake/icelake_utf8_length_from_utf16.inl.cpp */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
/* begin file src/icelake/icelake_convert_utf16_to_utf32.inl.cpp */
// file included directly

/*
  Returns a tuple: the first unprocessed code unit from buf, the current
  position in utf32_output, and a flag indicating whether the input was
  valid. A scalar routine should carry on the conversion of the tail.
*/
template <endianness big_endian>
std::tuple<const char16_t *, char32_t *, bool>
convert_utf16_to_utf32(const char16_t *buf, size_t len,
                       char32_t *utf32_output) {
  const char16_t *end = buf + len;
  const __m512i v_fc00 = _mm512_set1_epi16((uint16_t)0xfc00);
  const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
  const __m512i v_dc00 = _mm512_set1_epi16((uint16_t)0xdc00);
  __mmask32 carry{0};
  const __m512i byteflip = _mm512_setr_epi64(
      0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001,
      0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809,
      0x0607040502030001, 0x0e0f0c0d0a0b0809);
  while (std::distance(buf, end) >= 32) {
    // Always safe because buf + 32 <= end, so at least 32 code units remain:
    __m512i in = _mm512_loadu_si512((__m512i *)buf);
    if (big_endian) {
      in = _mm512_shuffle_epi8(in, byteflip);
    }

    // H - bitmask for high surrogates
    const __mmask32 H =
        _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_d800);
    // L - bitmask for low surrogates
    const __mmask32 L =
        _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_dc00);

    if ((H | L)) {
      // surrogate pair(s) in a register
      const __mmask32 V =
          (L ^
           (carry | (H << 1))); // A high surrogate must be followed by low one
                                // and a low one must be preceded by a high one.
                                // If valid, V should be equal to 0

      if (V == 0) {
        // valid case
        /*
            Input surrogate pair:
            |1101.11aa.aaaa.aaaa|1101.10bb.bbbb.bbbb|
                low surrogate       high surrogate
        */
        /*  1. Expand all code units to 32-bit code units
            in
            |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb|
        */
        const __m512i first = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in));
        const __m512i second =
            _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1));

        /*  2. Shift by one 16-bit word to align low surrogates with high
            surrogates in
            |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb|
            shifted
            |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa|
        */
        const __m512i shifted_first = _mm512_alignr_epi32(second, first, 1);
        const __m512i shifted_second =
            _mm512_alignr_epi32(_mm512_setzero_si512(), second, 1);

        /*  3. Align all high surrogates in first and second by shifting to the
            left by 10 bits
            |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000|
        */
        const __m512i aligned_first =
            _mm512_mask_slli_epi32(first, (__mmask16)H, first, 10);
        const __m512i aligned_second =
            _mm512_mask_slli_epi32(second, (__mmask16)(H >> 16), second, 10);

        /*  4. Remove surrogate prefixes and add offset 0x10000 by adding in,
            shifted and constant in
            |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000|
            shifted
            |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa|
            constant|1111.1100.1010.0000.0010.0100.0000.0000|1111.1100.1010.0000.0010.0100.0000.0000|
        */
        const __m512i constant = _mm512_set1_epi32((uint32_t)0xfca02400);
        const __m512i added_first = _mm512_mask_add_epi32(
            aligned_first, (__mmask16)H, aligned_first, shifted_first);
        const __m512i utf32_first = _mm512_mask_add_epi32(
            added_first, (__mmask16)H, added_first, constant);

        const __m512i added_second =
            _mm512_mask_add_epi32(aligned_second, (__mmask16)(H >> 16),
                                  aligned_second, shifted_second);
        const __m512i utf32_second = _mm512_mask_add_epi32(
            added_second, (__mmask16)(H >> 16), added_second, constant);

        // 5. Store all valid UTF-32 code units (low surrogate positions and
        // 32nd word are invalid)
        const __mmask32 valid = ~L & 0x7fffffff;
        // We deliberately do a _mm512_maskz_compress_epi32 followed by
        // storeu_epi32 to ease performance portability to Zen 4.
        const __m512i compressed_first =
            _mm512_maskz_compress_epi32((__mmask16)(valid), utf32_first);
        const size_t howmany1 = count_ones((uint16_t)(valid));
        _mm512_storeu_si512((__m512i *)utf32_output, compressed_first);
        utf32_output += howmany1;
        const __m512i compressed_second =
            _mm512_maskz_compress_epi32((__mmask16)(valid >> 16), utf32_second);
        const size_t howmany2 = count_ones((uint16_t)(valid >> 16));
        // The following could be unsafe in some cases?
        //_mm512_storeu_epi32((__m512i *) utf32_output, compressed_second);
        _mm512_mask_storeu_epi32((__m512i *)utf32_output,
                                 __mmask16((1 << howmany2) - 1),
                                 compressed_second);
        utf32_output += howmany2;
        // Only process 31 code units, but keep track of whether the 31st word
        // is a high surrogate, as a carry
        buf += 31;
        carry = (H >> 30) & 0x1;
      } else {
        // invalid case
        return std::make_tuple(buf + carry, utf32_output, false);
      }
    } else {
      // no surrogates
      // extend all thirty-two 16-bit code units to thirty-two 32-bit code units
      _mm512_storeu_si512((__m512i *)(utf32_output),
                          _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in)));
      _mm512_storeu_si512(
          (__m512i *)(utf32_output) + 1,
          _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1)));
      utf32_output += 32;
      buf += 32;
      carry = 0;
    }
  } // while
  return std::make_tuple(buf + carry, utf32_output, true);
}
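
// Scalar model of steps 1-4 above (illustrative only, kept out of the
// build; the helper name is ours). The magic constant 0xfca02400 cancels
// both surrogate tags (0xd800 shifted left by 10, and 0xdc00) and adds the
// 0x10000 offset in a single wrapping 32-bit addition:
#if 0
uint32_t decode_surrogate_pair(uint16_t high, uint16_t low) {
  // high in [0xd800, 0xdbff], low in [0xdc00, 0xdfff]
  return (uint32_t(high) << 10) + low + 0xfca02400u;
  // == 0x10000 + ((high - 0xd800) << 10) + (low - 0xdc00)
}
#endif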
/* end file src/icelake/icelake_convert_utf16_to_utf32.inl.cpp */
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF32
/* begin file src/icelake/icelake_convert_utf32_to_latin1.inl.cpp */
// file included directly
size_t icelake_convert_utf32_to_latin1(const char32_t *buf, size_t len,
                                       char *latin1_output) {
  const char32_t *end = buf + len;
  __m512i v_0xFF = _mm512_set1_epi32(0xff);
  __m512i shufmask = _mm512_set_epi8(
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60,
      56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0);
  while (end - buf >= 16) {
    __m512i in = _mm512_loadu_si512((__m512i *)buf);
    if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) {
      return 0;
    }
    _mm_storeu_si128(
        (__m128i *)latin1_output,
        _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in)));
    latin1_output += 16;
    buf += 16;
  }
  if (buf < end) {
    uint16_t mask = uint16_t((1 << (end - buf)) - 1);
    __m512i in = _mm512_maskz_loadu_epi32(mask, buf);
    if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) {
      return 0;
    }
    _mm_mask_storeu_epi8(
        latin1_output, mask,
        _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in)));
  }
  return len;
}

std::pair<result, char *>
icelake_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
                                            char *latin1_output) {
  const char32_t *end = buf + len;
  const char32_t *start = buf;
  __m512i v_0xFF = _mm512_set1_epi32(0xff);
  __m512i shufmask = _mm512_set_epi8(
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60,
      56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0);
  while (end - buf >= 16) {
    __m512i in = _mm512_loadu_si512((__m512i *)buf);
    if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) {
      while (uint32_t(*buf) <= 0xff) {
        *latin1_output++ = uint8_t(*buf++);
      }
      return std::make_pair(result(error_code::TOO_LARGE, buf - start),
                            latin1_output);
    }
    _mm_storeu_si128(
        (__m128i *)latin1_output,
        _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in)));
    latin1_output += 16;
    buf += 16;
  }
  if (buf < end) {
    uint16_t mask = uint16_t((1 << (end - buf)) - 1);
    __m512i in = _mm512_maskz_loadu_epi32(mask, buf);
    if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) {
      while (uint32_t(*buf) <= 0xff) {
        *latin1_output++ = uint8_t(*buf++);
      }
      return std::make_pair(result(error_code::TOO_LARGE, buf - start),
                            latin1_output);
    }
    _mm_mask_storeu_epi8(
        latin1_output, mask,
        _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in)));
  }
  return std::make_pair(result(error_code::SUCCESS, len), latin1_output);
}
/* end file src/icelake/icelake_convert_utf32_to_latin1.inl.cpp */
#endif // SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
|
|
/* begin file src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */
|
|
// file included directly
|
|
|
|
// Todo: currently, this is just the haswell code, optimize for icelake kernel.
|
|
std::pair<const char32_t *, char *>
|
|
avx512_convert_utf32_to_utf8(const char32_t *buf, size_t len,
|
|
char *utf8_output) {
|
|
const char32_t *end = buf + len;
|
|
const __m256i v_0000 = _mm256_setzero_si256();
|
|
const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
|
|
const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
|
|
const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
|
|
const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
|
|
const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
|
|
__m256i running_max = _mm256_setzero_si256();
|
|
__m256i forbidden_bytemask = _mm256_setzero_si256();
|
|
|
|
const size_t safety_margin =
|
|
12; // to avoid overruns, see issue
|
|
// https://github.com/simdutf/simdutf/issues/92
|
|
|
|
while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
|
|
__m256i in = _mm256_loadu_si256((__m256i *)buf);
|
|
__m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1);
|
|
running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);
|
|
|
|
// Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
|
|
// saturation
|
|
__m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff),
|
|
_mm256_and_si256(nextin, v_7fffffff));
|
|
in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);

    // Try to apply UTF-16 => UTF-8 routine on 256 bits
    // (haswell/avx2_convert_utf16_to_utf8.cpp)

    if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
      // 1. pack the bytes
      const __m128i utf8_packed = _mm_packus_epi16(
          _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
      // 2. store (16 bytes)
      _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
      // 3. adjust pointers
      buf += 16;
      utf8_output += 16;
      continue; // we are done for this round!
    }
    // no bits set above 7th bit
    const __m256i one_byte_bytemask =
        _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
    const uint32_t one_byte_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));

    // no bits set above 11th bit
    const __m256i one_or_two_bytes_bytemask =
        _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
    const uint32_t one_or_two_bytes_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
    if (one_or_two_bytes_bitmask == 0xffffffff) {
      // 1. prepare 2-byte values
      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
      // expected output   : [110a|aaaa|10bb|bbbb] x 8
      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);

      // t0 = [000a|aaaa|bbbb|bb00]
      const __m256i t0 = _mm256_slli_epi16(in_16, 2);
      // t1 = [000a|aaaa|0000|0000]
      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
      // t2 = [0000|0000|00bb|bbbb]
      const __m256i t2 = _mm256_and_si256(in_16, v_003f);
      // t3 = [000a|aaaa|00bb|bbbb]
      const __m256i t3 = _mm256_or_si256(t1, t2);
      // t4 = [110a|aaaa|10bb|bbbb]
      const __m256i t4 = _mm256_or_si256(t3, v_c080);

      // 2. merge ASCII and 2-byte codewords
      const __m256i utf8_unpacked =
          _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);

      // 3. prepare bitmask for 8-bit lookup
      const uint32_t M0 = one_byte_bitmask & 0x55555555;
      const uint32_t M1 = M0 >> 7;
      const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
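      // Note: one_byte_bitmask devotes two (identical) bits to each 16-bit
      // unit. M0 keeps one flag per unit (even bit positions), and M0 >> 7
      // slides the flags of units 4..7 into the odd positions, so uint8_t(M2)
      // and uint8_t(M2 >> 16) each pack the flags of eight consecutive units
      // into a single table index.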
      // 4. pack the bytes

      const uint8_t *row =
          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
      const uint8_t *row_2 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
                                                                       16)][0];

      const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
      const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));

      const __m256i utf8_packed = _mm256_shuffle_epi8(
          utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
      // 5. store bytes
      _mm_storeu_si128((__m128i *)utf8_output,
                       _mm256_castsi256_si128(utf8_packed));
      utf8_output += row[0];
      _mm_storeu_si128((__m128i *)utf8_output,
                       _mm256_extractf128_si256(utf8_packed, 1));
      utf8_output += row_2[0];

      // 6. adjust pointers
      buf += 16;
      continue;
    }
    // Must check for overflow in packing
    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(
        _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
    const uint32_t saturation_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
    if (saturation_bitmask == 0xffffffff) {
      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
      const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
      forbidden_bytemask = _mm256_or_si256(
          forbidden_bytemask,
          _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800));

      const __m256i dup_even = _mm256_setr_epi16(
          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);

      /* In this branch we handle three cases:
         1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
            single UTF-8 byte
         2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two
            UTF-8 bytes
         3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
            three UTF-8 bytes

        We expand the input word (16-bit) into two code units (32-bit), thus
        we have room for four bytes. However, we need five distinct bit
        layouts. Note that the last byte in cases #2 and #3 is the same.

        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
        in register t2.

        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
        either byte 1 for case #2 or byte 2 for case #3. Note that they
        differ by exactly one bit.

        Finally, from these two code units we build a proper UTF-8 sequence,
        taking into account the case (i.e., the number of bytes to write).
      */
      /**
       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
       * t2 => [0ccc|cccc] [10cc|cccc]
       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
       */
#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
      const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
      const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
      const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));

      // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
      const __m256i s0 = _mm256_srli_epi16(in_16, 4);
      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
      const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
      const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
      const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask,
                                             simdutf_vec(0b0100000000000000));
      const __m256i s4 = _mm256_xor_si256(s3, m0);
#undef simdutf_vec
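      // Worked example (three-byte case): U+20AC (EURO SIGN) is
      // 0b0010'000010'101100, i.e. aaaa = 0010, bbbbbb = 000010,
      // cccccc = 101100. t2 supplies 0xAC ([1010|1100]) while s4 supplies
      // 0xE2 ([1110|0010]) and 0x82 ([1000|0010]), which interleave to the
      // UTF-8 sequence E2 82 AC.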

      // 4. expand code units 16-bit => 32-bit
      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);

      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
      // Due to the wider registers, the following path is less likely to be
      // useful.
      /*if(mask == 0) {
        // We only have three-byte code units. Use fast path.
        const __m256i shuffle =
            _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
                             2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
        const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
        const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
        utf8_output += 12;
        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
        utf8_output += 12;
        _mm_storeu_si128((__m128i*)utf8_output,
                         _mm256_extractf128_si256(utf8_0,1));
        utf8_output += 12;
        _mm_storeu_si128((__m128i*)utf8_output,
                         _mm256_extractf128_si256(utf8_1,1));
        utf8_output += 12;
        buf += 16;
        continue;
      }*/
      const uint8_t mask0 = uint8_t(mask);
      const uint8_t *row0 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
      const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
      const __m128i utf8_0 =
          _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);

      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
      const uint8_t *row1 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
      const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
      const __m128i utf8_1 =
          _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);

      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
      const uint8_t *row2 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
      const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
      const __m128i utf8_2 =
          _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);

      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
      const uint8_t *row3 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
      const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
      const __m128i utf8_3 =
          _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);

      _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
      utf8_output += row0[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
      utf8_output += row1[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
      utf8_output += row2[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
      utf8_output += row3[0];
      buf += 16;
    } else {
      // case: at least one 32-bit word is larger than 0xFFFF <=> it will
      // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem
      // wasteful to use scalar code, but being efficient with SIMD may require
      // large, non-trivial tables?
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint32_t word = buf[k];
        if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
          *utf8_output++ = char(word);
        } else if ((word & 0xFFFFF800) == 0) { // 2-byte
          *utf8_output++ = char((word >> 6) | 0b11000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else if ((word & 0xFFFF0000) == 0) { // 3-byte
          if (word >= 0xD800 && word <= 0xDFFF) {
            return std::make_pair(nullptr, utf8_output);
          }
          *utf8_output++ = char((word >> 12) | 0b11100000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else { // 4-byte
          if (word > 0x10FFFF) {
            return std::make_pair(nullptr, utf8_output);
          }
          *utf8_output++ = char((word >> 18) | 0b11110000);
          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while

  // check for invalid input
  const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
  if (static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(
          _mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) {
    return std::make_pair(nullptr, utf8_output);
  }

  if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) {
    return std::make_pair(nullptr, utf8_output);
  }

  return std::make_pair(buf, utf8_output);
}

// TODO: currently, this is just the Haswell (AVX2) code; optimize it for the
// Icelake kernel.
std::pair<result, char *>
avx512_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
                                         char *utf8_output) {
  const char32_t *end = buf + len;
  const char32_t *start = buf;

  const __m256i v_0000 = _mm256_setzero_si256();
  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
  const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
  const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
  const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
  const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
  const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);

  const size_t safety_margin =
      12; // to avoid overruns, see issue
          // https://github.com/simdutf/simdutf/issues/92

  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
    __m256i in = _mm256_loadu_si256((__m256i *)buf);
    __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1);
    // Check for too large input
    const __m256i max_input =
        _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
    if (static_cast<uint32_t>(_mm256_movemask_epi8(
            _mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) {
      return std::make_pair(result(error_code::TOO_LARGE, buf - start),
                            utf8_output);
    }

    // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
    // saturation
    __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff),
                                        _mm256_and_si256(nextin, v_7fffffff));
    in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);

    // Try to apply UTF-16 => UTF-8 routine on 256 bits
    // (haswell/avx2_convert_utf16_to_utf8.cpp)

    if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
      // 1. pack the bytes
      const __m128i utf8_packed = _mm_packus_epi16(
          _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
      // 2. store (16 bytes)
      _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
      // 3. adjust pointers
      buf += 16;
      utf8_output += 16;
      continue; // we are done for this round!
    }
    // no bits set above 7th bit
    const __m256i one_byte_bytemask =
        _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
    const uint32_t one_byte_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));

    // no bits set above 11th bit
    const __m256i one_or_two_bytes_bytemask =
        _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
    const uint32_t one_or_two_bytes_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
    if (one_or_two_bytes_bitmask == 0xffffffff) {
      // 1. prepare 2-byte values
      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
      // expected output   : [110a|aaaa|10bb|bbbb] x 8
      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);

      // t0 = [000a|aaaa|bbbb|bb00]
      const __m256i t0 = _mm256_slli_epi16(in_16, 2);
      // t1 = [000a|aaaa|0000|0000]
      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
      // t2 = [0000|0000|00bb|bbbb]
      const __m256i t2 = _mm256_and_si256(in_16, v_003f);
      // t3 = [000a|aaaa|00bb|bbbb]
      const __m256i t3 = _mm256_or_si256(t1, t2);
      // t4 = [110a|aaaa|10bb|bbbb]
      const __m256i t4 = _mm256_or_si256(t3, v_c080);

      // 2. merge ASCII and 2-byte codewords
      const __m256i utf8_unpacked =
          _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);

      // 3. prepare bitmask for 8-bit lookup
      const uint32_t M0 = one_byte_bitmask & 0x55555555;
      const uint32_t M1 = M0 >> 7;
      const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
      // 4. pack the bytes

      const uint8_t *row =
          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
      const uint8_t *row_2 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
                                                                       16)][0];

      const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
      const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));

      const __m256i utf8_packed = _mm256_shuffle_epi8(
          utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
      // 5. store bytes
      _mm_storeu_si128((__m128i *)utf8_output,
                       _mm256_castsi256_si128(utf8_packed));
      utf8_output += row[0];
      _mm_storeu_si128((__m128i *)utf8_output,
                       _mm256_extractf128_si256(utf8_packed, 1));
      utf8_output += row_2[0];

      // 6. adjust pointers
      buf += 16;
      continue;
    }
    // Must check for overflow in packing
    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(
        _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
    const uint32_t saturation_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
    if (saturation_bitmask == 0xffffffff) {
      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes

      // Check for illegal surrogate code units
      const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
      const __m256i forbidden_bytemask =
          _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
      if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) !=
          0x0) {
        return std::make_pair(result(error_code::SURROGATE, buf - start),
                              utf8_output);
      }
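      // Note: code points U+D800..U+DFFF are reserved for UTF-16 surrogates
      // and are not valid Unicode scalar values, so well-formed UTF-32 input
      // must never contain them.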

      const __m256i dup_even = _mm256_setr_epi16(
          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);

      /* In this branch we handle three cases:
         1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
            single UTF-8 byte
         2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two
            UTF-8 bytes
         3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
            three UTF-8 bytes

        We expand the input word (16-bit) into two code units (32-bit), thus
        we have room for four bytes. However, we need five distinct bit
        layouts. Note that the last byte in cases #2 and #3 is the same.

        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
        in register t2.

        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
        either byte 1 for case #2 or byte 2 for case #3. Note that they
        differ by exactly one bit.

        Finally, from these two code units we build a proper UTF-8 sequence,
        taking into account the case (i.e., the number of bytes to write).
      */
      /**
       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
       * t2 => [0ccc|cccc] [10cc|cccc]
       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
       */
#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
      const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
      const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
      const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));

      // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
      const __m256i s0 = _mm256_srli_epi16(in_16, 4);
      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
      const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
      const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
      const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask,
                                             simdutf_vec(0b0100000000000000));
      const __m256i s4 = _mm256_xor_si256(s3, m0);
#undef simdutf_vec

      // 4. expand code units 16-bit => 32-bit
      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);

      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
      // Due to the wider registers, the following path is less likely to be
      // useful.
      /*if(mask == 0) {
        // We only have three-byte code units. Use fast path.
        const __m256i shuffle =
            _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
                             2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
        const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
        const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
        utf8_output += 12;
        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
        utf8_output += 12;
        _mm_storeu_si128((__m128i*)utf8_output,
                         _mm256_extractf128_si256(utf8_0,1));
        utf8_output += 12;
        _mm_storeu_si128((__m128i*)utf8_output,
                         _mm256_extractf128_si256(utf8_1,1));
        utf8_output += 12;
        buf += 16;
        continue;
      }*/
      const uint8_t mask0 = uint8_t(mask);
      const uint8_t *row0 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
      const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
      const __m128i utf8_0 =
          _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);

      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
      const uint8_t *row1 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
      const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
      const __m128i utf8_1 =
          _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);

      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
      const uint8_t *row2 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
      const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
      const __m128i utf8_2 =
          _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);

      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
      const uint8_t *row3 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
      const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
      const __m128i utf8_3 =
          _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);

      _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
      utf8_output += row0[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
      utf8_output += row1[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
      utf8_output += row2[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
      utf8_output += row3[0];
      buf += 16;
    } else {
      // case: at least one 32-bit word is larger than 0xFFFF <=> it will
      // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem
      // wasteful to use scalar code, but being efficient with SIMD may require
      // large, non-trivial tables?
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint32_t word = buf[k];
        if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
          *utf8_output++ = char(word);
        } else if ((word & 0xFFFFF800) == 0) { // 2-byte
          *utf8_output++ = char((word >> 6) | 0b11000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else if ((word & 0xFFFF0000) == 0) { // 3-byte
          if (word >= 0xD800 && word <= 0xDFFF) {
            return std::make_pair(
                result(error_code::SURROGATE, buf - start + k), utf8_output);
          }
          *utf8_output++ = char((word >> 12) | 0b11100000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else { // 4-byte
          if (word > 0x10FFFF) {
            return std::make_pair(
                result(error_code::TOO_LARGE, buf - start + k), utf8_output);
          }
          *utf8_output++ = char((word >> 18) | 0b11110000);
          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while

  return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
}
/* end file src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
/* begin file src/icelake/icelake_convert_utf32_to_utf16.inl.cpp */
// file included directly

template <endianness big_endian>
std::pair<const char32_t *, char16_t *>
avx512_convert_utf32_to_utf16(const char32_t *buf, size_t len,
                              char16_t *utf16_output) {
  const char32_t *end = buf + len;
  __mmask32 forbidden_bytemask = 0;
  const __m512i v_00000000 = _mm512_setzero_si512();
  const __m512i v_ffff0000 = _mm512_set1_epi32((int32_t)0xffff0000);
  const __m512i v_f800 = _mm512_set1_epi32((uint32_t)0xf800);
  const __m512i v_d800 = _mm512_set1_epi32((uint32_t)0xd800);
  const __m512i v_10ffff = _mm512_set1_epi32(0x10FFFF);
  const __m512i v_10000 = _mm512_set1_epi32(0x10000);
  const __m512i v_3ff0000 = _mm512_set1_epi32(0x3FF0000);
  const __m512i v_3ff = _mm512_set1_epi32(0x3FF);
  const __m512i v_dc00d800 = _mm512_set1_epi32((int32_t)0xDC00D800);

  while (end - buf >= std::ptrdiff_t(16)) {
    __m512i in = _mm512_loadu_si512(buf);

    // no bits set above 16th bit <=> can pack to UTF-16 without surrogate
    // pairs
    const __mmask16 saturation_bitmask =
        _mm512_cmpeq_epi32_mask(_mm512_and_si512(in, v_ffff0000), v_00000000);

    if (saturation_bitmask == 0xffff) {
      forbidden_bytemask |=
          _mm512_cmpeq_epi32_mask(_mm512_and_si512(in, v_f800), v_d800);

      __m256i utf16_packed = _mm512_cvtepi32_epi16(in);
      if (big_endian) {
        const __m256i swap = _mm256_setr_epi8(
            1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5,
            4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
        utf16_packed = _mm256_shuffle_epi8(utf16_packed, swap);
      }
      _mm256_storeu_si256((__m256i *)utf16_output, utf16_packed);
      utf16_output += 16;
      buf += 16;
    } else {
      // Words with their saturation_bitmask bit set generate one UTF-16 code
      // unit; words with it clear generate two (assuming no errors). Thus we
      // need an output_mask with the structure b_2i = 1,
      // b_2i+1 = !saturation_bitmask_i.
      const __mmask32 output_mask = ~_pdep_u32(saturation_bitmask, 0xAAAAAAAA);
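      // Note: _pdep_u32 deposits saturation bit i at odd bit position 2i+1,
      // so the complement sets bit 2i for every word and bit 2i+1 only for
      // words needing a surrogate pair: each basic-plane word keeps one
      // output slot and each supplementary word keeps two.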
      const __mmask16 surrogate_bitmask = __mmask16(~saturation_bitmask);
      __mmask32 error = _mm512_mask_cmpeq_epi32_mask(
          saturation_bitmask, _mm512_and_si512(in, v_f800), v_d800);
      error |= _mm512_mask_cmpgt_epu32_mask(surrogate_bitmask, in, v_10ffff);
      if (simdutf_unlikely(error)) {
        return std::make_pair(nullptr, utf16_output);
      }
      __m512i v1, v2, v;
      // for the bits where saturation_bitmask == 0, we need to unpack the
      // 32-bit word into two 16-bit words corresponding to high_surrogate and
      // low_surrogate. Once the bits are unpacked and merged, the output will
      // be compressed as per output_mask.
      in = _mm512_mask_sub_epi32(in, surrogate_bitmask, in, v_10000);
      v1 = _mm512_mask_slli_epi32(in, surrogate_bitmask, in, 16);
      v1 = _mm512_mask_and_epi32(in, surrogate_bitmask, v1, v_3ff0000);
      v2 = _mm512_mask_srli_epi32(in, surrogate_bitmask, in, 10);
      v2 = _mm512_mask_and_epi32(in, surrogate_bitmask, v2, v_3ff);
      v = _mm512_or_si512(v1, v2);
      in = _mm512_mask_add_epi32(in, surrogate_bitmask, v, v_dc00d800);
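      // Worked example: U+1F600 becomes 0xF600 after subtracting 0x10000; its
      // high 10 bits (0x3D) plus 0xD800 give the high surrogate 0xD83D in the
      // low half of the lane, and its low 10 bits (0x200) plus 0xDC00 give
      // the low surrogate 0xDE00 in the high half, so the little-endian lane
      // stores 0xD83D, 0xDE00 in that order.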
      if (big_endian) {
        const __m512i swap_512 = _mm512_set_epi8(
            14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12,
            13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8,
            9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5,
            2, 3, 0, 1);
        in = _mm512_shuffle_epi8(in, swap_512);
      }
      // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability
      // (AMD Zen4 has terrible performance with it, it is effectively broken)
      __m512i compressed = _mm512_maskz_compress_epi16(output_mask, in);
      auto written_out = _mm_popcnt_u32(output_mask);
      _mm512_mask_storeu_epi16(utf16_output, _bzhi_u32(0xFFFFFFFF, written_out),
                               compressed);
      //_mm512_mask_compressstoreu_epi16(utf16_output, output_mask, in);
      utf16_output += written_out;
      buf += 16;
    }
  }

  size_t remaining_len = size_t(end - buf);
  if (remaining_len) {
    __mmask16 input_mask = __mmask16((1 << remaining_len) - 1);
    __m512i in = _mm512_maskz_loadu_epi32(input_mask, buf);
    const __mmask16 saturation_bitmask =
        _mm512_cmpeq_epi32_mask(_mm512_and_si512(in, v_ffff0000), v_00000000) &
        input_mask;
    if (saturation_bitmask == input_mask) {
      forbidden_bytemask |=
          _mm512_cmpeq_epi32_mask(_mm512_and_si512(in, v_f800), v_d800);

      __m256i utf16_packed = _mm512_cvtepi32_epi16(in);
      if (big_endian) {
        const __m256i swap = _mm256_setr_epi8(
            1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5,
            4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
        utf16_packed = _mm256_shuffle_epi8(utf16_packed, swap);
      }
      _mm256_mask_storeu_epi16(utf16_output, input_mask, utf16_packed);
      utf16_output += remaining_len;
      buf += remaining_len;
    } else {
      const __mmask32 output_max_mask = (1 << (remaining_len * 2)) - 1;
      const __mmask32 output_mask =
          (~_pdep_u32(saturation_bitmask, 0xAAAAAAAA)) & output_max_mask;
      const __mmask16 surrogate_bitmask =
          __mmask16(~saturation_bitmask) & input_mask;
      __mmask32 error = _mm512_mask_cmpeq_epi32_mask(
          saturation_bitmask, _mm512_and_si512(in, v_f800), v_d800);
      error |= _mm512_mask_cmpgt_epu32_mask(surrogate_bitmask, in, v_10ffff);
      if (simdutf_unlikely(error)) {
        return std::make_pair(nullptr, utf16_output);
      }
      __m512i v1, v2, v;
      in = _mm512_mask_sub_epi32(in, surrogate_bitmask, in, v_10000);
      v1 = _mm512_mask_slli_epi32(in, surrogate_bitmask, in, 16);
      v1 = _mm512_mask_and_epi32(in, surrogate_bitmask, v1, v_3ff0000);
      v2 = _mm512_mask_srli_epi32(in, surrogate_bitmask, in, 10);
      v2 = _mm512_mask_and_epi32(in, surrogate_bitmask, v2, v_3ff);
      v = _mm512_or_si512(v1, v2);
      in = _mm512_mask_add_epi32(in, surrogate_bitmask, v, v_dc00d800);
      if (big_endian) {
        const __m512i swap_512 = _mm512_set_epi8(
            14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12,
            13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8,
            9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5,
            2, 3, 0, 1);
        in = _mm512_shuffle_epi8(in, swap_512);
      }
      // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability
      // (AMD Zen4 has terrible performance with it, it is effectively broken)
      __m512i compressed = _mm512_maskz_compress_epi16(output_mask, in);
      auto written_out = _mm_popcnt_u32(output_mask);
      _mm512_mask_storeu_epi16(utf16_output, _bzhi_u32(0xFFFFFFFF, written_out),
                               compressed);
      //_mm512_mask_compressstoreu_epi16(utf16_output, output_mask, in);
      utf16_output += written_out;
      buf += remaining_len;
    }
  }

  // check for invalid input
  if (forbidden_bytemask != 0) {
    return std::make_pair(nullptr, utf16_output);
  }

  return std::make_pair(buf, utf16_output);
}

template <endianness big_endian>
std::pair<result, char16_t *>
avx512_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len,
                                          char16_t *utf16_output) {
  const char32_t *start = buf;
  const char32_t *end = buf + len;
  const __m512i v_00000000 = _mm512_setzero_si512();
  const __m512i v_ffff0000 = _mm512_set1_epi32((int32_t)0xffff0000);
  const __m512i v_f800 = _mm512_set1_epi32((uint32_t)0xf800);
  const __m512i v_d800 = _mm512_set1_epi32((uint32_t)0xd800);
  const __m512i v_10ffff = _mm512_set1_epi32(0x10FFFF);
  const __m512i v_10000 = _mm512_set1_epi32(0x10000);
  const __m512i v_3ff0000 = _mm512_set1_epi32(0x3FF0000);
  const __m512i v_3ff = _mm512_set1_epi32(0x3FF);
  const __m512i v_dc00d800 = _mm512_set1_epi32((int32_t)0xDC00D800);
  int error_idx = 0;
  error_code code = error_code::SUCCESS;
  bool err = false;

  while (end - buf >= std::ptrdiff_t(16)) {
    __m512i in = _mm512_loadu_si512(buf);

    // no bits set above 16th bit <=> can pack to UTF-16 without surrogate
    // pairs
    const __mmask16 saturation_bitmask =
        _mm512_cmpeq_epi32_mask(_mm512_and_si512(in, v_ffff0000), v_00000000);

    if (saturation_bitmask == 0xffff) {
      __mmask32 forbidden_bytemask =
          _mm512_cmpeq_epi32_mask(_mm512_and_si512(in, v_f800), v_d800);

      __m256i utf16_packed = _mm512_cvtepi32_epi16(in);
      if (big_endian) {
        const __m256i swap = _mm256_setr_epi8(
            1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5,
            4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
        utf16_packed = _mm256_shuffle_epi8(utf16_packed, swap);
      }
      if (simdutf_unlikely(forbidden_bytemask)) {
        int idx = _tzcnt_u32(forbidden_bytemask);
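        // Note: _blsmsk_u32 yields a mask covering bits 0 through the lowest
        // set bit inclusive; shifting it right by one drops the offending
        // code unit, so only the valid prefix is stored before reporting the
        // error position.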
        _mm256_mask_storeu_epi16(
            utf16_output, __mmask16(_blsmsk_u32(forbidden_bytemask) >> 1),
            utf16_packed);
        return std::make_pair(result(error_code::SURROGATE, buf - start + idx),
                              utf16_output + idx);
      }
      _mm256_storeu_si256((__m256i *)utf16_output, utf16_packed);
      utf16_output += 16;
    } else {
      __mmask32 output_mask = ~_pdep_u32(saturation_bitmask, 0xAAAAAAAA);
      const __mmask16 surrogate_bitmask = __mmask16(~saturation_bitmask);
      __mmask32 error_surrogate = _mm512_mask_cmpeq_epi32_mask(
          saturation_bitmask, _mm512_and_si512(in, v_f800), v_d800);
      __mmask32 error_too_large =
          _mm512_mask_cmpgt_epu32_mask(surrogate_bitmask, in, v_10ffff);
      if (simdutf_unlikely(error_surrogate || error_too_large)) {
        // We need to find the lowest set bit between the two error masks and
        // write the partial chunk up to the error index to the output.
        int large_idx = _tzcnt_u32(error_too_large);
        int surrogate_idx = _tzcnt_u32(error_surrogate);
        err = true;
        if (large_idx < surrogate_idx) {
          code = error_code::TOO_LARGE;
          error_idx = large_idx;
        } else {
          code = error_code::SURROGATE;
          error_idx = surrogate_idx;
        }
        output_mask &= ((1 << (2 * error_idx)) - 1);
      }
      __m512i v1, v2, v;
      in = _mm512_mask_sub_epi32(in, surrogate_bitmask, in, v_10000);
      v1 = _mm512_mask_slli_epi32(in, surrogate_bitmask, in, 16);
      v1 = _mm512_mask_and_epi32(in, surrogate_bitmask, v1, v_3ff0000);
      v2 = _mm512_mask_srli_epi32(in, surrogate_bitmask, in, 10);
      v2 = _mm512_mask_and_epi32(in, surrogate_bitmask, v2, v_3ff);
      v = _mm512_or_si512(v1, v2);
      in = _mm512_mask_add_epi32(in, surrogate_bitmask, v, v_dc00d800);
      if (big_endian) {
        const __m512i swap_512 = _mm512_set_epi8(
            14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12,
            13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8,
            9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5,
            2, 3, 0, 1);
        in = _mm512_shuffle_epi8(in, swap_512);
      }
      // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability
      // (AMD Zen4 has terrible performance with it, it is effectively broken)
      __m512i compressed = _mm512_maskz_compress_epi16(output_mask, in);
      auto written_out = _mm_popcnt_u32(output_mask);
      _mm512_mask_storeu_epi16(utf16_output, _bzhi_u32(0xFFFFFFFF, written_out),
                               compressed);
      //_mm512_mask_compressstoreu_epi16(utf16_output, output_mask, in);
      utf16_output += written_out;
      if (simdutf_unlikely(err)) {
        return std::make_pair(result(code, buf - start + error_idx),
                              utf16_output);
      }
    }
    buf += 16;
  }

  size_t remaining_len = size_t(end - buf);
  if (remaining_len) {
    __mmask16 input_mask = __mmask16((1 << remaining_len) - 1);
    __m512i in = _mm512_maskz_loadu_epi32(input_mask, buf);
    const __mmask16 saturation_bitmask =
        _mm512_cmpeq_epi32_mask(_mm512_and_si512(in, v_ffff0000), v_00000000) &
        input_mask;
    if (saturation_bitmask == input_mask) {
      __mmask32 forbidden_bytemask =
          _mm512_cmpeq_epi32_mask(_mm512_and_si512(in, v_f800), v_d800);
      __m256i utf16_packed = _mm512_cvtepi32_epi16(in);
      if (big_endian) {
        const __m256i swap = _mm256_setr_epi8(
            1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5,
            4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
        utf16_packed = _mm256_shuffle_epi8(utf16_packed, swap);
      }
      if (simdutf_unlikely(forbidden_bytemask)) {
        int idx = _tzcnt_u32(forbidden_bytemask);
        _mm256_mask_storeu_epi16(
            utf16_output, __mmask16(_blsmsk_u32(forbidden_bytemask) >> 1),
            utf16_packed);
        return std::make_pair(result(error_code::SURROGATE, buf - start + idx),
                              utf16_output + idx);
      }
      _mm256_mask_storeu_epi16(utf16_output, input_mask, utf16_packed);
      utf16_output += remaining_len;
    } else {
      const __mmask32 output_max_mask = (1 << (remaining_len * 2)) - 1;
      __mmask32 output_mask =
          (~_pdep_u32(saturation_bitmask, 0xAAAAAAAA)) & output_max_mask;
      const __mmask16 surrogate_bitmask =
          __mmask16(~saturation_bitmask) & input_mask;
      __mmask32 error_surrogate = _mm512_mask_cmpeq_epi32_mask(
          saturation_bitmask, _mm512_and_si512(in, v_f800), v_d800);
      __mmask32 error_too_large =
          _mm512_mask_cmpgt_epu32_mask(surrogate_bitmask, in, v_10ffff);
      if (simdutf_unlikely(error_surrogate || error_too_large)) {
        int large_idx = _tzcnt_u32(error_too_large);
        int surrogate_idx = _tzcnt_u32(error_surrogate);
        err = true;
        if (large_idx < surrogate_idx) {
          code = error_code::TOO_LARGE;
          error_idx = large_idx;
        } else {
          code = error_code::SURROGATE;
          error_idx = surrogate_idx;
        }
        output_mask &= ((1 << (2 * error_idx)) - 1);
      }
      __m512i v1, v2, v;
      in = _mm512_mask_sub_epi32(in, surrogate_bitmask, in, v_10000);
      v1 = _mm512_mask_slli_epi32(in, surrogate_bitmask, in, 16);
      v1 = _mm512_mask_and_epi32(in, surrogate_bitmask, v1, v_3ff0000);
      v2 = _mm512_mask_srli_epi32(in, surrogate_bitmask, in, 10);
      v2 = _mm512_mask_and_epi32(in, surrogate_bitmask, v2, v_3ff);
      v = _mm512_or_si512(v1, v2);
      in = _mm512_mask_add_epi32(in, surrogate_bitmask, v, v_dc00d800);
      if (big_endian) {
        const __m512i swap_512 = _mm512_set_epi8(
            14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12,
            13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8,
            9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5,
            2, 3, 0, 1);
        in = _mm512_shuffle_epi8(in, swap_512);
      }
      // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability
      // (AMD Zen4 has terrible performance with it, it is effectively broken)
      __m512i compressed = _mm512_maskz_compress_epi16(output_mask, in);
      auto written_out = _mm_popcnt_u32(output_mask);
      _mm512_mask_storeu_epi16(utf16_output, _bzhi_u32(0xFFFFFFFF, written_out),
                               compressed);
      //_mm512_mask_compressstoreu_epi16(utf16_output, output_mask, in);
      utf16_output += written_out;
      if (simdutf_unlikely(err)) {
        return std::make_pair(result(code, buf - start + error_idx),
                              utf16_output);
      }
    }
    buf += remaining_len;
  }

  return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
}
/* end file src/icelake/icelake_convert_utf32_to_utf16.inl.cpp */
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_ASCII
/* begin file src/icelake/icelake_ascii_validation.inl.cpp */
// file included directly

bool validate_ascii(const char *buf, size_t len) {
  const char *end = buf + len;
  const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80);
  __m512i running_or = _mm512_setzero_si512();
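  // Note: the ternary-logic immediate 0xf8 encodes the boolean function
  // a | (b & c) over (running_or, utf8, 0x80): any byte with its high bit set
  // leaves a sticky bit in running_or, so a single test at the end tells us
  // whether the whole input was ASCII.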
  for (; end - buf >= 64; buf += 64) {
    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)buf);
    running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii,
                                           0xf8); // running_or | (utf8 & ascii)
  }
  if (buf < end) {
    const __m512i utf8 = _mm512_maskz_loadu_epi8(
        (uint64_t(1) << (end - buf)) - 1, (const __m512i *)buf);
    running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii,
                                           0xf8); // running_or | (utf8 & ascii)
  }
  return (_mm512_test_epi8_mask(running_or, running_or) == 0);
}
/* end file src/icelake/icelake_ascii_validation.inl.cpp */
#endif // SIMDUTF_FEATURE_ASCII
#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
/* begin file src/icelake/icelake_utf32_validation.inl.cpp */
// file included directly

bool validate_utf32(const char32_t *buf, size_t len) {
  if (simdutf_unlikely(len == 0)) {
    return true;
  }
  const char32_t *end = buf + len;

  const __m512i offset = _mm512_set1_epi32((uint32_t)0xffff2000);
  __m512i currentmax = _mm512_setzero_si512();
  __m512i currentoffsetmax = _mm512_setzero_si512();
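  // Note: adding the offset 0xffff2000 (mod 2^32) sends the surrogate range
  // 0xD800..0xDFFF to 0xFFFFF800..0xFFFFFFFF, while every other valid code
  // point either lands at or below 0xFFFFF7FF or wraps around to a small
  // value, so one unsigned-max reduction per block suffices to detect
  // surrogates.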

  while (buf < end - 16) {
    __m512i utf32 = _mm512_loadu_si512((const __m512i *)buf);
    buf += 16;
    currentoffsetmax =
        _mm512_max_epu32(_mm512_add_epi32(utf32, offset), currentoffsetmax);
    currentmax = _mm512_max_epu32(utf32, currentmax);
  }

  __m512i utf32 =
      _mm512_maskz_loadu_epi32(__mmask16((1 << (end - buf)) - 1), buf);
  currentoffsetmax =
      _mm512_max_epu32(_mm512_add_epi32(utf32, offset), currentoffsetmax);
  currentmax = _mm512_max_epu32(utf32, currentmax);

  const __m512i standardmax = _mm512_set1_epi32((uint32_t)0x10ffff);
  const __m512i standardoffsetmax = _mm512_set1_epi32((uint32_t)0xfffff7ff);
  const auto outside_range = _mm512_cmpgt_epu32_mask(currentmax, standardmax);
  if (outside_range != 0) {
    return false;
  }

  const auto surrogate =
      _mm512_cmpgt_epu32_mask(currentoffsetmax, standardoffsetmax);
  if (surrogate != 0) {
    return false;
  }

  return true;
}
/* end file src/icelake/icelake_utf32_validation.inl.cpp */
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
#if SIMDUTF_FEATURE_UTF8
/* begin file src/icelake/icelake_convert_latin1_to_utf8.inl.cpp */
// file included directly

static inline size_t latin1_to_utf8_avx512_vec(__m512i input, size_t input_len,
                                               char *utf8_output,
                                               int mask_output) {
  __mmask64 nonascii = _mm512_movepi8_mask(input);
  size_t output_size = input_len + (size_t)count_ones(nonascii);

  // Mask to denote whether the byte is a leading byte that is not ASCII
  __mmask64 sixth = _mm512_cmpge_epu8_mask(
      input, _mm512_set1_epi8(-64)); // binary representation of -64: 1100 0000

  const uint64_t alternate_bits = UINT64_C(0x5555555555555555);
  uint64_t ascii = ~nonascii;
  // the bits in ascii are inverted and zeros are interspersed in between them
  uint64_t maskA = ~_pdep_u64(ascii, alternate_bits);
  uint64_t maskB = ~_pdep_u64(ascii >> 32, alternate_bits);

  // interleave bytes from top and bottom halves (abcd...ABCD -> aAbBcCdD)
  __m512i input_interleaved = _mm512_permutexvar_epi8(
      _mm512_set_epi32(0x3f1f3e1e, 0x3d1d3c1c, 0x3b1b3a1a, 0x39193818,
                       0x37173616, 0x35153414, 0x33133212, 0x31113010,
                       0x2f0f2e0e, 0x2d0d2c0c, 0x2b0b2a0a, 0x29092808,
                       0x27072606, 0x25052404, 0x23032202, 0x21012000),
      input);

  // double size of each byte, and insert the leading byte 1100 0010

  /*
  upscale the bytes to 16-bit values, adding the 0b11000000 leading byte in
  the process. We adjust for the bytes that have their two most significant
  bits set. This takes care of the first 32 bytes, assuming we interleaved
  the bytes. */
  __m512i outputA =
      _mm512_shldi_epi16(input_interleaved, _mm512_set1_epi8(-62), 8);
  outputA = _mm512_mask_add_epi16(
      outputA, (__mmask32)sixth, outputA,
      _mm512_set1_epi16(1 - 0x4000)); // 1 - 0x4000 = 0xC001 = 1100 0000 0000 0001

  // in the second 32-byte half, set the first or second option based on
  // whether the original input is a leading byte (second case) or not (first
  // case)
  __m512i leadingB =
      _mm512_mask_blend_epi16((__mmask32)(sixth >> 32),
                              _mm512_set1_epi16(0x00c2),  // 0000 0000 1100 0010
                              _mm512_set1_epi16(0x40c3)); // 0100 0000 1100 0011
  __m512i outputB = _mm512_ternarylogic_epi32(
      input_interleaved, leadingB, _mm512_set1_epi16((short)0xff00),
      (240 & 170) ^ 204); // (input_interleaved & 0xff00) ^ leadingB
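  // Note: the immediate (240 & 170) ^ 204 is the 8-bit truth table of
  // f(a, b, c) = (a & c) ^ b evaluated on the canonical patterns a = 0xF0,
  // b = 0xCC, c = 0xAA, i.e. it keeps the data bits selected by 0xff00 and
  // XORs in the precomputed leading-byte pattern in a single instruction.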

  // prune redundant bytes
  outputA = _mm512_maskz_compress_epi8(maskA, outputA);
  outputB = _mm512_maskz_compress_epi8(maskB, outputB);

  size_t output_sizeA = (size_t)count_ones((uint32_t)nonascii) + 32;

  if (mask_output) {
    if (input_len > 32) { // is the second half of the input vector used?
      __mmask64 write_mask = _bzhi_u64(~0ULL, (unsigned int)output_sizeA);
      _mm512_mask_storeu_epi8(utf8_output, write_mask, outputA);
      utf8_output += output_sizeA;
      write_mask = _bzhi_u64(~0ULL, (unsigned int)(output_size - output_sizeA));
      _mm512_mask_storeu_epi8(utf8_output, write_mask, outputB);
    } else {
      __mmask64 write_mask = _bzhi_u64(~0ULL, (unsigned int)output_size);
      _mm512_mask_storeu_epi8(utf8_output, write_mask, outputA);
    }
  } else {
    _mm512_storeu_si512(utf8_output, outputA);
    utf8_output += output_sizeA;
    _mm512_storeu_si512(utf8_output, outputB);
  }
  return output_size;
}

static inline size_t latin1_to_utf8_avx512_branch(__m512i input,
                                                  char *utf8_output) {
  __mmask64 nonascii = _mm512_movepi8_mask(input);
  if (nonascii) {
    return latin1_to_utf8_avx512_vec(input, 64, utf8_output, 0);
  } else {
    _mm512_storeu_si512(utf8_output, input);
    return 64;
  }
}

size_t latin1_to_utf8_avx512_start(const char *buf, size_t len,
                                   char *utf8_output) {
  char *start = utf8_output;
  size_t pos = 0;
  // if there are at least 128 bytes remaining, we don't need to mask the
  // output
  for (; pos + 128 <= len; pos += 64) {
    __m512i input = _mm512_loadu_si512((__m512i *)(buf + pos));
    utf8_output += latin1_to_utf8_avx512_branch(input, utf8_output);
  }
  // in the last 128 bytes, the first 64 may require masking the output
  if (pos + 64 <= len) {
    __m512i input = _mm512_loadu_si512((__m512i *)(buf + pos));
    utf8_output += latin1_to_utf8_avx512_vec(input, 64, utf8_output, 1);
    pos += 64;
  }
  // with the last 64 bytes, the input also needs to be masked
  if (pos < len) {
    __mmask64 load_mask = _bzhi_u64(~0ULL, (unsigned int)(len - pos));
    __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)(buf + pos));
    utf8_output += latin1_to_utf8_avx512_vec(input, len - pos, utf8_output, 1);
  }
  return (size_t)(utf8_output - start);
}
/* end file src/icelake/icelake_convert_latin1_to_utf8.inl.cpp */
#endif // SIMDUTF_FEATURE_UTF8
#if SIMDUTF_FEATURE_UTF16
/* begin file src/icelake/icelake_convert_latin1_to_utf16.inl.cpp */
// file included directly
template <endianness big_endian>
size_t icelake_convert_latin1_to_utf16(const char *latin1_input, size_t len,
                                       char16_t *utf16_output) {
  size_t rounded_len = len & ~0x1F; // Round down to the nearest multiple of 32

  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
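  // Note: byteflip swaps the two bytes of every 16-bit code unit (shuffle
  // pattern 1, 0, 3, 2, ... within each 128-bit lane), converting the
  // little-endian result of the widening to UTF-16BE when requested.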
  for (size_t i = 0; i < rounded_len; i += 32) {
    // Load 32 Latin1 characters into a 256-bit register
    __m256i in = _mm256_loadu_si256((__m256i *)&latin1_input[i]);
    // Zero extend each set of 8 Latin1 characters to 32 16-bit integers
    __m512i out = _mm512_cvtepu8_epi16(in);
    if (big_endian) {
      out = _mm512_shuffle_epi8(out, byteflip);
    }
    // Store the results back to memory
    _mm512_storeu_si512((__m512i *)&utf16_output[i], out);
  }
  if (rounded_len != len) {
    uint32_t mask = uint32_t(1 << (len - rounded_len)) - 1;
    __m256i in = _mm256_maskz_loadu_epi8(mask, latin1_input + rounded_len);

    // Zero extend each set of 8 Latin1 characters to 32 16-bit integers
    __m512i out = _mm512_cvtepu8_epi16(in);
    if (big_endian) {
      out = _mm512_shuffle_epi8(out, byteflip);
    }
    // Store the results back to memory
    _mm512_mask_storeu_epi16(utf16_output + rounded_len, mask, out);
  }

  return len;
}
/* end file src/icelake/icelake_convert_latin1_to_utf16.inl.cpp */
#endif // SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF32
/* begin file src/icelake/icelake_convert_latin1_to_utf32.inl.cpp */
void avx512_convert_latin1_to_utf32(const char *buf, size_t len,
                                    char32_t *utf32_output) {
  while (len >= 16) {
    // Load 16 Latin1 characters into a 128-bit register
    __m128i in = _mm_loadu_si128((__m128i *)buf);

    // Zero extend each set of 8 Latin1 characters to 16 32-bit integers using
    // vpmovzxbd
    __m512i out = _mm512_cvtepu8_epi32(in);

    // Store the results back to memory
    _mm512_storeu_si512((__m512i *)utf32_output, out);

    len -= 16;
    buf += 16;
    utf32_output += 16;
  }

  __mmask16 mask = __mmask16((1 << len) - 1);
  __m128i in = _mm_maskz_loadu_epi8(mask, buf);
  __m512i out = _mm512_cvtepu8_epi32(in);
  _mm512_mask_storeu_epi32((__m512i *)utf32_output, mask, out);
}
/* end file src/icelake/icelake_convert_latin1_to_utf32.inl.cpp */
#endif // SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_BASE64
/* begin file src/icelake/icelake_base64.inl.cpp */
// file included directly
/**
 * References and further reading:
 *
 * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
 * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
 * https://arxiv.org/abs/1910.05109
 *
 * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
 * Instructions, ACM Transactions on the Web 12 (3), 2018.
 * https://arxiv.org/abs/1704.00605
 *
 * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
 * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
 * Request for Comments: 4648.
 *
 * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
 * http://www.alfredklomp.com/programming/sse-base64/. (2014).
 *
 * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
 * acceleration. https://github.com/aklomp/base64. (2014).
 *
 * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
 * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
 *
 * Nick Kopp. 2013. Base64 Encoding on a GPU.
 * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
 */

struct block64 {
  __m512i chunks[1];
};

template <bool base64_url>
size_t encode_base64(char *dst, const char *src, size_t srclen,
                     base64_options options) {
  // credit: Wojciech Muła
  const uint8_t *input = (const uint8_t *)src;

  uint8_t *out = (uint8_t *)dst;
  static const char *lookup_tbl =
      base64_url
          ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
          : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

  const __m512i shuffle_input = _mm512_setr_epi32(
      0x01020001, 0x04050304, 0x07080607, 0x0a0b090a, 0x0d0e0c0d, 0x10110f10,
      0x13141213, 0x16171516, 0x191a1819, 0x1c1d1b1c, 0x1f201e1f, 0x22232122,
      0x25262425, 0x28292728, 0x2b2c2a2b, 0x2e2f2d2e);
  const __m512i lookup =
      _mm512_loadu_si512(reinterpret_cast<const __m512i *>(lookup_tbl));
  const __m512i multi_shifts = _mm512_set1_epi64(UINT64_C(0x3036242a1016040a));
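  // Note: shuffle_input spreads each 3-byte group over 4 bytes, and the
  // multishift then extracts, for each output byte, an 8-bit field starting
  // at bit offsets 10, 4, 22, 16, 42, 36, 54, 48 of its 64-bit lane (encoded
  // in 0x3036242a1016040a); the 64-byte table lookup below only consumes the
  // low 6 bits of each extracted index.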
  size_t size = srclen;
  __mmask64 input_mask = 0xffffffffffff; // (1 << 48) - 1
  while (size >= 48) {
    const __m512i v = _mm512_maskz_loadu_epi8(
        input_mask, reinterpret_cast<const __m512i *>(input));
    const __m512i in = _mm512_permutexvar_epi8(shuffle_input, v);
    const __m512i indices = _mm512_multishift_epi64_epi8(multi_shifts, in);
    const __m512i result = _mm512_permutexvar_epi8(indices, lookup);
    _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), result);
    out += 64;
    input += 48;
    size -= 48;
  }
  input_mask = ((__mmask64)1 << size) - 1;
  const __m512i v = _mm512_maskz_loadu_epi8(
      input_mask, reinterpret_cast<const __m512i *>(input));
  const __m512i in = _mm512_permutexvar_epi8(shuffle_input, v);
  const __m512i indices = _mm512_multishift_epi64_epi8(multi_shifts, in);
  bool padding_needed =
      (((options & base64_url) == 0) ^
       ((options & base64_reverse_padding) == base64_reverse_padding));
  size_t padding_amount = ((size % 3) > 0) ? (3 - (size % 3)) : 0;
  size_t output_len = ((size + 2) / 3) * 4;
  size_t non_padded_output_len = output_len - padding_amount;
  if (!padding_needed) {
    output_len = non_padded_output_len;
  }
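  // Worked example: a 10-byte tail encodes to ((10 + 2) / 3) * 4 = 16
  // characters; 10 % 3 == 1 leaves padding_amount == 2, so 14 characters
  // carry data and the last two slots are '=' when padding is requested.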
  __mmask64 output_mask = output_len == 64 ? (__mmask64)UINT64_MAX
                                           : ((__mmask64)1 << output_len) - 1;
  __m512i result = _mm512_mask_permutexvar_epi8(
      _mm512_set1_epi8('='), ((__mmask64)1 << non_padded_output_len) - 1,
      indices, lookup);
  _mm512_mask_storeu_epi8(reinterpret_cast<__m512i *>(out), output_mask,
                          result);
  return (size_t)(out - (uint8_t *)dst) + output_len;
}

template <bool base64_url, bool ignore_garbage>
static inline uint64_t to_base64_mask(block64 *b, uint64_t *error,
                                      uint64_t input_mask = UINT64_MAX) {
  __m512i input = b->chunks[0];
  const __m512i ascii_space_tbl = _mm512_set_epi8(
      0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 12, 0, 10,
      9, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0,
      0, 0, 32, 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32);
  __m512i lookup0;
  if (base64_url) {
    lookup0 = _mm512_set_epi8(
        -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53,
        52, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128, -128,
        -128, -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128,
        -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1,
        -128, -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -1);
  } else {
    lookup0 = _mm512_set_epi8(
        -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53,
        52, 63, -128, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128,
        -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128, -128,
        -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1, -128,
        -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -128);
  }
  __m512i lookup1;
  if (base64_url) {
    lookup1 = _mm512_set_epi8(
        -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42,
        41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128,
        63, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
        14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128);
  } else {
    lookup1 = _mm512_set_epi8(
        -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42,
        41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128,
        -128, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
        15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128);
  }

  const __m512i translated = _mm512_permutex2var_epi8(lookup0, input, lookup1);
  const __m512i combined = _mm512_or_si512(translated, input);
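  // Note: vpermi2b indexes the 128-byte table formed by lookup0:lookup1 with
  // the low 7 bits of each input byte; entries of -128 (0x80) mark invalid
  // characters. ORing with the input also forces the high bit for any input
  // byte >= 0x80, so the movemask below flags every non-base64 byte.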
  const __mmask64 mask = _mm512_movepi8_mask(combined) & input_mask;
  if (!ignore_garbage && mask) {
    const __mmask64 spaces =
        _mm512_cmpeq_epi8_mask(_mm512_shuffle_epi8(ascii_space_tbl, input),
                               input) &
        input_mask;
    *error = (mask ^ spaces);
  }
  b->chunks[0] = translated;

  return mask | (~input_mask);
}

static inline void copy_block(block64 *b, char *output) {
  _mm512_storeu_si512(reinterpret_cast<__m512i *>(output), b->chunks[0]);
}

static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
  uint64_t nmask = ~mask;
  __m512i c = _mm512_maskz_compress_epi8(nmask, b->chunks[0]);
  _mm512_storeu_si512(reinterpret_cast<__m512i *>(output), c);
  return _mm_popcnt_u64(nmask);
}

// The caller of this function is responsible for ensuring that there are 64
// bytes available for reading at src. The data is read into a block64
// structure.
static inline void load_block(block64 *b, const char *src) {
  b->chunks[0] = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src));
}

static inline void load_block_partial(block64 *b, const char *src,
                                      __mmask64 input_mask) {
  b->chunks[0] = _mm512_maskz_loadu_epi8(
      input_mask, reinterpret_cast<const __m512i *>(src));
}

// The caller of this function is responsible for ensuring that there are 128
// bytes available for reading at src. The data is read into a block64
// structure.
static inline void load_block(block64 *b, const char16_t *src) {
  __m512i m1 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src));
  __m512i m2 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src + 32));
  __m512i p = _mm512_packus_epi16(m1, m2);
  b->chunks[0] =
      _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), p);
}

static inline void load_block_partial(block64 *b, const char16_t *src,
                                      __mmask64 input_mask) {
  __m512i m1 = _mm512_maskz_loadu_epi16((__mmask32)input_mask,
                                        reinterpret_cast<const __m512i *>(src));
  __m512i m2 =
      _mm512_maskz_loadu_epi16((__mmask32)(input_mask >> 32),
                               reinterpret_cast<const __m512i *>(src + 32));
  __m512i p = _mm512_packus_epi16(m1, m2);
  b->chunks[0] =
      _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), p);
}

static inline void base64_decode(char *out, __m512i str) {
  const __m512i merge_ab_and_bc =
      _mm512_maddubs_epi16(str, _mm512_set1_epi32(0x01400140));
  const __m512i merged =
      _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000));
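  // Note: with four 6-bit values a, b, c, d in consecutive bytes, maddubs
  // computes (a << 6) | b and (c << 6) | d as 16-bit sums (multipliers 0x40
  // and 0x01), and madd then combines each pair into the 24-bit value
  // (ab << 12) | cd, i.e. the three decoded bytes per 32-bit lane.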
|
|
const __m512i pack = _mm512_set_epi8(
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, 61, 62, 56, 57, 58,
|
|
52, 53, 54, 48, 49, 50, 44, 45, 46, 40, 41, 42, 36, 37, 38, 32, 33, 34,
|
|
28, 29, 30, 24, 25, 26, 20, 21, 22, 16, 17, 18, 12, 13, 14, 8, 9, 10, 4,
|
|
5, 6, 0, 1, 2);
|
|
const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged);
|
|
_mm512_mask_storeu_epi8(
|
|
(__m512i *)out, 0xffffffffffff,
|
|
shuffled); // mask would be 0xffffffffffff since we write 48 bytes.
|
|
}
|
|
// decode 64 bytes and output 48 bytes
|
|
static inline void base64_decode_block(char *out, const char *src) {
|
|
base64_decode(out,
|
|
_mm512_loadu_si512(reinterpret_cast<const __m512i *>(src)));
|
|
}
|
|
static inline void base64_decode_block(char *out, block64 *b) {
|
|
base64_decode(out, b->chunks[0]);
|
|
}
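
// A minimal scalar sketch (illustrative only, not part of the library) of the
// 4-to-3 packing that base64_decode performs 16 times per 512-bit register:
// four 6-bit values are merged into a 24-bit triple and emitted as 3 bytes.
#if 0
static void base64_decode_scalar_sketch(char *out, const uint8_t sextets[4]) {
  // sextets[] holds already-validated 6-bit values (0..63).
  const uint32_t triple = (uint32_t(sextets[0]) << 18) |
                          (uint32_t(sextets[1]) << 12) |
                          (uint32_t(sextets[2]) << 6) | uint32_t(sextets[3]);
  out[0] = char(triple >> 16); // bits 23..16
  out[1] = char(triple >> 8);  // bits 15..8
  out[2] = char(triple);       // bits 7..0
}
#endif
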
template <bool base64_url, bool ignore_garbage, typename chartype>
|
|
full_result
|
|
compress_decode_base64(char *dst, const chartype *src, size_t srclen,
|
|
base64_options options,
|
|
last_chunk_handling_options last_chunk_options) {
|
|
(void)options;
|
|
const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
|
|
: tables::base64::to_base64_value;
|
|
size_t equallocation =
|
|
srclen; // location of the first padding character if any
|
|
size_t equalsigns = 0;
|
|
// skip trailing spaces
|
|
while (!ignore_garbage && srclen > 0 &&
|
|
scalar::base64::is_eight_byte(src[srclen - 1]) &&
|
|
to_base64[uint8_t(src[srclen - 1])] == 64) {
|
|
srclen--;
|
|
}
|
|
if (!ignore_garbage && srclen > 0 && src[srclen - 1] == '=') {
|
|
equallocation = srclen - 1;
|
|
srclen--;
|
|
equalsigns = 1;
|
|
// skip trailing spaces
|
|
while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
|
|
to_base64[uint8_t(src[srclen - 1])] == 64) {
|
|
srclen--;
|
|
}
|
|
if (srclen > 0 && src[srclen - 1] == '=') {
|
|
equallocation = srclen - 1;
|
|
srclen--;
|
|
equalsigns = 2;
|
|
}
|
|
}
|
|
if (srclen == 0) {
|
|
if (!ignore_garbage && equalsigns > 0) {
|
|
if (last_chunk_options == last_chunk_handling_options::strict) {
|
|
return {BASE64_INPUT_REMAINDER, 0, 0};
|
|
} else if (last_chunk_options ==
|
|
last_chunk_handling_options::stop_before_partial) {
|
|
return {SUCCESS, 0, 0};
|
|
}
|
|
return {INVALID_BASE64_CHARACTER, equallocation, 0};
|
|
}
|
|
return {SUCCESS, 0, 0};
|
|
}
|
|
const chartype *const srcinit = src;
|
|
const char *const dstinit = dst;
|
|
const chartype *const srcend = src + srclen;
|
|
|
|
// TODO: figure out why block_size == 2 is sometimes best.
constexpr size_t block_size = 6;
|
|
char buffer[block_size * 64];
|
|
char *bufferptr = buffer;
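// Buffering strategy: blocks containing ignored characters (e.g., spaces)
// are compressed into `buffer` until (block_size - 1) full 64-byte chunks
// accumulate and can be decoded; clean blocks bypass the buffer entirely
// while it is empty.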
if (srclen >= 64) {
|
|
const chartype *const srcend64 = src + srclen - 64;
|
|
while (src <= srcend64) {
|
|
block64 b;
|
|
load_block(&b, src);
|
|
src += 64;
|
|
uint64_t error = 0;
|
|
uint64_t badcharmask =
|
|
to_base64_mask<base64_url, ignore_garbage>(&b, &error);
|
|
if (!ignore_garbage && error) {
|
|
src -= 64;
|
|
size_t error_offset = _tzcnt_u64(error);
|
|
return {error_code::INVALID_BASE64_CHARACTER,
|
|
size_t(src - srcinit + error_offset), size_t(dst - dstinit)};
|
|
}
|
|
if (badcharmask != 0) {
|
|
// optimization opportunity: check for simple masks, such as those made of
// contiguous 1s followed by contiguous 0s, or masks containing a single
// bad character.
bufferptr += compress_block(&b, badcharmask, bufferptr);
|
|
} else if (bufferptr != buffer) {
|
|
copy_block(&b, bufferptr);
|
|
bufferptr += 64;
|
|
} else {
|
|
base64_decode_block(dst, &b);
|
|
dst += 48;
|
|
}
|
|
if (bufferptr >= (block_size - 1) * 64 + buffer) {
|
|
for (size_t i = 0; i < (block_size - 1); i++) {
|
|
base64_decode_block(dst, buffer + i * 64);
|
|
dst += 48;
|
|
}
|
|
std::memcpy(buffer, buffer + (block_size - 1) * 64,
|
|
64); // 64 might be too much
|
|
bufferptr -= (block_size - 1) * 64;
|
|
}
|
|
}
|
|
}
|
|
|
|
int last_block_len = (int)(srcend - src);
|
|
if (last_block_len != 0) {
|
|
__mmask64 input_mask = ((__mmask64)1 << last_block_len) - 1;
|
|
block64 b;
|
|
load_block_partial(&b, src, input_mask);
|
|
uint64_t error = 0;
|
|
uint64_t badcharmask =
|
|
to_base64_mask<base64_url, ignore_garbage>(&b, &error, input_mask);
|
|
if (!ignore_garbage && error) {
|
|
size_t error_offset = _tzcnt_u64(error);
|
|
return {error_code::INVALID_BASE64_CHARACTER,
|
|
size_t(src - srcinit + error_offset), size_t(dst - dstinit)};
|
|
}
|
|
src += last_block_len;
|
|
bufferptr += compress_block(&b, badcharmask, bufferptr);
|
|
}
|
|
|
|
char *buffer_start = buffer;
|
|
for (; buffer_start + 64 <= bufferptr; buffer_start += 64) {
|
|
base64_decode_block(dst, buffer_start);
|
|
dst += 48;
|
|
}
|
|
|
|
if ((bufferptr - buffer_start) != 0) {
|
|
size_t rem = (bufferptr - buffer_start);
|
|
int idx = rem % 4;
|
|
__mmask64 mask = ((__mmask64)1 << rem) - 1;
|
|
__m512i input = _mm512_maskz_loadu_epi8(mask, buffer_start);
|
|
size_t output_len = (rem / 4) * 3;
|
|
__mmask64 output_mask = mask >> (rem - output_len);
|
|
const __m512i merge_ab_and_bc =
|
|
_mm512_maddubs_epi16(input, _mm512_set1_epi32(0x01400140));
|
|
const __m512i merged =
|
|
_mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000));
|
|
const __m512i pack = _mm512_set_epi8(
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, 61, 62, 56, 57, 58,
|
|
52, 53, 54, 48, 49, 50, 44, 45, 46, 40, 41, 42, 36, 37, 38, 32, 33, 34,
|
|
28, 29, 30, 24, 25, 26, 20, 21, 22, 16, 17, 18, 12, 13, 14, 8, 9, 10, 4,
|
|
5, 6, 0, 1, 2);
|
|
const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged);
|
|
|
|
if (!ignore_garbage &&
|
|
last_chunk_options == last_chunk_handling_options::strict &&
|
|
(idx != 1) && ((idx + equalsigns) & 3) != 0) {
|
|
// The partial chunk was at src - idx
|
|
_mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
|
|
dst += output_len;
|
|
return {BASE64_INPUT_REMAINDER, size_t(src - srcinit),
|
|
size_t(dst - dstinit)};
|
|
} else if (!ignore_garbage &&
|
|
last_chunk_options ==
|
|
last_chunk_handling_options::stop_before_partial &&
|
|
(idx != 1) && ((idx + equalsigns) & 3) != 0) {
|
|
// Rewind src to before partial chunk
|
|
_mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
|
|
dst += output_len;
|
|
src -= idx;
|
|
} else {
|
|
if (idx == 2) {
|
|
if (!ignore_garbage &&
|
|
last_chunk_options == last_chunk_handling_options::strict) {
|
|
uint32_t triple = (uint32_t(bufferptr[-2]) << 3 * 6) +
|
|
(uint32_t(bufferptr[-1]) << 2 * 6);
|
|
if (triple & 0xffff) {
|
|
_mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
|
|
dst += output_len;
|
|
return {BASE64_EXTRA_BITS, size_t(src - srcinit),
|
|
size_t(dst - dstinit)};
|
|
}
|
|
}
|
|
output_mask = (output_mask << 1) | 1;
|
|
output_len += 1;
|
|
_mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
|
|
dst += output_len;
|
|
} else if (idx == 3) {
|
|
if (!ignore_garbage &&
|
|
last_chunk_options == last_chunk_handling_options::strict) {
|
|
uint32_t triple = (uint32_t(bufferptr[-3]) << 3 * 6) +
|
|
(uint32_t(bufferptr[-2]) << 2 * 6) +
|
|
(uint32_t(bufferptr[-1]) << 1 * 6);
|
|
if (triple & 0xff) {
|
|
_mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
|
|
dst += output_len;
|
|
return {BASE64_EXTRA_BITS, size_t(src - srcinit),
|
|
size_t(dst - dstinit)};
|
|
}
|
|
}
|
|
output_mask = (output_mask << 2) | 3;
|
|
output_len += 2;
|
|
_mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
|
|
dst += output_len;
|
|
} else if (!ignore_garbage && idx == 1) {
|
|
_mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
|
|
dst += output_len;
|
|
return {BASE64_INPUT_REMAINDER, size_t(src - srcinit),
|
|
size_t(dst - dstinit)};
|
|
} else {
|
|
_mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
|
|
dst += output_len;
|
|
}
|
|
}
|
|
|
|
if (!ignore_garbage && last_chunk_options != stop_before_partial &&
|
|
equalsigns > 0) {
|
|
size_t output_count = size_t(dst - dstinit);
|
|
if ((output_count % 3 == 0) ||
|
|
((output_count % 3) + 1 + equalsigns != 4)) {
|
|
return {INVALID_BASE64_CHARACTER, equallocation, output_count};
|
|
}
|
|
}
|
|
|
|
return {SUCCESS, srclen, size_t(dst - dstinit)};
|
|
}
|
|
|
|
if (!ignore_garbage && equalsigns > 0) {
|
|
if (last_chunk_options == last_chunk_handling_options::strict) {
|
|
return {BASE64_INPUT_REMAINDER, size_t(src - srcinit),
|
|
size_t(dst - dstinit)};
|
|
}
|
|
if (last_chunk_options ==
|
|
last_chunk_handling_options::stop_before_partial) {
|
|
return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)};
|
|
}
|
|
if ((size_t(dst - dstinit) % 3 == 0) ||
|
|
((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
|
|
return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)};
|
|
}
|
|
}
|
|
return {SUCCESS, srclen, size_t(dst - dstinit)};
|
|
}
|
|
/* end file src/icelake/icelake_base64.inl.cpp */
|
|
#endif // SIMDUTF_FEATURE_BASE64
|
|
|
|
#include <cstdint>
|
|
|
|
} // namespace
|
|
} // namespace icelake
|
|
} // namespace simdutf
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
|
|
/* begin file src/generic/utf32.h */
|
|
#include <limits>
|
|
|
|
namespace simdutf {
|
|
namespace icelake {
|
|
namespace {
|
|
namespace utf32 {
|
|
|
|
template <typename T> T min(T a, T b) { return a <= b ? a : b; }
|
|
|
|
simdutf_really_inline size_t utf8_length_from_utf32(const char32_t *input,
|
|
size_t length) {
|
|
using vector_u32 = simd32<uint32_t>;
|
|
|
|
const char32_t *start = input;
|
|
|
|
// we add up to three ones in a single iteration (see the vectorized loop in
|
|
// section #2 below)
|
|
const size_t max_increment = 3;
|
|
|
|
const size_t N = vector_u32::ELEMENTS;
|
|
|
|
#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
|
|
const auto v_0000007f = vector_u32::splat(0x0000007f);
|
|
const auto v_000007ff = vector_u32::splat(0x000007ff);
|
|
const auto v_0000ffff = vector_u32::splat(0x0000ffff);
|
|
#else
|
|
const auto v_ffffff80 = vector_u32::splat(0xffffff80);
|
|
const auto v_fffff800 = vector_u32::splat(0xfffff800);
|
|
const auto v_ffff0000 = vector_u32::splat(0xffff0000);
|
|
const auto one = vector_u32::splat(1);
|
|
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP
|
|
|
|
size_t counter = 0;
|
|
|
|
// 1. vectorized loop unrolled 4 times
|
|
{
|
|
// we use vector of uint32 counters, this is why this limit is used
|
|
const size_t max_iterations =
|
|
std::numeric_limits<uint32_t>::max() / (max_increment * 4);
|
|
size_t blocks = length / (N * 4);
|
|
length -= blocks * (N * 4);
|
|
while (blocks != 0) {
|
|
const size_t iterations = min(blocks, max_iterations);
|
|
blocks -= iterations;
|
|
|
|
simd32<uint32_t> acc = vector_u32::zero();
|
|
for (size_t i = 0; i < iterations; i++) {
|
|
const auto in0 = vector_u32(input + 0 * N);
|
|
const auto in1 = vector_u32(input + 1 * N);
|
|
const auto in2 = vector_u32(input + 2 * N);
|
|
const auto in3 = vector_u32(input + 3 * N);
|
|
|
|
#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
|
|
acc -= as_vector_u32(in0 > v_0000007f);
|
|
acc -= as_vector_u32(in1 > v_0000007f);
|
|
acc -= as_vector_u32(in2 > v_0000007f);
|
|
acc -= as_vector_u32(in3 > v_0000007f);
|
|
|
|
acc -= as_vector_u32(in0 > v_000007ff);
|
|
acc -= as_vector_u32(in1 > v_000007ff);
|
|
acc -= as_vector_u32(in2 > v_000007ff);
|
|
acc -= as_vector_u32(in3 > v_000007ff);
|
|
|
|
acc -= as_vector_u32(in0 > v_0000ffff);
|
|
acc -= as_vector_u32(in1 > v_0000ffff);
|
|
acc -= as_vector_u32(in2 > v_0000ffff);
|
|
acc -= as_vector_u32(in3 > v_0000ffff);
|
|
#else
|
|
acc += min(one, in0 & v_ffffff80);
|
|
acc += min(one, in1 & v_ffffff80);
|
|
acc += min(one, in2 & v_ffffff80);
|
|
acc += min(one, in3 & v_ffffff80);
|
|
|
|
acc += min(one, in0 & v_fffff800);
|
|
acc += min(one, in1 & v_fffff800);
|
|
acc += min(one, in2 & v_fffff800);
|
|
acc += min(one, in3 & v_fffff800);
|
|
|
|
acc += min(one, in0 & v_ffff0000);
|
|
acc += min(one, in1 & v_ffff0000);
|
|
acc += min(one, in2 & v_ffff0000);
|
|
acc += min(one, in3 & v_ffff0000);
|
|
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP
|
|
|
|
input += 4 * N;
|
|
}
|
|
|
|
counter += acc.sum();
|
|
}
|
|
}
|
|
|
|
// 2. vectorized loop for tail
|
|
{
|
|
const size_t max_iterations =
|
|
std::numeric_limits<uint32_t>::max() / max_increment;
|
|
size_t blocks = length / N;
|
|
length -= blocks * N;
|
|
while (blocks != 0) {
|
|
const size_t iterations = min(blocks, max_iterations);
|
|
blocks -= iterations;
|
|
|
|
auto acc = vector_u32::zero();
|
|
for (size_t i = 0; i < iterations; i++) {
|
|
const auto in = vector_u32(input);
|
|
|
|
#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
|
|
acc -= as_vector_u32(in > v_0000007f);
|
|
acc -= as_vector_u32(in > v_000007ff);
|
|
acc -= as_vector_u32(in > v_0000ffff);
|
|
#else
|
|
acc += min(one, in & v_ffffff80);
|
|
acc += min(one, in & v_fffff800);
|
|
acc += min(one, in & v_ffff0000);
|
|
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP
|
|
|
|
input += N;
|
|
}
|
|
|
|
counter += acc.sum();
|
|
}
|
|
}
|
|
|
|
const size_t consumed = input - start;
|
|
if (consumed != 0) {
|
|
// The vectorized loops above count only the extra bytes (beyond the first)
// of each code point, which is why we add one byte per consumed code point
// here.
counter += consumed;
|
|
}
|
|
|
|
return counter + scalar::utf32::utf8_length_from_utf32(input, length);
|
|
}
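
// A scalar sketch (illustrative only, not part of the library) of the
// counting scheme used above: every code point contributes one byte via
// `consumed`, and the loops add one extra byte per threshold exceeded.
#if 0
static size_t utf8_length_from_utf32_scalar_sketch(const char32_t *input,
                                                   size_t length) {
  size_t count = length; // one byte per code point
  for (size_t i = 0; i < length; i++) {
    const uint32_t c = uint32_t(input[i]);
    count += (c > 0x7f);   // two or more UTF-8 bytes
    count += (c > 0x7ff);  // three or more UTF-8 bytes
    count += (c > 0xffff); // four UTF-8 bytes
  }
  return count;
}
#endif
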
} // namespace utf32
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf32.h */
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
|
|
|
|
namespace simdutf {
|
|
namespace icelake {
|
|
|
|
#if SIMDUTF_FEATURE_DETECT_ENCODING
|
|
simdutf_warn_unused int
|
|
implementation::detect_encodings(const char *input,
|
|
size_t length) const noexcept {
|
|
// If there is a BOM, then we trust it.
|
|
auto bom_encoding = simdutf::BOM::check_bom(input, length);
|
|
if (bom_encoding != encoding_type::unspecified) {
|
|
return bom_encoding;
|
|
}
|
|
|
|
int out = 0;
|
|
uint32_t utf16_err = (length % 2);
|
|
uint32_t utf32_err = (length % 4);
|
|
uint32_t ends_with_high = 0;
|
|
avx512_utf8_checker checker{};
|
|
const __m512i offset = _mm512_set1_epi32((uint32_t)0xffff2000);
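// Adding 0xffff2000 maps the surrogate range [0xD800, 0xDFFF] to values
// above 0xfffff7ff, so a single unsigned maximum against that bound later
// detects surrogates in candidate UTF-32 data.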
__m512i currentmax = _mm512_setzero_si512();
|
|
__m512i currentoffsetmax = _mm512_setzero_si512();
|
|
const char *ptr = input;
|
|
const char *end = ptr + length;
|
|
for (; end - ptr >= 64; ptr += 64) {
|
|
// utf8 checks
|
|
const __m512i data = _mm512_loadu_si512((const __m512i *)ptr);
|
|
checker.check_next_input(data);
|
|
|
|
// utf16le_checks
|
|
__m512i diff = _mm512_sub_epi16(data, _mm512_set1_epi16(uint16_t(0xD800)));
|
|
__mmask32 surrogates =
|
|
_mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
|
|
__mmask32 highsurrogates =
|
|
_mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
|
|
__mmask32 lowsurrogates = surrogates ^ highsurrogates;
|
|
utf16_err |= (((highsurrogates << 1) | ends_with_high) != lowsurrogates);
|
|
ends_with_high = ((highsurrogates & 0x80000000) != 0);
|
|
|
|
// utf32le checks
|
|
currentoffsetmax =
|
|
_mm512_max_epu32(_mm512_add_epi32(data, offset), currentoffsetmax);
|
|
currentmax = _mm512_max_epu32(data, currentmax);
|
|
}
|
|
|
|
// last block with 0 <= len < 64
|
|
__mmask64 read_mask = (__mmask64(1) << (end - ptr)) - 1;
|
|
const __m512i data = _mm512_maskz_loadu_epi8(read_mask, (const __m512i *)ptr);
|
|
checker.check_next_input(data);
|
|
|
|
__m512i diff = _mm512_sub_epi16(data, _mm512_set1_epi16(uint16_t(0xD800)));
|
|
__mmask32 surrogates =
|
|
_mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
|
|
__mmask32 highsurrogates =
|
|
_mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
|
|
__mmask32 lowsurrogates = surrogates ^ highsurrogates;
|
|
utf16_err |= (((highsurrogates << 1) | ends_with_high) != lowsurrogates);
|
|
|
|
currentoffsetmax =
|
|
_mm512_max_epu32(_mm512_add_epi32(data, offset), currentoffsetmax);
|
|
currentmax = _mm512_max_epu32(data, currentmax);
|
|
|
|
const __m512i standardmax = _mm512_set1_epi32((uint32_t)0x10ffff);
|
|
const __m512i standardoffsetmax = _mm512_set1_epi32((uint32_t)0xfffff7ff);
|
|
__m512i is_zero =
|
|
_mm512_xor_si512(_mm512_max_epu32(currentmax, standardmax), standardmax);
|
|
utf32_err |= (_mm512_test_epi8_mask(is_zero, is_zero) != 0);
|
|
is_zero = _mm512_xor_si512(
|
|
_mm512_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
|
|
utf32_err |= (_mm512_test_epi8_mask(is_zero, is_zero) != 0);
|
|
checker.check_eof();
|
|
bool is_valid_utf8 = !checker.errors();
|
|
if (is_valid_utf8) {
|
|
out |= encoding_type::UTF8;
|
|
}
|
|
if (utf16_err == 0) {
|
|
out |= encoding_type::UTF16_LE;
|
|
}
|
|
if (utf32_err == 0) {
|
|
out |= encoding_type::UTF32_LE;
|
|
}
|
|
return out;
|
|
}
|
|
#endif // SIMDUTF_FEATURE_DETECT_ENCODING
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
|
|
simdutf_warn_unused bool
|
|
implementation::validate_utf8(const char *buf, size_t len) const noexcept {
|
|
if (simdutf_unlikely(len == 0)) {
|
|
return true;
|
|
}
|
|
avx512_utf8_checker checker{};
|
|
const char *ptr = buf;
|
|
const char *end = ptr + len;
|
|
for (; end - ptr >= 64; ptr += 64) {
|
|
const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
|
|
checker.check_next_input(utf8);
|
|
}
|
|
if (end != ptr) {
|
|
const __m512i utf8 = _mm512_maskz_loadu_epi8(
|
|
~UINT64_C(0) >> (64 - (end - ptr)), (const __m512i *)ptr);
|
|
checker.check_next_input(utf8);
|
|
}
|
|
checker.check_eof();
|
|
return !checker.errors();
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
|
|
|
|
#if SIMDUTF_FEATURE_UTF8
|
|
simdutf_warn_unused result implementation::validate_utf8_with_errors(
|
|
const char *buf, size_t len) const noexcept {
|
|
if (simdutf_unlikely(len == 0)) {
|
|
return result(error_code::SUCCESS, len);
|
|
}
|
|
avx512_utf8_checker checker{};
|
|
const char *ptr = buf;
|
|
const char *end = ptr + len;
|
|
size_t count{0};
|
|
for (; end - ptr >= 64; ptr += 64) {
|
|
const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
|
|
checker.check_next_input(utf8);
|
|
if (checker.errors()) {
|
|
if (count != 0) {
|
|
count--;
|
|
} // Sometimes the error is only detected in the next chunk
|
|
result res = scalar::utf8::rewind_and_validate_with_errors(
|
|
reinterpret_cast<const char *>(buf),
|
|
reinterpret_cast<const char *>(buf + count), len - count);
|
|
res.count += count;
|
|
return res;
|
|
}
|
|
count += 64;
|
|
}
|
|
if (end != ptr) {
|
|
const __m512i utf8 = _mm512_maskz_loadu_epi8(
|
|
~UINT64_C(0) >> (64 - (end - ptr)), (const __m512i *)ptr);
|
|
checker.check_next_input(utf8);
|
|
}
|
|
checker.check_eof();
|
|
if (checker.errors()) {
|
|
if (count != 0) {
|
|
count--;
|
|
} // Sometimes the error is only detected in the next chunk
|
|
result res = scalar::utf8::rewind_and_validate_with_errors(
|
|
reinterpret_cast<const char *>(buf),
|
|
reinterpret_cast<const char *>(buf + count), len - count);
|
|
res.count += count;
|
|
return res;
|
|
}
|
|
return result(error_code::SUCCESS, len);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8
|
|
|
|
#if SIMDUTF_FEATURE_ASCII
|
|
simdutf_warn_unused bool
|
|
implementation::validate_ascii(const char *buf, size_t len) const noexcept {
|
|
return icelake::validate_ascii(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::validate_ascii_with_errors(
|
|
const char *buf, size_t len) const noexcept {
|
|
const char *buf_orig = buf;
|
|
const char *end = buf + len;
|
|
const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80);
|
|
for (; end - buf >= 64; buf += 64) {
|
|
const __m512i input = _mm512_loadu_si512((const __m512i *)buf);
|
|
__mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT);
|
|
if (notascii) {
|
|
return result(error_code::TOO_LARGE,
|
|
buf - buf_orig + _tzcnt_u64(notascii));
|
|
}
|
|
}
|
|
if (end != buf) {
|
|
const __m512i input = _mm512_maskz_loadu_epi8(
|
|
~UINT64_C(0) >> (64 - (end - buf)), (const __m512i *)buf);
|
|
__mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT);
|
|
if (notascii) {
|
|
return result(error_code::TOO_LARGE,
|
|
buf - buf_orig + _tzcnt_u64(notascii));
|
|
}
|
|
}
|
|
return result(error_code::SUCCESS, len);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_ASCII
|
|
|
|
#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
|
|
simdutf_warn_unused bool
|
|
implementation::validate_utf16le(const char16_t *buf,
|
|
size_t len) const noexcept {
|
|
const char16_t *end = buf + len;
|
|
|
|
for (; end - buf >= 32;) {
|
|
__m512i in = _mm512_loadu_si512((__m512i *)buf);
|
|
__m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
|
|
__mmask32 surrogates =
|
|
_mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
|
|
if (surrogates) {
|
|
__mmask32 highsurrogates =
|
|
_mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
|
|
__mmask32 lowsurrogates = surrogates ^ highsurrogates;
|
|
// high must be followed by low
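// e.g., for code units 0xD800 0xDC00 0x0041 (a valid pair, then a BMP
// character): surrogates = 0b011, highsurrogates = 0b001,
// lowsurrogates = 0b010, and (highsurrogates << 1) == lowsurrogates
// confirms correct pairing.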
if ((highsurrogates << 1) != lowsurrogates) {
|
|
return false;
|
|
}
|
|
bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
|
|
if (ends_with_high) {
|
|
buf += 31; // advance only by 31 code units so that we start with the
|
|
// high surrogate on the next round.
|
|
} else {
|
|
buf += 32;
|
|
}
|
|
} else {
|
|
buf += 32;
|
|
}
|
|
}
|
|
if (buf < end) {
|
|
__m512i in =
|
|
_mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i *)buf);
|
|
__m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
|
|
__mmask32 surrogates =
|
|
_mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
|
|
if (surrogates) {
|
|
__mmask32 highsurrogates =
|
|
_mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
|
|
__mmask32 lowsurrogates = surrogates ^ highsurrogates;
|
|
// high must be followed by low
|
|
if ((highsurrogates << 1) != lowsurrogates) {
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
|
|
|
|
#if SIMDUTF_FEATURE_UTF16
|
|
simdutf_warn_unused bool
|
|
implementation::validate_utf16be(const char16_t *buf,
|
|
size_t len) const noexcept {
|
|
const char16_t *end = buf + len;
|
|
const __m512i byteflip = _mm512_setr_epi64(
|
|
0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809,
|
|
0x0607040502030001, 0x0e0f0c0d0a0b0809);
|
|
for (; end - buf >= 32;) {
|
|
__m512i in =
|
|
_mm512_shuffle_epi8(_mm512_loadu_si512((__m512i *)buf), byteflip);
|
|
__m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
|
|
__mmask32 surrogates =
|
|
_mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
|
|
if (surrogates) {
|
|
__mmask32 highsurrogates =
|
|
_mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
|
|
__mmask32 lowsurrogates = surrogates ^ highsurrogates;
|
|
// high must be followed by low
|
|
if ((highsurrogates << 1) != lowsurrogates) {
|
|
return false;
|
|
}
|
|
bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
|
|
if (ends_with_high) {
|
|
buf += 31; // advance only by 31 code units so that we start with the
|
|
// high surrogate on the next round.
|
|
} else {
|
|
buf += 32;
|
|
}
|
|
} else {
|
|
buf += 32;
|
|
}
|
|
}
|
|
if (buf < end) {
|
|
__m512i in = _mm512_shuffle_epi8(
|
|
_mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i *)buf),
|
|
byteflip);
|
|
__m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
|
|
__mmask32 surrogates =
|
|
_mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
|
|
if (surrogates) {
|
|
__mmask32 highsurrogates =
|
|
_mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
|
|
__mmask32 lowsurrogates = surrogates ^ highsurrogates;
|
|
// high must be followed by low
|
|
if ((highsurrogates << 1) != lowsurrogates) {
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::validate_utf16le_with_errors(
|
|
const char16_t *buf, size_t len) const noexcept {
|
|
const char16_t *start_buf = buf;
|
|
const char16_t *end = buf + len;
|
|
for (; end - buf >= 32;) {
|
|
__m512i in = _mm512_loadu_si512((__m512i *)buf);
|
|
__m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
|
|
__mmask32 surrogates =
|
|
_mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
|
|
if (surrogates) {
|
|
__mmask32 highsurrogates =
|
|
_mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
|
|
__mmask32 lowsurrogates = surrogates ^ highsurrogates;
|
|
// high must be followed by low
|
|
if ((highsurrogates << 1) != lowsurrogates) {
|
|
uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1));
|
|
uint32_t extra_high =
|
|
_tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1));
|
|
return result(error_code::SURROGATE,
|
|
(buf - start_buf) +
|
|
(extra_low < extra_high ? extra_low : extra_high));
|
|
}
|
|
bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
|
|
if (ends_with_high) {
|
|
buf += 31; // advance only by 31 code units so that we start with the
|
|
// high surrogate on the next round.
|
|
} else {
|
|
buf += 32;
|
|
}
|
|
} else {
|
|
buf += 32;
|
|
}
|
|
}
|
|
if (buf < end) {
|
|
__m512i in =
|
|
_mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i *)buf);
|
|
__m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
|
|
__mmask32 surrogates =
|
|
_mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
|
|
if (surrogates) {
|
|
__mmask32 highsurrogates =
|
|
_mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
|
|
__mmask32 lowsurrogates = surrogates ^ highsurrogates;
|
|
// high must be followed by low
|
|
if ((highsurrogates << 1) != lowsurrogates) {
|
|
uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1));
|
|
uint32_t extra_high =
|
|
_tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1));
|
|
return result(error_code::SURROGATE,
|
|
(buf - start_buf) +
|
|
(extra_low < extra_high ? extra_low : extra_high));
|
|
}
|
|
}
|
|
}
|
|
return result(error_code::SUCCESS, len);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::validate_utf16be_with_errors(
|
|
const char16_t *buf, size_t len) const noexcept {
|
|
const char16_t *start_buf = buf;
|
|
const char16_t *end = buf + len;
|
|
const __m512i byteflip = _mm512_setr_epi64(
|
|
0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809,
|
|
0x0607040502030001, 0x0e0f0c0d0a0b0809);
|
|
for (; end - buf >= 32;) {
|
|
__m512i in =
|
|
_mm512_shuffle_epi8(_mm512_loadu_si512((__m512i *)buf), byteflip);
|
|
__m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
|
|
__mmask32 surrogates =
|
|
_mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
|
|
if (surrogates) {
|
|
__mmask32 highsurrogates =
|
|
_mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
|
|
__mmask32 lowsurrogates = surrogates ^ highsurrogates;
|
|
// high must be followed by low
|
|
if ((highsurrogates << 1) != lowsurrogates) {
|
|
uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1));
|
|
uint32_t extra_high =
|
|
_tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1));
|
|
return result(error_code::SURROGATE,
|
|
(buf - start_buf) +
|
|
(extra_low < extra_high ? extra_low : extra_high));
|
|
}
|
|
bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
|
|
if (ends_with_high) {
|
|
buf += 31; // advance only by 31 code units so that we start with the
|
|
// high surrogate on the next round.
|
|
} else {
|
|
buf += 32;
|
|
}
|
|
} else {
|
|
buf += 32;
|
|
}
|
|
}
|
|
if (buf < end) {
|
|
__m512i in = _mm512_shuffle_epi8(
|
|
_mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i *)buf),
|
|
byteflip);
|
|
__m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
|
|
__mmask32 surrogates =
|
|
_mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
|
|
if (surrogates) {
|
|
__mmask32 highsurrogates =
|
|
_mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
|
|
__mmask32 lowsurrogates = surrogates ^ highsurrogates;
|
|
// high must be followed by low
|
|
if ((highsurrogates << 1) != lowsurrogates) {
|
|
uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1));
|
|
uint32_t extra_high =
|
|
_tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1));
|
|
return result(error_code::SURROGATE,
|
|
(buf - start_buf) +
|
|
(extra_low < extra_high ? extra_low : extra_high));
|
|
}
|
|
}
|
|
}
|
|
return result(error_code::SUCCESS, len);
|
|
}
|
|
|
|
void implementation::to_well_formed_utf16le(const char16_t *input, size_t len,
|
|
char16_t *output) const noexcept {
|
|
return utf16fix_avx512<endianness::LITTLE>(input, len, output);
|
|
}
|
|
|
|
void implementation::to_well_formed_utf16be(const char16_t *input, size_t len,
|
|
char16_t *output) const noexcept {
|
|
return utf16fix_avx512<endianness::BIG>(input, len, output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF16
|
|
|
|
#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
|
|
simdutf_warn_unused bool
|
|
implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
|
|
return icelake::validate_utf32(buf, len);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
|
|
|
|
#if SIMDUTF_FEATURE_UTF32
|
|
simdutf_warn_unused result implementation::validate_utf32_with_errors(
|
|
const char32_t *buf, size_t len) const noexcept {
|
|
const char32_t *buf_orig = buf;
|
|
if (len >= 16) {
|
|
const char32_t *end = buf + len - 16;
|
|
while (buf <= end) {
|
|
__m512i utf32 = _mm512_loadu_si512((const __m512i *)buf);
|
|
__mmask16 outside_range = _mm512_cmp_epu32_mask(
|
|
utf32, _mm512_set1_epi32(0x10ffff), _MM_CMPINT_GT);
|
|
|
|
__m512i utf32_off =
|
|
_mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000));
|
|
|
|
__mmask16 surrogate_range = _mm512_cmp_epu32_mask(
|
|
utf32_off, _mm512_set1_epi32(0xfffff7ff), _MM_CMPINT_GT);
|
|
if ((outside_range | surrogate_range)) {
|
|
auto outside_idx = _tzcnt_u32(outside_range);
|
|
auto surrogate_idx = _tzcnt_u32(surrogate_range);
|
|
|
|
if (outside_idx < surrogate_idx) {
|
|
return result(error_code::TOO_LARGE, buf - buf_orig + outside_idx);
|
|
}
|
|
|
|
return result(error_code::SURROGATE, buf - buf_orig + surrogate_idx);
|
|
}
|
|
|
|
buf += 16;
|
|
}
|
|
}
|
|
if (len > 0) {
|
|
__m512i utf32 = _mm512_maskz_loadu_epi32(
|
|
__mmask16((1U << (buf_orig + len - buf)) - 1), (const __m512i *)buf);
|
|
__mmask16 outside_range = _mm512_cmp_epu32_mask(
|
|
utf32, _mm512_set1_epi32(0x10ffff), _MM_CMPINT_GT);
|
|
__m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000));
|
|
|
|
__mmask16 surrogate_range = _mm512_cmp_epu32_mask(
|
|
utf32_off, _mm512_set1_epi32(0xfffff7ff), _MM_CMPINT_GT);
|
|
if ((outside_range | surrogate_range)) {
|
|
auto outside_idx = _tzcnt_u32(outside_range);
|
|
auto surrogate_idx = _tzcnt_u32(surrogate_range);
|
|
|
|
if (outside_idx < surrogate_idx) {
|
|
return result(error_code::TOO_LARGE, buf - buf_orig + outside_idx);
|
|
}
|
|
|
|
return result(error_code::SURROGATE, buf - buf_orig + surrogate_idx);
|
|
}
|
|
}
|
|
|
|
return result(error_code::SUCCESS, len);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF32
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
|
|
simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
|
|
const char *buf, size_t len, char *utf8_output) const noexcept {
|
|
return icelake::latin1_to_utf8_avx512_start(buf, len, utf8_output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
|
|
|
|
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
|
|
simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
|
|
const char *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
return icelake_convert_latin1_to_utf16<endianness::LITTLE>(buf, len,
|
|
utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
|
|
const char *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
return icelake_convert_latin1_to_utf16<endianness::BIG>(buf, len,
|
|
utf16_output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
|
|
|
|
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
|
|
simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
|
|
const char *buf, size_t len, char32_t *utf32_output) const noexcept {
|
|
avx512_convert_latin1_to_utf32(buf, len, utf32_output);
|
|
return len;
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
|
|
simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
|
|
const char *buf, size_t len, char *latin1_output) const noexcept {
|
|
return icelake::utf8_to_latin1_avx512(buf, len, latin1_output);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
|
|
const char *buf, size_t len, char *latin1_output) const noexcept {
|
|
// First, try to convert as much as possible using the SIMD implementation.
|
|
const char *obuf = buf;
|
|
char *olatin1_output = latin1_output;
|
|
size_t written = icelake::utf8_to_latin1_avx512(obuf, len, olatin1_output);
|
|
|
|
// If we have completely converted the string
|
|
if (obuf == buf + len) {
|
|
return {simdutf::SUCCESS, written};
|
|
}
|
|
size_t pos = obuf - buf;
|
|
result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
|
|
pos, buf + pos, len - pos, latin1_output);
|
|
res.count += pos;
|
|
return res;
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
|
|
const char *buf, size_t len, char *latin1_output) const noexcept {
|
|
return icelake::valid_utf8_to_latin1_avx512(buf, len, latin1_output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
|
|
simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
|
|
const char *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
utf8_to_utf16_result ret =
|
|
fast_avx512_convert_utf8_to_utf16<endianness::LITTLE>(buf, len,
|
|
utf16_output);
|
|
if (ret.second == nullptr) {
|
|
return 0;
|
|
}
|
|
return ret.second - utf16_output;
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
|
|
const char *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
utf8_to_utf16_result ret = fast_avx512_convert_utf8_to_utf16<endianness::BIG>(
|
|
buf, len, utf16_output);
|
|
if (ret.second == nullptr) {
|
|
return 0;
|
|
}
|
|
return ret.second - utf16_output;
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
|
|
const char *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
return fast_avx512_convert_utf8_to_utf16_with_errors<endianness::LITTLE>(
|
|
buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
|
|
const char *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
return fast_avx512_convert_utf8_to_utf16_with_errors<endianness::BIG>(
|
|
buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
|
|
const char *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
utf8_to_utf16_result ret =
|
|
icelake::valid_utf8_to_fixed_length<endianness::LITTLE, char16_t>(
|
|
buf, len, utf16_output);
|
|
size_t saved_bytes = ret.second - utf16_output;
|
|
const char *end = buf + len;
|
|
if (ret.first == end) {
|
|
return saved_bytes;
|
|
}
|
|
|
|
// Note: the AVX512 procedure looks ahead 4 bytes and correctly converts
// multi-byte characters even if their continuation bytes lie outside the
// 16-byte window. It means we have to skip continuation bytes at the
// beginning of ret.first, as they were already consumed.
while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
|
|
ret.first += 1;
|
|
}
|
|
|
|
if (ret.first != end) {
|
|
const size_t scalar_saved_bytes =
|
|
scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(
|
|
ret.first, len - (ret.first - buf), ret.second);
|
|
if (scalar_saved_bytes == 0) {
|
|
return 0;
|
|
}
|
|
saved_bytes += scalar_saved_bytes;
|
|
}
|
|
|
|
return saved_bytes;
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
|
|
const char *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
utf8_to_utf16_result ret =
|
|
icelake::valid_utf8_to_fixed_length<endianness::BIG, char16_t>(
|
|
buf, len, utf16_output);
|
|
size_t saved_bytes = ret.second - utf16_output;
|
|
const char *end = buf + len;
|
|
if (ret.first == end) {
|
|
return saved_bytes;
|
|
}
|
|
|
|
// Note: the AVX512 procedure looks ahead 4 bytes and correctly converts
// multi-byte characters even if their continuation bytes lie outside the
// 16-byte window. It means we have to skip continuation bytes at the
// beginning of ret.first, as they were already consumed.
while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
|
|
ret.first += 1;
|
|
}
|
|
|
|
if (ret.first != end) {
|
|
const size_t scalar_saved_bytes =
|
|
scalar::utf8_to_utf16::convert_valid<endianness::BIG>(
|
|
ret.first, len - (ret.first - buf), ret.second);
|
|
if (scalar_saved_bytes == 0) {
|
|
return 0;
|
|
}
|
|
saved_bytes += scalar_saved_bytes;
|
|
}
|
|
|
|
return saved_bytes;
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
|
|
simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
|
|
const char *buf, size_t len, char32_t *utf32_out) const noexcept {
|
|
uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
|
|
utf8_to_utf32_result ret =
|
|
icelake::validating_utf8_to_fixed_length<endianness::LITTLE, uint32_t>(
|
|
buf, len, utf32_output);
|
|
if (ret.second == nullptr)
|
|
return 0;
|
|
|
|
size_t saved_bytes = ret.second - utf32_output;
|
|
const char *end = buf + len;
|
|
if (ret.first == end) {
|
|
return saved_bytes;
|
|
}
|
|
|
|
// Note: the AVX512 procedure looks ahead 4 bytes and correctly converts
// multi-byte characters even if their continuation bytes lie outside the
// 16-byte window. It means we have to skip continuation bytes at the
// beginning of ret.first, as they were already consumed.
while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
|
|
ret.first += 1;
|
|
}
|
|
if (ret.first != end) {
|
|
const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert(
|
|
ret.first, len - (ret.first - buf), utf32_out + saved_bytes);
|
|
if (scalar_saved_bytes == 0) {
|
|
return 0;
|
|
}
|
|
saved_bytes += scalar_saved_bytes;
|
|
}
|
|
|
|
return saved_bytes;
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
|
|
const char *buf, size_t len, char32_t *utf32) const noexcept {
|
|
if (simdutf_unlikely(len == 0)) {
|
|
return {error_code::SUCCESS, 0};
|
|
}
|
|
uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32);
|
|
auto ret = icelake::validating_utf8_to_fixed_length_with_constant_checks<
|
|
endianness::LITTLE, uint32_t>(buf, len, utf32_output);
|
|
|
|
if (!std::get<2>(ret)) {
|
|
size_t pos = std::get<0>(ret) - buf;
|
|
// We might have an error that occurs right before pos.
|
|
// This is only a concern if buf[pos] is not a continuation byte.
|
|
if ((buf[pos] & 0xc0) != 0x80 && pos >= 64) {
|
|
pos -= 1;
|
|
} else if ((buf[pos] & 0xc0) == 0x80 && pos >= 64) {
|
|
// We must check whether we are the fourth continuation byte
|
|
bool c1 = (buf[pos - 1] & 0xc0) == 0x80;
|
|
bool c2 = (buf[pos - 2] & 0xc0) == 0x80;
|
|
bool c3 = (buf[pos - 3] & 0xc0) == 0x80;
|
|
if (c1 && c2 && c3) {
|
|
return {simdutf::TOO_LONG, pos};
|
|
}
|
|
}
|
|
// todo: we reset the output to utf32 instead of using std::get<1>(ret) as
// you would expect. That is because
// validating_utf8_to_fixed_length_with_constant_checks may have processed
// data beyond the error.
result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
|
|
pos, buf + pos, len - pos, utf32);
|
|
res.count += pos;
|
|
return res;
|
|
}
|
|
size_t saved_bytes = std::get<1>(ret) - utf32_output;
|
|
const char *end = buf + len;
|
|
if (std::get<0>(ret) == end) {
|
|
return {simdutf::SUCCESS, saved_bytes};
|
|
}
|
|
|
|
// Note: the AVX512 procedure looks ahead 4 bytes and correctly converts
// multi-byte characters even if their continuation bytes lie outside the
// 16-byte window. It means we have to skip continuation bytes at the
// beginning of std::get<0>(ret), as they were already consumed.
while (std::get<0>(ret) != end and
|
|
((uint8_t(*std::get<0>(ret)) & 0xc0) == 0x80)) {
|
|
std::get<0>(ret) += 1;
|
|
}
|
|
|
|
if (std::get<0>(ret) != end) {
|
|
auto scalar_result = scalar::utf8_to_utf32::convert_with_errors(
|
|
std::get<0>(ret), len - (std::get<0>(ret) - buf),
|
|
reinterpret_cast<char32_t *>(utf32_output) + saved_bytes);
|
|
if (scalar_result.error != simdutf::SUCCESS) {
|
|
scalar_result.count += (std::get<0>(ret) - buf);
|
|
} else {
|
|
scalar_result.count += saved_bytes;
|
|
}
|
|
return scalar_result;
|
|
}
|
|
|
|
return {simdutf::SUCCESS, size_t(std::get<1>(ret) - utf32_output)};
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
|
|
const char *buf, size_t len, char32_t *utf32_out) const noexcept {
|
|
uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
|
|
utf8_to_utf32_result ret =
|
|
icelake::valid_utf8_to_fixed_length<endianness::LITTLE, uint32_t>(
|
|
buf, len, utf32_output);
|
|
size_t saved_bytes = ret.second - utf32_output;
|
|
const char *end = buf + len;
|
|
if (ret.first == end) {
|
|
return saved_bytes;
|
|
}
|
|
|
|
// Note: the AVX512 procedure looks ahead 4 bytes and correctly converts
// multi-byte characters even if their continuation bytes lie outside the
// 16-byte window. It means we have to skip continuation bytes at the
// beginning of ret.first, as they were already consumed.
while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
|
|
ret.first += 1;
|
|
}
|
|
|
|
if (ret.first != end) {
|
|
const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert_valid(
|
|
ret.first, len - (ret.first - buf), utf32_out + saved_bytes);
|
|
if (scalar_saved_bytes == 0) {
|
|
return 0;
|
|
}
|
|
saved_bytes += scalar_saved_bytes;
|
|
}
|
|
|
|
return saved_bytes;
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
|
|
|
|
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
|
|
simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
|
|
const char16_t *buf, size_t len, char *latin1_output) const noexcept {
|
|
return icelake_convert_utf16_to_latin1<endianness::LITTLE>(buf, len,
|
|
latin1_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(
|
|
const char16_t *buf, size_t len, char *latin1_output) const noexcept {
|
|
return icelake_convert_utf16_to_latin1<endianness::BIG>(buf, len,
|
|
latin1_output);
|
|
}
|
|
|
|
simdutf_warn_unused result
|
|
implementation::convert_utf16le_to_latin1_with_errors(
|
|
const char16_t *buf, size_t len, char *latin1_output) const noexcept {
|
|
return icelake_convert_utf16_to_latin1_with_errors<endianness::LITTLE>(
|
|
buf, len, latin1_output)
|
|
.first;
|
|
}
|
|
|
|
simdutf_warn_unused result
|
|
implementation::convert_utf16be_to_latin1_with_errors(
|
|
const char16_t *buf, size_t len, char *latin1_output) const noexcept {
|
|
return icelake_convert_utf16_to_latin1_with_errors<endianness::BIG>(
|
|
buf, len, latin1_output)
|
|
.first;
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(
|
|
const char16_t *buf, size_t len, char *latin1_output) const noexcept {
|
|
// optimization opportunity: implement custom function
|
|
return convert_utf16be_to_latin1(buf, len, latin1_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(
|
|
const char16_t *buf, size_t len, char *latin1_output) const noexcept {
|
|
// optimization opportunity: implement custom function
|
|
return convert_utf16le_to_latin1(buf, len, latin1_output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
|
|
simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
|
|
const char16_t *buf, size_t len, char *utf8_output) const noexcept {
|
|
size_t outlen;
|
|
size_t inlen = utf16_to_utf8_avx512i<endianness::LITTLE>(
|
|
buf, len, (unsigned char *)utf8_output, &outlen);
|
|
if (inlen != len) {
|
|
return 0;
|
|
}
|
|
return outlen;
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
|
|
const char16_t *buf, size_t len, char *utf8_output) const noexcept {
|
|
size_t outlen;
|
|
size_t inlen = utf16_to_utf8_avx512i<endianness::BIG>(
|
|
buf, len, (unsigned char *)utf8_output, &outlen);
|
|
if (inlen != len) {
|
|
return 0;
|
|
}
|
|
return outlen;
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
|
|
const char16_t *buf, size_t len, char *utf8_output) const noexcept {
|
|
size_t outlen;
|
|
size_t inlen = utf16_to_utf8_avx512i<endianness::LITTLE>(
|
|
buf, len, (unsigned char *)utf8_output, &outlen);
|
|
if (inlen != len) {
|
|
result res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
|
|
buf + inlen, len - inlen, utf8_output + outlen);
|
|
res.count += inlen;
|
|
return res;
|
|
}
|
|
return {simdutf::SUCCESS, outlen};
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
|
|
const char16_t *buf, size_t len, char *utf8_output) const noexcept {
|
|
size_t outlen;
|
|
size_t inlen = utf16_to_utf8_avx512i<endianness::BIG>(
|
|
buf, len, (unsigned char *)utf8_output, &outlen);
|
|
if (inlen != len) {
|
|
result res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
|
|
buf + inlen, len - inlen, utf8_output + outlen);
|
|
res.count += inlen;
|
|
return res;
|
|
}
|
|
return {simdutf::SUCCESS, outlen};
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
|
|
const char16_t *buf, size_t len, char *utf8_output) const noexcept {
|
|
return convert_utf16le_to_utf8(buf, len, utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
|
|
const char16_t *buf, size_t len, char *utf8_output) const noexcept {
|
|
return convert_utf16be_to_utf8(buf, len, utf8_output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
|
|
|
|
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
|
|
simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
|
|
const char32_t *buf, size_t len, char *latin1_output) const noexcept {
|
|
return icelake_convert_utf32_to_latin1(buf, len, latin1_output);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
|
|
const char32_t *buf, size_t len, char *latin1_output) const noexcept {
|
|
return icelake_convert_utf32_to_latin1_with_errors(buf, len, latin1_output)
|
|
.first;
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
|
|
const char32_t *buf, size_t len, char *latin1_output) const noexcept {
|
|
return icelake_convert_utf32_to_latin1(buf, len, latin1_output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
|
|
simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
|
|
const char32_t *buf, size_t len, char *utf8_output) const noexcept {
|
|
std::pair<const char32_t *, char *> ret =
|
|
avx512_convert_utf32_to_utf8(buf, len, utf8_output);
|
|
if (ret.first == nullptr) {
|
|
return 0;
|
|
}
|
|
size_t saved_bytes = ret.second - utf8_output;
|
|
if (ret.first != buf + len) {
|
|
const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
|
|
ret.first, len - (ret.first - buf), ret.second);
|
|
if (scalar_saved_bytes == 0) {
|
|
return 0;
|
|
}
|
|
saved_bytes += scalar_saved_bytes;
|
|
}
|
|
return saved_bytes;
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
|
|
const char32_t *buf, size_t len, char *utf8_output) const noexcept {
|
|
// ret.first.count is always the position in the buffer, not the number of
// code units written, even if the conversion finished.
std::pair<result, char *> ret =
|
|
icelake::avx512_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
|
|
if (ret.first.count != len) {
|
|
result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
|
|
buf + ret.first.count, len - ret.first.count, ret.second);
|
|
if (scalar_res.error) {
|
|
scalar_res.count += ret.first.count;
|
|
return scalar_res;
|
|
} else {
|
|
ret.second += scalar_res.count;
|
|
}
|
|
}
|
|
ret.first.count =
|
|
ret.second -
|
|
utf8_output; // Set count to the number of 8-bit code units written
|
|
return ret.first;
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
|
|
const char32_t *buf, size_t len, char *utf8_output) const noexcept {
|
|
return convert_utf32_to_utf8(buf, len, utf8_output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
|
|
|
|
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
|
|
simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
|
|
const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
std::pair<const char32_t *, char16_t *> ret =
|
|
avx512_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
|
|
if (ret.first == nullptr) {
|
|
return 0;
|
|
}
|
|
size_t saved_bytes = ret.second - utf16_output;
|
|
return saved_bytes;
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
|
|
const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
std::pair<const char32_t *, char16_t *> ret =
|
|
avx512_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
|
|
if (ret.first == nullptr) {
|
|
return 0;
|
|
}
|
|
size_t saved_bytes = ret.second - utf16_output;
|
|
return saved_bytes;
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
|
|
const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
// ret.first.count is always the position in the buffer, not the number of
// code units written, even if the conversion finished.
std::pair<result, char16_t *> ret =
|
|
avx512_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(
|
|
buf, len, utf16_output);
|
|
if (ret.first.error) {
|
|
return ret.first;
|
|
}
|
|
ret.first.count =
|
|
ret.second -
|
|
utf16_output; // Set count to the number of 16-bit code units written
return ret.first;
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
|
|
const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
// ret.first.count is always the position in the buffer, not the number of
// code units written, even if the conversion finished.
std::pair<result, char16_t *> ret =
|
|
avx512_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len,
|
|
utf16_output);
|
|
if (ret.first.error) {
|
|
return ret.first;
|
|
}
|
|
ret.first.count =
|
|
ret.second -
|
|
utf16_output; // Set count to the number of 16-bit code units written
return ret.first;
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
|
|
const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
return convert_utf32_to_utf16le(buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
|
|
const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
return convert_utf32_to_utf16be(buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
|
|
const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
|
|
std::tuple<const char16_t *, char32_t *, bool> ret =
|
|
icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len,
|
|
utf32_output);
|
|
if (!std::get<2>(ret)) {
|
|
return 0;
|
|
}
|
|
size_t saved_bytes = std::get<1>(ret) - utf32_output;
|
|
if (std::get<0>(ret) != buf + len) {
|
|
const size_t scalar_saved_bytes =
|
|
scalar::utf16_to_utf32::convert<endianness::LITTLE>(
|
|
std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
|
|
if (scalar_saved_bytes == 0) {
|
|
return 0;
|
|
}
|
|
saved_bytes += scalar_saved_bytes;
|
|
}
|
|
return saved_bytes;
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
|
|
const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
|
|
std::tuple<const char16_t *, char32_t *, bool> ret =
|
|
icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
|
|
if (!std::get<2>(ret)) {
|
|
return 0;
|
|
}
|
|
size_t saved_bytes = std::get<1>(ret) - utf32_output;
|
|
if (std::get<0>(ret) != buf + len) {
|
|
const size_t scalar_saved_bytes =
|
|
scalar::utf16_to_utf32::convert<endianness::BIG>(
|
|
std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
|
|
if (scalar_saved_bytes == 0) {
|
|
return 0;
|
|
}
|
|
saved_bytes += scalar_saved_bytes;
|
|
}
|
|
return saved_bytes;
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
|
|
const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
|
|
std::tuple<const char16_t *, char32_t *, bool> ret =
|
|
icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len,
|
|
utf32_output);
|
|
if (!std::get<2>(ret)) {
|
|
result scalar_res =
|
|
scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
|
|
std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
|
|
scalar_res.count += (std::get<0>(ret) - buf);
|
|
return scalar_res;
|
|
}
|
|
size_t saved_bytes = std::get<1>(ret) - utf32_output;
|
|
if (std::get<0>(ret) != buf + len) {
|
|
result scalar_res =
|
|
scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
|
|
std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
|
|
if (scalar_res.error) {
|
|
scalar_res.count += (std::get<0>(ret) - buf);
|
|
return scalar_res;
|
|
} else {
|
|
scalar_res.count += saved_bytes;
|
|
return scalar_res;
|
|
}
|
|
}
|
|
return simdutf::result(simdutf::SUCCESS, saved_bytes);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
|
|
const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
|
|
std::tuple<const char16_t *, char32_t *, bool> ret =
|
|
icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
|
|
if (!std::get<2>(ret)) {
|
|
result scalar_res =
|
|
scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
|
|
std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
|
|
scalar_res.count += (std::get<0>(ret) - buf);
|
|
return scalar_res;
|
|
}
|
|
size_t saved_bytes = std::get<1>(ret) - utf32_output;
|
|
if (std::get<0>(ret) != buf + len) {
|
|
result scalar_res =
|
|
scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
|
|
std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
|
|
if (scalar_res.error) {
|
|
scalar_res.count += (std::get<0>(ret) - buf);
|
|
return scalar_res;
|
|
} else {
|
|
scalar_res.count += saved_bytes;
|
|
return scalar_res;
|
|
}
|
|
}
|
|
return simdutf::result(simdutf::SUCCESS, saved_bytes);
|
|
}

simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  std::tuple<const char16_t *, char32_t *, bool> ret =
      icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len,
                                                          utf32_output);
  if (!std::get<2>(ret)) {
    return 0;
  }
  size_t saved_bytes = std::get<1>(ret) - utf32_output;
  if (std::get<0>(ret) != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf16_to_utf32::convert<endianness::LITTLE>(
            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  std::tuple<const char16_t *, char32_t *, bool> ret =
      icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
  if (!std::get<2>(ret)) {
    return 0;
  }
  size_t saved_bytes = std::get<1>(ret) - utf32_output;
  if (std::get<0>(ret) != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf16_to_utf32::convert<endianness::BIG>(
            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16
void implementation::change_endianness_utf16(const char16_t *input,
                                             size_t length,
                                             char16_t *output) const noexcept {
  size_t pos = 0;
  const __m512i byteflip = _mm512_setr_epi64(
      0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001,
      0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809,
      0x0607040502030001, 0x0e0f0c0d0a0b0809);
  while (pos + 32 <= length) {
    __m512i utf16 = _mm512_loadu_si512((const __m512i *)(input + pos));
    utf16 = _mm512_shuffle_epi8(utf16, byteflip);
    _mm512_storeu_si512(output + pos, utf16);
    pos += 32;
  }
  if (pos < length) {
    __mmask32 m((1U << (length - pos)) - 1);
    __m512i utf16 = _mm512_maskz_loadu_epi16(m, (const __m512i *)(input + pos));
    utf16 = _mm512_shuffle_epi8(utf16, byteflip);
    _mm512_mask_storeu_epi16(output + pos, m, utf16);
  }
}
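// Read byte by byte from the low end, the 64-bit shuffle constant
// 0x0607040502030001 is the index sequence 01 00 03 02 05 04 07 06: every
// pair of adjacent bytes is swapped, which flips the endianness of each
// 16-bit code unit in the register. The masked load/store pair handles the
// final (length % 32) code units without reading or writing past the buffers.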

simdutf_warn_unused size_t implementation::count_utf16le(
    const char16_t *input, size_t length) const noexcept {
  const char16_t *ptr = input;
  size_t count{0};

  if (length >= 32) {
    const char16_t *end = input + length - 32;

    const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00);
    const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff);

    while (ptr <= end) {
      __m512i utf16 = _mm512_loadu_si512((const __m512i *)ptr);
      ptr += 32;
      uint64_t not_high_surrogate =
          static_cast<uint64_t>(_mm512_cmpgt_epu16_mask(utf16, high) |
                                _mm512_cmplt_epu16_mask(utf16, low));
      count += count_ones(not_high_surrogate);
    }
  }

  return count + scalar::utf16::count_code_points<endianness::LITTLE>(
                     ptr, length - (ptr - input));
}
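// Counting works because every code point contributes exactly one 16-bit
// unit outside the trailing-surrogate range [0xDC00, 0xDFFF]: a BMP
// character is that unit itself, and a surrogate pair contributes its
// leading half. A scalar rendition of the same count (illustrative only):
//
//   size_t n = 0;
//   for (size_t k = 0; k < length; k++) {
//     uint16_t w = input[k];
//     n += !(w >= 0xDC00 && w <= 0xDFFF); // skip trailing surrogates
//   }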

simdutf_warn_unused size_t implementation::count_utf16be(
    const char16_t *input, size_t length) const noexcept {
  const char16_t *ptr = input;
  size_t count{0};
  if (length >= 32) {

    const char16_t *end = input + length - 32;

    const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00);
    const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff);

    const __m512i byteflip = _mm512_setr_epi64(
        0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001,
        0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809,
        0x0607040502030001, 0x0e0f0c0d0a0b0809);
    while (ptr <= end) {
      __m512i utf16 =
          _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i *)ptr), byteflip);
      ptr += 32;
      uint64_t not_high_surrogate =
          static_cast<uint64_t>(_mm512_cmpgt_epu16_mask(utf16, high) |
                                _mm512_cmplt_epu16_mask(utf16, low));
      count += count_ones(not_high_surrogate);
    }
  }

  return count + scalar::utf16::count_code_points<endianness::BIG>(
                     ptr, length - (ptr - input));
}
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8
simdutf_warn_unused size_t
implementation::count_utf8(const char *input, size_t length) const noexcept {
  const uint8_t *str = reinterpret_cast<const uint8_t *>(input);
  size_t answer =
      length / sizeof(__m512i) *
      sizeof(__m512i); // Number of 512-bit chunks that fit into the length.
  size_t i = 0;
  __m512i unrolled_popcount{0};

  const __m512i continuation = _mm512_set1_epi8(char(0b10111111));

  while (i + sizeof(__m512i) <= length) {
    size_t iterations = (length - i) / sizeof(__m512i);

    size_t max_i = i + iterations * sizeof(__m512i) - sizeof(__m512i);
    for (; i + 8 * sizeof(__m512i) <= max_i; i += 8 * sizeof(__m512i)) {
      __m512i input1 = _mm512_loadu_si512((const __m512i *)(str + i));
      __m512i input2 =
          _mm512_loadu_si512((const __m512i *)(str + i + sizeof(__m512i)));
      __m512i input3 =
          _mm512_loadu_si512((const __m512i *)(str + i + 2 * sizeof(__m512i)));
      __m512i input4 =
          _mm512_loadu_si512((const __m512i *)(str + i + 3 * sizeof(__m512i)));
      __m512i input5 =
          _mm512_loadu_si512((const __m512i *)(str + i + 4 * sizeof(__m512i)));
      __m512i input6 =
          _mm512_loadu_si512((const __m512i *)(str + i + 5 * sizeof(__m512i)));
      __m512i input7 =
          _mm512_loadu_si512((const __m512i *)(str + i + 6 * sizeof(__m512i)));
      __m512i input8 =
          _mm512_loadu_si512((const __m512i *)(str + i + 7 * sizeof(__m512i)));

      __mmask64 mask1 = _mm512_cmple_epi8_mask(input1, continuation);
      __mmask64 mask2 = _mm512_cmple_epi8_mask(input2, continuation);
      __mmask64 mask3 = _mm512_cmple_epi8_mask(input3, continuation);
      __mmask64 mask4 = _mm512_cmple_epi8_mask(input4, continuation);
      __mmask64 mask5 = _mm512_cmple_epi8_mask(input5, continuation);
      __mmask64 mask6 = _mm512_cmple_epi8_mask(input6, continuation);
      __mmask64 mask7 = _mm512_cmple_epi8_mask(input7, continuation);
      __mmask64 mask8 = _mm512_cmple_epi8_mask(input8, continuation);

      __m512i mask_register = _mm512_set_epi64(mask8, mask7, mask6, mask5,
                                               mask4, mask3, mask2, mask1);

      unrolled_popcount = _mm512_add_epi64(unrolled_popcount,
                                           _mm512_popcnt_epi64(mask_register));
    }

    for (; i <= max_i; i += sizeof(__m512i)) {
      __m512i more_input = _mm512_loadu_si512((const __m512i *)(str + i));
      uint64_t continuation_bitmask = static_cast<uint64_t>(
          _mm512_cmple_epi8_mask(more_input, continuation));
      answer -= count_ones(continuation_bitmask);
    }
  }

  answer -= _mm512_reduce_add_epi64(unrolled_popcount);

  return answer + scalar::utf8::count_code_points(
                      reinterpret_cast<const char *>(str + i), length - i);
}
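// The code-point count equals the byte count minus the number of UTF-8
// continuation bytes (bytes of the form 10xxxxxx). As signed 8-bit values,
// continuation bytes are exactly those <= 0b10111111 (-65), which is what
// the cmple masks select. A scalar rendition (illustrative only):
//
//   size_t n = 0;
//   for (size_t k = 0; k < length; k++) {
//     n += (int8_t(input[k]) > -65); // count everything but continuations
//   }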
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
    const char *buf, size_t len) const noexcept {
  return count_utf8(buf, len);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
    const char16_t *input, size_t length) const noexcept {
  return icelake_utf8_length_from_utf16<endianness::LITTLE>(input, length);
}

simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
    const char16_t *input, size_t length) const noexcept {
  return icelake_utf8_length_from_utf16<endianness::BIG>(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
    const char16_t *input, size_t length) const noexcept {
  return implementation::count_utf16le(input, length);
}

simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
    const char16_t *input, size_t length) const noexcept {
  return implementation::count_utf16be(input, length);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
    const char *input, size_t length) const noexcept {
  const uint8_t *str = reinterpret_cast<const uint8_t *>(input);
  size_t answer = length / sizeof(__m512i) * sizeof(__m512i);
  size_t i = 0;
  if (answer >= 2048) { // long strings optimization
    unsigned char v_0xFF = 0xff;
    __m512i eight_64bits = _mm512_setzero_si512();
    while (i + sizeof(__m512i) <= length) {
      __m512i runner = _mm512_setzero_si512();
      size_t iterations = (length - i) / sizeof(__m512i);
      if (iterations > 255) {
        iterations = 255;
      }
      size_t max_i = i + iterations * sizeof(__m512i) - sizeof(__m512i);
      for (; i + 4 * sizeof(__m512i) <= max_i; i += 4 * sizeof(__m512i)) {
        // Load four __m512i vectors
        __m512i input1 = _mm512_loadu_si512((const __m512i *)(str + i));
        __m512i input2 =
            _mm512_loadu_si512((const __m512i *)(str + i + sizeof(__m512i)));
        __m512i input3 = _mm512_loadu_si512(
            (const __m512i *)(str + i + 2 * sizeof(__m512i)));
        __m512i input4 = _mm512_loadu_si512(
            (const __m512i *)(str + i + 3 * sizeof(__m512i)));

        // Generate four masks
        __mmask64 mask1 =
            _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input1);
        __mmask64 mask2 =
            _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input2);
        __mmask64 mask3 =
            _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input3);
        __mmask64 mask4 =
            _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input4);
        // Apply the masks and subtract from the runner
        __m512i not_ascii1 =
            _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask1, v_0xFF);
        __m512i not_ascii2 =
            _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask2, v_0xFF);
        __m512i not_ascii3 =
            _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask3, v_0xFF);
        __m512i not_ascii4 =
            _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask4, v_0xFF);

        runner = _mm512_sub_epi8(runner, not_ascii1);
        runner = _mm512_sub_epi8(runner, not_ascii2);
        runner = _mm512_sub_epi8(runner, not_ascii3);
        runner = _mm512_sub_epi8(runner, not_ascii4);
      }

      for (; i <= max_i; i += sizeof(__m512i)) {
        __m512i more_input = _mm512_loadu_si512((const __m512i *)(str + i));

        __mmask64 mask =
            _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), more_input);
        __m512i not_ascii =
            _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask, v_0xFF);
        runner = _mm512_sub_epi8(runner, not_ascii);
      }

      eight_64bits = _mm512_add_epi64(
          eight_64bits, _mm512_sad_epu8(runner, _mm512_setzero_si512()));
    }

    answer += _mm512_reduce_add_epi64(eight_64bits);
  } else if (answer > 0) {
    for (; i + sizeof(__m512i) <= length; i += sizeof(__m512i)) {
      __m512i latin = _mm512_loadu_si512((const __m512i *)(str + i));
      uint64_t non_ascii = _mm512_movepi8_mask(latin);
      answer += count_ones(non_ascii);
    }
  }
  return answer + scalar::latin1::utf8_length_from_latin1(
                      reinterpret_cast<const char *>(str + i), length - i);
}
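// Each Latin1 byte below 0x80 encodes to one UTF-8 byte and each byte at or
// above 0x80 encodes to two, so the UTF-8 length is the input length plus
// the number of non-ASCII bytes. A scalar rendition (illustrative only):
//
//   size_t utf8_len = length;
//   for (size_t k = 0; k < length; k++) {
//     utf8_len += (uint8_t(input[k]) >= 0x80);
//   }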
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
    const char *input, size_t length) const noexcept {
  size_t pos = 0;

  // UTF-16 char length based on the four most significant bits of UTF-8 bytes
  const __m128i utf8_length_128 = _mm_setr_epi8(
      // ASCII chars
      /* 0000 */ 1,
      /* 0001 */ 1,
      /* 0010 */ 1,
      /* 0011 */ 1,
      /* 0100 */ 1,
      /* 0101 */ 1,
      /* 0110 */ 1,
      /* 0111 */ 1,

      // continuation bytes
      /* 1000 */ 0,
      /* 1001 */ 0,
      /* 1010 */ 0,
      /* 1011 */ 0,

      // leading bytes
      /* 1100 */ 1, // 2-byte UTF-8 char => 1 UTF-16 word
      /* 1101 */ 1, // 2-byte UTF-8 char => 1 UTF-16 word
      /* 1110 */ 1, // 3-byte UTF-8 char => 1 UTF-16 word
      /* 1111 */ 2  // 4-byte UTF-8 char => 2 UTF-16 words (surrogate pair)
  );

  const __m512i char_length = broadcast_128bit_lane(utf8_length_128);

  constexpr size_t max_iterations = 255 / 2;

  size_t iterations = 0;
  const auto zero = _mm512_setzero_si512();
  __m512i local = _mm512_setzero_si512();    // byte-wise counters
  __m512i counters = _mm512_setzero_si512(); // 64-bit counters
  for (; pos + 64 <= length; pos += 64) {
    __m512i utf8 = _mm512_loadu_si512((const __m512i *)(input + pos));
    const auto t0 = _mm512_srli_epi32(utf8, 4);
    const auto t1 = _mm512_and_si512(t0, _mm512_set1_epi8(0xf));
    const auto t2 = _mm512_shuffle_epi8(char_length, t1);
    local = _mm512_add_epi8(local, t2);

    iterations += 1;
    if (iterations == max_iterations) {
      counters = _mm512_add_epi64(counters, _mm512_sad_epu8(local, zero));
      local = zero;
      iterations = 0;
    }
  }

  size_t count = 0;

  if (pos > 0) {
    // don't waste time for short strings
    if (iterations > 0) {
      counters = _mm512_add_epi64(counters, _mm512_sad_epu8(local, zero));
    }

    const auto l0 = _mm512_extracti32x4_epi32(counters, 0);
    const auto l1 = _mm512_extracti32x4_epi32(counters, 1);
    const auto l2 = _mm512_extracti32x4_epi32(counters, 2);
    const auto l3 = _mm512_extracti32x4_epi32(counters, 3);

    const auto sum =
        _mm_add_epi64(_mm_add_epi64(l0, l1), _mm_add_epi64(l2, l3));

    count = uint64_t(_mm_extract_epi64(sum, 0)) +
            uint64_t(_mm_extract_epi64(sum, 1));
  }

  return count +
         scalar::utf8::utf16_length_from_utf8(input + pos, length - pos);
}
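// The lookup keys on the high nibble of each byte: ASCII and 2- or 3-byte
// leading bytes each start one UTF-16 code unit, 4-byte leading bytes start
// a surrogate pair (two units), and continuation bytes add nothing. A scalar
// rendition of the per-byte classification (illustrative only):
//
//   size_t units = 0;
//   for (size_t k = 0; k < length; k++) {
//     uint8_t nibble = uint8_t(input[k]) >> 4;
//     if (nibble < 0x8) units += 1;      // ASCII byte
//     else if (nibble < 0xc) units += 0; // continuation byte
//     else if (nibble < 0xf) units += 1; // 2- or 3-byte leading byte
//     else units += 2;                   // 4-byte leading byte
//   }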
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
    const char32_t *input, size_t length) const noexcept {
  return utf32::utf8_length_from_utf32(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
    const char32_t *input, size_t length) const noexcept {
  const char32_t *ptr = input;
  size_t count{0};

  if (length >= 16) {
    const char32_t *end = input + length - 16;

    const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff);

    while (ptr <= end) {
      __m512i utf32 = _mm512_loadu_si512((const __m512i *)ptr);
      ptr += 16;
      __mmask16 surrogates_bitmask =
          _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff);

      count += 16 + count_ones(surrogates_bitmask);
    }
  }

  return count +
         scalar::utf32::utf16_length_from_utf32(ptr, length - (ptr - input));
}
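// Every code point needs at least one UTF-16 code unit, and those above
// 0xFFFF need a second one (a surrogate pair), so each block of 16 code
// points contributes 16 plus the number of lanes whose value exceeds 0xFFFF.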
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
    const char *input, size_t length) const noexcept {
  return implementation::count_utf8(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_BASE64
simdutf_warn_unused result implementation::base64_to_binary(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true>(output, input, length, options,
                                                last_chunk_options);
    } else {
      return compress_decode_base64<true, false>(output, input, length, options,
                                                 last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true>(output, input, length, options,
                                                 last_chunk_options);
    } else {
      return compress_decode_base64<false, false>(output, input, length,
                                                  options, last_chunk_options);
    }
  }
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true>(output, input, length, options,
                                                last_chunk_options);
    } else {
      return compress_decode_base64<true, false>(output, input, length, options,
                                                 last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true>(output, input, length, options,
                                                 last_chunk_options);
    } else {
      return compress_decode_base64<false, false>(output, input, length,
                                                  options, last_chunk_options);
    }
  }
}

simdutf_warn_unused result implementation::base64_to_binary(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true>(output, input, length, options,
                                                last_chunk_options);
    } else {
      return compress_decode_base64<true, false>(output, input, length, options,
                                                 last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true>(output, input, length, options,
                                                 last_chunk_options);
    } else {
      return compress_decode_base64<false, false>(output, input, length,
                                                  options, last_chunk_options);
    }
  }
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true>(output, input, length, options,
                                                last_chunk_options);
    } else {
      return compress_decode_base64<true, false>(output, input, length, options,
                                                 last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true>(output, input, length, options,
                                                 last_chunk_options);
    } else {
      return compress_decode_base64<false, false>(output, input, length,
                                                  options, last_chunk_options);
    }
  }
}

size_t implementation::binary_to_base64(const char *input, size_t length,
                                        char *output,
                                        base64_options options) const noexcept {
  if (options & base64_url) {
    return encode_base64<true>(output, input, length, options);
  } else {
    return encode_base64<false>(output, input, length, options);
  }
}
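// The two template booleans specialize the codecs at compile time: the first
// selects the base64url alphabet ('-' and '_' in place of '+' and '/'), and,
// for the decoders, the second tolerates non-alphabet bytes instead of
// reporting them as errors, so neither choice has to be re-tested inside the
// hot loop.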
#endif // SIMDUTF_FEATURE_BASE64

} // namespace icelake
} // namespace simdutf

/* begin file src/simdutf/icelake/end.h */
#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
// nothing needed.
#else
SIMDUTF_UNTARGET_REGION
#endif


#if SIMDUTF_GCC11ORMORE // workaround for
                        // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
SIMDUTF_POP_DISABLE_WARNINGS
#endif // end of workaround
/* end file src/simdutf/icelake/end.h */
/* end file src/icelake/implementation.cpp */
#endif
#if SIMDUTF_IMPLEMENTATION_HASWELL
/* begin file src/haswell/implementation.cpp */
/* begin file src/simdutf/haswell/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "haswell"
// #define SIMDUTF_IMPLEMENTATION haswell
#define SIMDUTF_SIMD_HAS_BYTEMASK 1

#if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
// nothing needed.
#else
SIMDUTF_TARGET_HASWELL
#endif

#if SIMDUTF_GCC11ORMORE // workaround for
                        // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
// clang-format off
SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
// clang-format on
#endif // end of workaround
/* end file src/simdutf/haswell/begin.h */

namespace simdutf {
namespace haswell {
namespace {
#ifndef SIMDUTF_HASWELL_H
  #error "haswell.h must be included"
#endif
using namespace simd;

#if SIMDUTF_FEATURE_ASCII || SIMDUTF_FEATURE_DETECT_ENCODING ||                \
    SIMDUTF_FEATURE_UTF8
simdutf_really_inline bool is_ascii(const simd8x64<uint8_t> &input) {
  return input.reduce_or().is_ascii();
}
#endif // SIMDUTF_FEATURE_ASCII || SIMDUTF_FEATURE_DETECT_ENCODING ||
       // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
simdutf_really_inline simd8<bool>
must_be_2_3_continuation(const simd8<uint8_t> prev2,
                         const simd8<uint8_t> prev3) {
  simd8<uint8_t> is_third_byte =
      prev2.saturating_sub(0xe0u - 0x80); // Only 111_____ will be > 0x80
  simd8<uint8_t> is_fourth_byte =
      prev3.saturating_sub(0xf0u - 0x80); // Only 1111____ will be > 0x80
  return simd8<bool>(is_third_byte | is_fourth_byte);
}
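// The saturating subtraction turns the range check into a single operation:
// for a byte b, b.saturating_sub(0xe0 - 0x80) has its top bit set exactly
// when b >= 0xe0 (e.g. 0xdf -> 0x7f, 0xe0 -> 0x80), i.e. when b starts with
// 111_____, and likewise subtracting (0xf0 - 0x80) detects 1111____ leads.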
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
namespace utf16 {
/* begin file src/haswell/avx2_validate_utf16.cpp */
template <endianness big_endian>
simd8<uint8_t> utf16_gather_high_bytes(const simd16<uint16_t> &in0,
                                       const simd16<uint16_t> &in1) {
  if (big_endian) {
    // we want lower bytes
    const auto mask = simd16<uint16_t>(0x00ff);
    const auto t0 = in0 & mask;
    const auto t1 = in1 & mask;

    return simd16<uint16_t>::pack(t0, t1);
  } else {
    const auto t0 = in0.shr<8>();
    const auto t1 = in1.shr<8>();

    return simd16<uint16_t>::pack(t0, t1);
  }
}
/* end file src/haswell/avx2_validate_utf16.cpp */
}
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF16
/* begin file src/haswell/avx2_utf16fix.cpp */
/*
 * Process one block of 16 characters. If in_place is false,
 * copy the block from in to out. If there is a sequencing
 * error in the block, overwrite the illsequenced characters
 * with the replacement character. This function reads one
 * character before the beginning of the buffer as a lookback.
 * If that character is illsequenced, it too is overwritten.
 */
template <endianness big_endian, bool in_place>
void utf16fix_block(char16_t *out, const char16_t *in) {
  const char16_t replacement = scalar::utf16::replacement<big_endian>();
  auto swap_if_needed = [](uint16_t c) -> uint16_t {
    return !simdutf::match_system(big_endian) ? scalar::u16_swap_bytes(c) : c;
  };
  __m256i lookback, block, lb_masked, block_masked, lb_is_high, block_is_low;
  __m256i illseq, lb_illseq, block_illseq, lb_illseq_shifted;

  lookback = _mm256_loadu_si256((const __m256i *)(in - 1));
  block = _mm256_loadu_si256((const __m256i *)in);
  lb_masked =
      _mm256_and_si256(lookback, _mm256_set1_epi16(swap_if_needed(0xfc00u)));
  block_masked =
      _mm256_and_si256(block, _mm256_set1_epi16(swap_if_needed(0xfc00u)));
  lb_is_high =
      _mm256_cmpeq_epi16(lb_masked, _mm256_set1_epi16(swap_if_needed(0xd800u)));
  block_is_low = _mm256_cmpeq_epi16(block_masked,
                                    _mm256_set1_epi16(swap_if_needed(0xdc00u)));

  illseq = _mm256_xor_si256(lb_is_high, block_is_low);
  if (!_mm256_testz_si256(illseq, illseq)) {
    int lb;

    /* compute the cause of the illegal sequencing */
    lb_illseq = _mm256_andnot_si256(block_is_low, lb_is_high);
    lb_illseq_shifted =
        _mm256_or_si256(_mm256_bsrli_epi128(lb_illseq, 2),
                        _mm256_zextsi128_si256(_mm_bslli_si128(
                            _mm256_extracti128_si256(lb_illseq, 1), 14)));
    block_illseq = _mm256_or_si256(
        _mm256_andnot_si256(lb_is_high, block_is_low), lb_illseq_shifted);

    /* fix illegal sequencing in the lookback */
    lb = _mm256_cvtsi256_si32(lb_illseq);
    lb = (lb & replacement) | (~lb & out[-1]);
    out[-1] = char16_t(lb);

    /* fix illegal sequencing in the main block */
    block =
        _mm256_blendv_epi8(block, _mm256_set1_epi16(replacement), block_illseq);
    _mm256_storeu_si256((__m256i *)out, block);
  } else if (!in_place) {
    _mm256_storeu_si256((__m256i *)out, block);
  }
}

template <endianness big_endian, bool in_place>
void utf16fix_block_sse(char16_t *out, const char16_t *in) {
  const char16_t replacement = scalar::utf16::replacement<big_endian>();
  auto swap_if_needed = [](uint16_t c) -> uint16_t {
    return !simdutf::match_system(big_endian) ? scalar::u16_swap_bytes(c) : c;
  };

  __m128i lookback, block, lb_masked, block_masked, lb_is_high, block_is_low;
  __m128i illseq, lb_illseq, block_illseq;

  lookback = _mm_loadu_si128((const __m128i *)(in - 1));
  block = _mm_loadu_si128((const __m128i *)in);
  lb_masked = _mm_and_si128(lookback, _mm_set1_epi16(swap_if_needed(0xfc00U)));
  block_masked = _mm_and_si128(block, _mm_set1_epi16(swap_if_needed(0xfc00U)));
  lb_is_high =
      _mm_cmpeq_epi16(lb_masked, _mm_set1_epi16(swap_if_needed(0xd800U)));
  block_is_low =
      _mm_cmpeq_epi16(block_masked, _mm_set1_epi16(swap_if_needed(0xdc00U)));

  illseq = _mm_xor_si128(lb_is_high, block_is_low);
  if (_mm_movemask_epi8(illseq) != 0) {
    /* compute the cause of the illegal sequencing */
    lb_illseq = _mm_andnot_si128(block_is_low, lb_is_high);
    block_illseq = _mm_or_si128(_mm_andnot_si128(lb_is_high, block_is_low),
                                _mm_bsrli_si128(lb_illseq, 2));
    /* fix illegal sequencing in the lookback */
    int lb = _mm_cvtsi128_si32(lb_illseq);
    lb = (lb & replacement) | (~lb & out[-1]);
    out[-1] = char16_t(lb);
    /* fix illegal sequencing in the main block */
    block =
        _mm_or_si128(_mm_andnot_si128(block_illseq, block),
                     _mm_and_si128(block_illseq, _mm_set1_epi16(replacement)));
    _mm_storeu_si128((__m128i *)out, block);
  } else if (!in_place) {
    _mm_storeu_si128((__m128i *)out, block);
  }
}

template <endianness big_endian>
void utf16fix_sse(const char16_t *in, size_t n, char16_t *out) {
  const char16_t replacement = scalar::utf16::replacement<big_endian>();
  size_t i;

  if (n < 9) {
    scalar::utf16::to_well_formed_utf16<big_endian>(in, n, out);
    return;
  }

  out[0] =
      scalar::utf16::is_low_surrogate<big_endian>(in[0]) ? replacement : in[0];

  /* duplicate code to have the compiler specialise utf16fix_block() */
  if (in == out) {
    for (i = 1; i + 8 < n; i += 8) {
      utf16fix_block_sse<big_endian, true>(out + i, in + i);
    }

    utf16fix_block_sse<big_endian, true>(out + n - 8, in + n - 8);
  } else {
    for (i = 1; i + 8 < n; i += 8) {
      utf16fix_block_sse<big_endian, false>(out + i, in + i);
    }

    utf16fix_block_sse<big_endian, false>(out + n - 8, in + n - 8);
  }

  out[n - 1] = scalar::utf16::is_high_surrogate<big_endian>(out[n - 1])
                   ? replacement
                   : out[n - 1];
}

template <endianness big_endian>
void utf16fix_avx(const char16_t *in, size_t n, char16_t *out) {
  const char16_t replacement = scalar::utf16::replacement<big_endian>();
  size_t i;

  if (n < 17) {
    utf16fix_sse<big_endian>(in, n, out);
    return;
  }

  out[0] =
      scalar::utf16::is_low_surrogate<big_endian>(in[0]) ? replacement : in[0];

  /* duplicate code to have the compiler specialise utf16fix_block() */
  if (in == out) {
    for (i = 1; i + 16 < n; i += 16) {
      utf16fix_block<big_endian, true>(out + i, in + i);
    }

    utf16fix_block<big_endian, true>(out + n - 16, in + n - 16);
  } else {
    for (i = 1; i + 16 < n; i += 16) {
      utf16fix_block<big_endian, false>(out + i, in + i);
    }

    utf16fix_block<big_endian, false>(out + n - 16, in + n - 16);
  }

  out[n - 1] = scalar::utf16::is_high_surrogate<big_endian>(out[n - 1])
                   ? replacement
                   : out[n - 1];
}
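// The pairing check in the block routines is a shifted comparison:
// lb_is_high marks lanes whose *previous* code unit is a leading surrogate
// (0xD800-0xDBFF) and block_is_low marks lanes that are trailing surrogates
// (0xDC00-0xDFFF). XORing the two masks leaves exactly the lanes where one
// holds without the other, i.e. the unpaired surrogates that then receive
// the replacement character.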
/* end file src/haswell/avx2_utf16fix.cpp */
#endif // SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
/* begin file src/haswell/avx2_convert_latin1_to_utf8.cpp */
std::pair<const char *, char *>
avx2_convert_latin1_to_utf8(const char *latin1_input, size_t len,
                            char *utf8_output) {
  const char *end = latin1_input + len;
  const __m256i v_0000 = _mm256_setzero_si256();
  const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
  const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
  const size_t safety_margin = 12;

  while (end - latin1_input >= std::ptrdiff_t(16 + safety_margin)) {
    __m128i in8 = _mm_loadu_si128((__m128i *)latin1_input);
    // a single Latin1 byte can yield 1 or 2 UTF-8 bytes
    const __m128i v_80 = _mm_set1_epi8((char)0x80);
    if (_mm_testz_si128(in8, v_80)) { // ASCII fast path!!!!
      // 1. store (16 bytes)
      _mm_storeu_si128((__m128i *)utf8_output, in8);
      // 2. adjust pointers
      latin1_input += 16;
      utf8_output += 16;
      continue; // we are done for this round!
    }
    // We proceed only with the first 16 bytes.
    const __m256i in = _mm256_cvtepu8_epi16((in8));

    // 1. prepare 2-byte values
    // input 16-bit word : [0000|0000|aabb|bbbb] x 8
    // expected output   : [1100|00aa|10bb|bbbb] x 8
    const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
    const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);

    // t0 = [0000|00aa|bbbb|bb00]
    const __m256i t0 = _mm256_slli_epi16(in, 2);
    // t1 = [0000|00aa|0000|0000]
    const __m256i t1 = _mm256_and_si256(t0, v_1f00);
    // t2 = [0000|0000|00bb|bbbb]
    const __m256i t2 = _mm256_and_si256(in, v_003f);
    // t3 = [0000|00aa|00bb|bbbb]
    const __m256i t3 = _mm256_or_si256(t1, t2);
    // t4 = [1100|00aa|10bb|bbbb]
    const __m256i t4 = _mm256_or_si256(t3, v_c080);

    // 2. merge ASCII and 2-byte codewords

    // no bits set above 7th bit
    const __m256i one_byte_bytemask =
        _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
    const uint32_t one_byte_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));

    const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);

    // 3. prepare bitmask for 8-bit lookup
    const uint32_t M0 = one_byte_bitmask & 0x55555555;
    const uint32_t M1 = M0 >> 7;
    const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
    // 4. pack the bytes

    const uint8_t *row =
        &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
    const uint8_t *row_2 =
        &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)]
                                                            [0];

    const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
    const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));

    const __m256i utf8_packed = _mm256_shuffle_epi8(
        utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
    // 5. store bytes
    _mm_storeu_si128((__m128i *)utf8_output,
                     _mm256_castsi256_si128(utf8_packed));
    utf8_output += row[0];
    _mm_storeu_si128((__m128i *)utf8_output,
                     _mm256_extractf128_si256(utf8_packed, 1));
    utf8_output += row_2[0];

    // 6. adjust pointers
    latin1_input += 16;
    continue;

  } // while
  return std::make_pair(latin1_input, utf8_output);
}
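// Worked example of the 2-byte packing above (illustrative only): Latin1
// 0xE9 ('é') is 11 101001, so aa = 11 and bbbbbb = 101001; OR-ing in 0xc080
// yields [1100 0011|1010 1001] = 0xC3 0xA9, the UTF-8 encoding of U+00E9.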
/* end file src/haswell/avx2_convert_latin1_to_utf8.cpp */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
/* begin file src/haswell/avx2_convert_latin1_to_utf16.cpp */
template <endianness big_endian>
std::pair<const char *, char16_t *>
avx2_convert_latin1_to_utf16(const char *latin1_input, size_t len,
                             char16_t *utf16_output) {
  size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16

  size_t i = 0;
  for (; i < rounded_len; i += 16) {
    // Load 16 bytes from the address (latin1_input + i) into a xmm register
    const __m128i latin1 =
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(latin1_input + i));

    // Zero extend each byte of `latin1` to a 16-bit word
    __m256i utf16 = _mm256_cvtepu8_epi16(latin1);

    if (big_endian) {
      const __m128i swap128 =
          _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
      const __m256i swap = _mm256_set_m128i(swap128, swap128);
      utf16 = _mm256_shuffle_epi8(utf16, swap);
    }

    // Store the contents of the ymm register at (utf16_output + i)
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf16_output + i), utf16);
  }

  return std::make_pair(latin1_input + rounded_len, utf16_output + rounded_len);
}
/* end file src/haswell/avx2_convert_latin1_to_utf16.cpp */
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
/* begin file src/haswell/avx2_convert_latin1_to_utf32.cpp */
std::pair<const char *, char32_t *>
avx2_convert_latin1_to_utf32(const char *buf, size_t len,
                             char32_t *utf32_output) {
  size_t rounded_len = ((len | 7) ^ 7); // Round down to nearest multiple of 8

  for (size_t i = 0; i < rounded_len; i += 8) {
    // Load 8 Latin1 characters into a 64-bit register
    __m128i in = _mm_loadl_epi64((__m128i *)&buf[i]);

    // Zero extend each set of 8 Latin1 characters to 8 32-bit integers using
    // vpmovzxbd
    __m256i out = _mm256_cvtepu8_epi32(in);

    // Store the results back to memory
    _mm256_storeu_si256((__m256i *)&utf32_output[i], out);
  }

  // return pointers pointing to where we left off
  return std::make_pair(buf + rounded_len, utf32_output + rounded_len);
}
/* end file src/haswell/avx2_convert_latin1_to_utf32.cpp */
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
/* begin file src/haswell/avx2_convert_utf8_to_utf16.cpp */
// depends on "tables/utf8_to_utf16_tables.h"

// Convert up to 12 bytes from utf8 to utf16 using a mask indicating the
// end of the code points. Only the least significant 12 bits of the mask
// are accessed.
// It returns how many bytes were consumed (up to 12).
template <endianness big_endian>
size_t convert_masked_utf8_to_utf16(const char *input,
                                    uint64_t utf8_end_of_code_point_mask,
                                    char16_t *&utf16_output) {
  // we use an approach where we try to process up to 12 input bytes.
  // Why 12 input bytes and not 16? Because we are concerned with the size of
  // the lookup tables. Also 12 is nicely divisible by two and three.
  //
  //
  // Optimization note: our main path below is load-latency dependent. Thus it
  // is maybe beneficial to have fast paths that depend on branch prediction
  // but have less latency. This results in more instructions but,
  // potentially, also higher speeds.
  //
  // We first try a few fast paths.
  const __m128i swap =
      _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
  const __m128i in = _mm_loadu_si128((__m128i *)input);
  const uint16_t input_utf8_end_of_code_point_mask =
      utf8_end_of_code_point_mask & 0xfff;
  if (utf8_end_of_code_point_mask == 0xfff) {
    // We process the data in chunks of 12 bytes.
    __m256i ascii = _mm256_cvtepu8_epi16(in);
    if (big_endian) {
      const __m256i swap256 = _mm256_setr_epi8(
          1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
          21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
      ascii = _mm256_shuffle_epi8(ascii, swap256);
    }
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf16_output), ascii);
    utf16_output += 12; // We wrote 12 16-bit characters.
    return 12;          // We consumed 12 bytes.
  }
  if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
    // We want to take 8 2-byte UTF-8 code units and turn them into 8 2-byte
    // UTF-16 code units. There is probably a more efficient sequence, but the
    // following might do.
    const __m128i sh =
        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
    __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
    if (big_endian)
      composed = _mm_shuffle_epi8(composed, swap);
    _mm_storeu_si128((__m128i *)utf16_output, composed);
    utf16_output += 8; // We wrote 16 bytes, 8 code points.
    return 16;
  }
  if (input_utf8_end_of_code_point_mask == 0x924) {
    // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte
    // UTF-16 code units. There is probably a more efficient sequence, but the
    // following might do.
    const __m128i sh =
        _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii =
        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
    const __m128i middlebyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
    const __m128i highbyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
    const __m128i composed =
        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
    __m128i composed_repacked = _mm_packus_epi32(composed, composed);
    if (big_endian)
      composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
    _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
    utf16_output += 4;
    return 12;
  }

  const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
      [input_utf8_end_of_code_point_mask][0];
  const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
      [input_utf8_end_of_code_point_mask][1];
  if (idx < 64) {
    // SIX (6) input code units
    // this is a relatively easy scenario
    // we process SIX (6) input code units. The max length in bytes of six
    // code units spanning between 1 and 2 bytes each is 12 bytes. On
    // processors where pdep/pext is fast, we might be able to use a small
    // lookup table.
    const __m128i sh = _mm_loadu_si128(
        (const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
    __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
    if (big_endian)
      composed = _mm_shuffle_epi8(composed, swap);
    _mm_storeu_si128((__m128i *)utf16_output, composed);
    utf16_output += 6; // We wrote 12 bytes, 6 code points. There is a potential
                       // overflow of 4 bytes.
  } else if (idx < 145) {
    // FOUR (4) input code units
    const __m128i sh = _mm_loadu_si128(
        (const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii =
        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
    const __m128i middlebyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
    const __m128i highbyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
    const __m128i composed =
        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
    __m128i composed_repacked = _mm_packus_epi32(composed, composed);
    if (big_endian)
      composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
    _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
    utf16_output += 4; // Here we overflow by 8 bytes.
  } else if (idx < 209) {
    // TWO (2) input code units
    //////////////
    // There might be garbage inputs where a leading byte masquerades as a
    // four-byte leading byte (by being followed by 3 continuation bytes), but
    // is not greater than 0xf0. This could trigger a buffer overflow if we
    // only counted leading bytes of the form 0xf0 as generating surrogate
    // pairs, without further UTF-8 validation. Thus we must be careful to
    // ensure that only leading bytes at least as large as 0xf0 generate
    // surrogate pairs. We do so at the cost of an extra mask.
    /////////////
    const __m128i sh = _mm_loadu_si128(
        (const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
    const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
    __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
    // correct for spurious high bit
    const __m128i correct =
        _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
    middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
    const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
    // We deliberately carry the leading four bits in highbyte if they are
    // present, we remove them later when computing hightenbits.
    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0xff000000));
    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
    // When we need to generate a surrogate pair (leading byte >= 0xF0), the
    // corresponding 32-bit value in 'composed' will be greater than
    // (0xf0000000 >> 6) = 0x3c00000. This can be used later to identify the
    // location of the surrogate pairs.
    const __m128i composed =
        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
                     _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
    const __m128i composedminus =
        _mm_sub_epi32(composed, _mm_set1_epi32(0x10000));
    const __m128i lowtenbits =
        _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff));
    // Notice the 0x3ff mask:
    const __m128i hightenbits =
        _mm_and_si128(_mm_srli_epi32(composedminus, 10), _mm_set1_epi32(0x3ff));
    const __m128i lowtenbitsadd =
        _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00));
    const __m128i hightenbitsadd =
        _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800));
    const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16);
    __m128i surrogates = _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted);
    uint32_t basic_buffer[4];
    uint32_t basic_buffer_swap[4];
    if (big_endian) {
      _mm_storeu_si128((__m128i *)basic_buffer_swap,
                       _mm_shuffle_epi8(composed, swap));
      surrogates = _mm_shuffle_epi8(surrogates, swap);
    }
    _mm_storeu_si128((__m128i *)basic_buffer, composed);
    uint32_t surrogate_buffer[4];
    _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates);
    for (size_t i = 0; i < 3; i++) {
      if (basic_buffer[i] > 0x3c00000) {
        utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
        utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
        utf16_output += 2;
      } else {
        utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i])
                                     : uint16_t(basic_buffer[i]);
        utf16_output++;
      }
    }
  } else {
    // here we know that there is an error but we do not handle errors
  }
  return consumed;
}
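// The surrogate arithmetic above mirrors the standard scalar recipe for a
// code point cp >= 0x10000 (illustrative only):
//
//   uint32_t v = cp - 0x10000;                    // 20 significant bits
//   uint16_t hi = uint16_t(0xD800 + (v >> 10));   // leading surrogate
//   uint16_t lo = uint16_t(0xDC00 + (v & 0x3FF)); // trailing surrogate
//
// The vector path computes both halves for every lane and keeps them only
// where 'composed' marks a four-byte sequence.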
/* end file src/haswell/avx2_convert_utf8_to_utf16.cpp */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
/* begin file src/haswell/avx2_convert_utf8_to_utf32.cpp */
// depends on "tables/utf8_to_utf16_tables.h"

// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
// end of the code points. Only the least significant 12 bits of the mask
// are accessed.
// It returns how many bytes were consumed (up to 12).
size_t convert_masked_utf8_to_utf32(const char *input,
                                    uint64_t utf8_end_of_code_point_mask,
                                    char32_t *&utf32_output) {
  // we use an approach where we try to process up to 12 input bytes.
  // Why 12 input bytes and not 16? Because we are concerned with the size of
  // the lookup tables. Also 12 is nicely divisible by two and three.
  //
  //
  // Optimization note: our main path below is load-latency dependent. Thus it
  // is maybe beneficial to have fast paths that depend on branch prediction
  // but have less latency. This results in more instructions but,
  // potentially, also higher speeds.
  //
  // We first try a few fast paths.
  const __m128i in = _mm_loadu_si128((__m128i *)input);
  const uint16_t input_utf8_end_of_code_point_mask =
      utf8_end_of_code_point_mask & 0xfff;
  if (utf8_end_of_code_point_mask == 0xfff) {
    // We process the data in chunks of 12 bytes.
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output),
                        _mm256_cvtepu8_epi32(in));
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8),
                        _mm256_cvtepu8_epi32(_mm_srli_si128(in, 8)));
    utf32_output += 12; // We wrote 12 32-bit characters.
    return 12;          // We consumed 12 bytes.
  }
  if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
    // We want to take 8 2-byte UTF-8 code units and turn them into 8 4-byte
    // UTF-32 code units. There is probably a more efficient sequence, but the
    // following might do.
    const __m128i sh =
        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
    const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
    _mm256_storeu_si256((__m256i *)utf32_output,
                        _mm256_cvtepu16_epi32(composed));
    utf32_output += 8; // We wrote 16 bytes, 8 code points.
    return 16;
  }
  if (input_utf8_end_of_code_point_mask == 0x924) {
    // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte
    // UTF-32 code units. There is probably a more efficient sequence, but the
    // following might do.
    const __m128i sh =
        _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii =
        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
    const __m128i middlebyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
    const __m128i highbyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
    const __m128i composed =
        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
    _mm_storeu_si128((__m128i *)utf32_output, composed);
    utf32_output += 4;
    return 12;
  }
  /// We do not have a fast path available, so we fall back.

  const uint8_t idx =
      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
  const uint8_t consumed =
      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
  if (idx < 64) {
    // SIX (6) input code units
    // this is a relatively easy scenario
    // we process SIX (6) input code units. The max length in bytes of six
    // code units spanning between 1 and 2 bytes each is 12 bytes. On
    // processors where pdep/pext is fast, we might be able to use a small
    // lookup table.
    const __m128i sh =
        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
    const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
    _mm256_storeu_si256((__m256i *)utf32_output,
                        _mm256_cvtepu16_epi32(composed));
    utf32_output += 6; // We wrote 24 bytes, 6 code points. There is a potential
                       // overflow of 32 - 24 = 8 bytes.
  } else if (idx < 145) {
    // FOUR (4) input code units
    const __m128i sh =
        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii =
        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
    const __m128i middlebyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
    const __m128i highbyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
    const __m128i composed =
        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
    _mm_storeu_si128((__m128i *)utf32_output, composed);
    utf32_output += 4;
  } else if (idx < 209) {
    // TWO (2) input code units
    const __m128i sh =
        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
    const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
    __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
    // correct for spurious high bit
    const __m128i correct =
        _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
    middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
    const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
    const __m128i composed =
        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
                     _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
    _mm_storeu_si128((__m128i *)utf32_output, composed);
    utf32_output +=
        3; // We wrote 3 * 4 bytes, there is a potential overflow of 4 bytes.
  } else {
    // here we know that there is an error but we do not handle errors
  }
  return consumed;
}
/* end file src/haswell/avx2_convert_utf8_to_utf32.cpp */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
/* begin file src/haswell/avx2_convert_utf16_to_latin1.cpp */
template <endianness big_endian>
std::pair<const char16_t *, char *>
avx2_convert_utf16_to_latin1(const char16_t *buf, size_t len,
                             char *latin1_output) {
  const char16_t *end = buf + len;
  while (end - buf >= 32) {
    // Load 32 UTF-16 characters into two 256-bit AVX2 registers
    __m256i in0 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf));
    __m256i in1 =
        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + 16));

    if (!match_system(big_endian)) {
      const __m256i swap = _mm256_setr_epi8(
          1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
          21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
      in0 = _mm256_shuffle_epi8(in0, swap);
      in1 = _mm256_shuffle_epi8(in1, swap);
    }

    __m256i high_byte_mask = _mm256_set1_epi16((int16_t)0xFF00);
    if (_mm256_testz_si256(_mm256_or_si256(in0, in1), high_byte_mask)) {
      // Pack 16-bit characters into 8-bit and store in latin1_output
      const __m256i packed = _mm256_packus_epi16(in0, in1);

      const __m256i result = _mm256_permute4x64_epi64(packed, 0b11011000);

      _mm256_storeu_si256(reinterpret_cast<__m256i *>(latin1_output), result);
      // Adjust pointers for the next iteration
      buf += 32;
      latin1_output += 32;
    } else {
      return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output));
    }
  } // while
  return std::make_pair(buf, latin1_output);
}

template <endianness big_endian>
std::pair<result, char *>
avx2_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
                                         char *latin1_output) {
  const char16_t *start = buf;
  const char16_t *end = buf + len;
  while (end - buf >= 16) {
    __m256i in = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf));

    if (!match_system(big_endian)) {
      const __m256i swap = _mm256_setr_epi8(
          1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
          21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
      in = _mm256_shuffle_epi8(in, swap);
    }

    __m256i high_byte_mask = _mm256_set1_epi16((int16_t)0xFF00);
    if (_mm256_testz_si256(in, high_byte_mask)) {
      __m128i lo = _mm256_extractf128_si256(in, 0);
      __m128i hi = _mm256_extractf128_si256(in, 1);
      __m128i latin1_packed_lo = _mm_packus_epi16(lo, lo);
      __m128i latin1_packed_hi = _mm_packus_epi16(hi, hi);
      _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output),
                       latin1_packed_lo);
      _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output + 8),
                       latin1_packed_hi);
      buf += 16;
      latin1_output += 16;
    } else {
      // Fallback to scalar code for handling errors
      for (int k = 0; k < 16; k++) {
        uint16_t word =
            !match_system(big_endian) ? scalar::u16_swap_bytes(buf[k]) : buf[k];
        if (word <= 0xff) {
          *latin1_output++ = char(word);
        } else {
          return std::make_pair(
              result{error_code::TOO_LARGE, (size_t)(buf - start + k)},
              latin1_output);
        }
      }
      buf += 16;
    }
  } // while
  return std::make_pair(result{error_code::SUCCESS, (size_t)(buf - start)},
                        latin1_output);
}
/* end file src/haswell/avx2_convert_utf16_to_latin1.cpp */
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
/* begin file src/haswell/avx2_convert_utf16_to_utf8.cpp */
/*
  The vectorized algorithm works on a single SSE register, i.e., it
  loads eight 16-bit code units.

  We consider three cases:
  1. an input register contains no surrogates and each value
     is in range 0x0000 .. 0x07ff.
  2. an input register contains no surrogates and values are
     in range 0x0000 .. 0xffff.
  3. an input register contains surrogates --- i.e. codepoints
     can have 16 or 32 bits.

  Ad 1.

  When values are less than 0x0800, it means that a 16-bit code unit
  can be converted into: 1) a single UTF8 byte (when it is an ASCII
  char) or 2) two UTF8 bytes.

  For this case we do only some shuffle to obtain these 2-byte
  codes and finally compress the whole SSE register with a single
  shuffle.

  We need a 256-entry lookup table to get a compression pattern
  and the number of output bytes in the compressed vector register.
  Each entry occupies 17 bytes.

  Ad 2.

  When values fit in 16-bit code units, but are above 0x07ff, then
  a single word may produce one, two or three UTF8 bytes.

  We prepare data for all these three cases in two registers.
  The first register contains lower two UTF8 bytes (used in all
  cases), while the second one contains just the third byte for
  the three-UTF8-bytes case.

  Finally these two registers are interleaved forming an eight-element
  array of 32-bit values. The array spans two SSE registers.
  The bytes from the registers are compressed using two shuffles.

  We need a 256-entry lookup table to get a compression pattern
  and the number of output bytes in the compressed vector register.
  Each entry occupies 17 bytes.


  To summarize:
  - We need two 256-entry tables that have 8704 bytes in total.
*/
|
|
|
|
/*
|
|
Returns a pair: the first unprocessed byte from buf and utf8_output
|
|
A scalar routing should carry on the conversion of the tail.
|
|
*/
|
|
template <endianness big_endian>
std::pair<const char16_t *, char *>
avx2_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_output) {
  const char16_t *end = buf + len;
  const __m256i v_0000 = _mm256_setzero_si256();
  const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
  const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
  const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
  const size_t safety_margin =
      12; // to avoid overruns, see issue
          // https://github.com/simdutf/simdutf/issues/92

  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
    __m256i in = _mm256_loadu_si256((__m256i *)buf);
    if (big_endian) {
      const __m256i swap = _mm256_setr_epi8(
          1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
          21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
      in = _mm256_shuffle_epi8(in, swap);
    }
    // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
    const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
    if (_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!!
      // 1. pack the bytes
      const __m128i utf8_packed = _mm_packus_epi16(
          _mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1));
      // 2. store (16 bytes)
      _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
      // 3. adjust pointers
      buf += 16;
      utf8_output += 16;
      continue; // we are done for this round!
    }
    // no bits set above 7th bit
    const __m256i one_byte_bytemask =
        _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
    const uint32_t one_byte_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));

    // no bits set above 11th bit
    const __m256i one_or_two_bytes_bytemask =
        _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
    const uint32_t one_or_two_bytes_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
    if (one_or_two_bytes_bitmask == 0xffffffff) {

      // 1. prepare 2-byte values
      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
      // expected output   : [110a|aaaa|10bb|bbbb] x 8
      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);

      // t0 = [000a|aaaa|bbbb|bb00]
      const __m256i t0 = _mm256_slli_epi16(in, 2);
      // t1 = [000a|aaaa|0000|0000]
      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
      // t2 = [0000|0000|00bb|bbbb]
      const __m256i t2 = _mm256_and_si256(in, v_003f);
      // t3 = [000a|aaaa|00bb|bbbb]
      const __m256i t3 = _mm256_or_si256(t1, t2);
      // t4 = [110a|aaaa|10bb|bbbb]
      const __m256i t4 = _mm256_or_si256(t3, v_c080);

      // 2. merge ASCII and 2-byte codewords
      const __m256i utf8_unpacked =
          _mm256_blendv_epi8(t4, in, one_byte_bytemask);

      // 3. prepare bitmask for 8-bit lookup
      const uint32_t M0 = one_byte_bitmask & 0x55555555;
      const uint32_t M1 = M0 >> 7;
      const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
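      // (M2 packs the per-word one-byte flags into one byte per eight words:
      // words 0-3 land in the even bits and words 4-7 in the odd bits, and
      // likewise for words 8-15 in bits 16-23; each byte then selects a row
      // of pack_1_2_utf8_bytes.)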
      // 4. pack the bytes

      const uint8_t *row =
          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
      const uint8_t *row_2 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
                                                                       16)][0];

      const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
      const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));

      const __m256i utf8_packed = _mm256_shuffle_epi8(
          utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
      // 5. store bytes
      _mm_storeu_si128((__m128i *)utf8_output,
                       _mm256_castsi256_si128(utf8_packed));
      utf8_output += row[0];
      _mm_storeu_si128((__m128i *)utf8_output,
                       _mm256_extractf128_si256(utf8_packed, 1));
      utf8_output += row_2[0];

      // 6. adjust pointers
      buf += 16;
      continue;
    }
    // 1. Check if there are any surrogate words in the input chunk.
    //    We also have to deal with the situation when there is a surrogate
    //    word at the end of a chunk.
    const __m256i surrogates_bytemask =
        _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);

    // bitmask = 0x00000000 if there are no surrogates
    //         = 0xc0000000 if the last word is a surrogate
    const uint32_t surrogates_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
    // It might seem like checking for surrogates_bitmask == 0xc0000000 could
    // help. However, it is likely an uncommon occurrence.
    if (surrogates_bitmask == 0x00000000) {
      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
      const __m256i dup_even = _mm256_setr_epi16(
          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);

      /* In this branch we handle three cases:
         1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
            single UTF-8 byte
         2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              -
            two UTF-8 bytes
         3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
            three UTF-8 bytes

         We expand the input word (16-bit) into two code units (32-bit), thus
         we have room for four bytes. However, we need five distinct bit
         layouts. Note that the last byte in cases #2 and #3 is the same.

         We precompute byte 1 for case #1 and the common byte for cases #2 & #3
         in register t2.

         We precompute byte 1 for case #3 and -- **conditionally** -- precompute
         either byte 1 for case #2 or byte 2 for case #3. Note that they
         differ by exactly one bit.

         Finally from these two code units we build a proper UTF-8 sequence,
         taking into account the case (i.e., the number of bytes to write).
      */
      /**
       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
       * t2 => [0ccc|cccc] [10cc|cccc]
       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
       */
#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
      const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
      const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
      const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));

      // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
      const __m256i s0 = _mm256_srli_epi16(in, 4);
      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
      const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
      const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
      const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask,
                                             simdutf_vec(0b0100000000000000));
      const __m256i s4 = _mm256_xor_si256(s3, m0);
#undef simdutf_vec

      // 4. expand code units 16-bit => 32-bit
      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);

      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
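      // (Each 16-bit word owns two bits of `mask`: the even bit is set when
      // the word needs a single UTF-8 byte, the odd bit when it needs at most
      // two; a cleared pair means three bytes. Every 8-bit slice therefore
      // describes four words and selects a row of pack_1_2_3_utf8_bytes.)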
      // Due to the wider registers, the following path is less likely to be
      // useful.
      /*if(mask == 0) {
        // We only have three-byte code units. Use fast path.
        const __m256i shuffle =
            _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
                             2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
        const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
        const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
        utf8_output += 12;
        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
        utf8_output += 12;
        _mm_storeu_si128((__m128i*)utf8_output,
                         _mm256_extractf128_si256(utf8_0,1));
        utf8_output += 12;
        _mm_storeu_si128((__m128i*)utf8_output,
                         _mm256_extractf128_si256(utf8_1,1));
        utf8_output += 12;
        buf += 16;
        continue;
      }*/
      const uint8_t mask0 = uint8_t(mask);
      const uint8_t *row0 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
      const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
      const __m128i utf8_0 =
          _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);

      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
      const uint8_t *row1 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
      const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
      const __m128i utf8_1 =
          _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);

      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
      const uint8_t *row2 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
      const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
      const __m128i utf8_2 =
          _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);

      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
      const uint8_t *row3 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
      const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
      const __m128i utf8_3 =
          _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);

      _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
      utf8_output += row0[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
      utf8_output += row1[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
      utf8_output += row2[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
      utf8_output += row3[0];
      buf += 16;
      // surrogate pair(s) in a register
    } else {
      // Let us do a scalar fallback.
      // It may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
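      // (The cap of 15 means the loop never reads buf[k + 1] beyond the 16
      // code units examined in this round; the clamp additionally guards that
      // read against running past `end`.)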
      for (; k < forward; k++) {
        uint16_t word = big_endian ? scalar::u16_swap_bytes(buf[k]) : buf[k];
        if ((word & 0xFF80) == 0) {
          *utf8_output++ = char(word);
        } else if ((word & 0xF800) == 0) {
          *utf8_output++ = char((word >> 6) | 0b11000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else if ((word & 0xF800) != 0xD800) {
          *utf8_output++ = char((word >> 12) | 0b11100000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else {
          // must be a surrogate pair
          uint16_t diff = uint16_t(word - 0xD800);
          uint16_t next_word =
              big_endian ? scalar::u16_swap_bytes(buf[k + 1]) : buf[k + 1];
          k++;
          uint16_t diff2 = uint16_t(next_word - 0xDC00);
          if ((diff | diff2) > 0x3FF) {
            return std::make_pair(nullptr, utf8_output);
          }
          uint32_t value = (diff << 10) + diff2 + 0x10000;
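          // (Worked example: U+1F600 is encoded as 0xD83D 0xDE00, so
          // diff = 0x3D, diff2 = 0x200 and
          // (0x3D << 10) + 0x200 + 0x10000 = 0x1F600.)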
          *utf8_output++ = char((value >> 18) | 0b11110000);
          *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
          *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((value & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while
  return std::make_pair(buf, utf8_output);
}

/*
  Returns a pair: a result struct and utf8_output.
  If there is an error, the count field of the result is the position of the
  error. Otherwise, it is the position of the first unprocessed byte in buf
  (even if finished). A scalar routine should carry on the conversion of the
  tail if needed.
*/
template <endianness big_endian>
std::pair<result, char *>
avx2_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len,
                                       char *utf8_output) {
  const char16_t *start = buf;
  const char16_t *end = buf + len;

  const __m256i v_0000 = _mm256_setzero_si256();
  const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
  const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
  const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
  const size_t safety_margin =
      12; // to avoid overruns, see issue
          // https://github.com/simdutf/simdutf/issues/92

  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
    __m256i in = _mm256_loadu_si256((__m256i *)buf);
    if (big_endian) {
      const __m256i swap = _mm256_setr_epi8(
          1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
          21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
      in = _mm256_shuffle_epi8(in, swap);
    }
    // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
    const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
    if (_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!!
      // 1. pack the bytes
      const __m128i utf8_packed = _mm_packus_epi16(
          _mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1));
      // 2. store (16 bytes)
      _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
      // 3. adjust pointers
      buf += 16;
      utf8_output += 16;
      continue; // we are done for this round!
    }
    // no bits set above 7th bit
    const __m256i one_byte_bytemask =
        _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
    const uint32_t one_byte_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));

    // no bits set above 11th bit
    const __m256i one_or_two_bytes_bytemask =
        _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
    const uint32_t one_or_two_bytes_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
    if (one_or_two_bytes_bitmask == 0xffffffff) {

      // 1. prepare 2-byte values
      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
      // expected output   : [110a|aaaa|10bb|bbbb] x 8
      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);

      // t0 = [000a|aaaa|bbbb|bb00]
      const __m256i t0 = _mm256_slli_epi16(in, 2);
      // t1 = [000a|aaaa|0000|0000]
      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
      // t2 = [0000|0000|00bb|bbbb]
      const __m256i t2 = _mm256_and_si256(in, v_003f);
      // t3 = [000a|aaaa|00bb|bbbb]
      const __m256i t3 = _mm256_or_si256(t1, t2);
      // t4 = [110a|aaaa|10bb|bbbb]
      const __m256i t4 = _mm256_or_si256(t3, v_c080);

      // 2. merge ASCII and 2-byte codewords
      const __m256i utf8_unpacked =
          _mm256_blendv_epi8(t4, in, one_byte_bytemask);

      // 3. prepare bitmask for 8-bit lookup
      const uint32_t M0 = one_byte_bitmask & 0x55555555;
      const uint32_t M1 = M0 >> 7;
      const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
      // 4. pack the bytes

      const uint8_t *row =
          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
      const uint8_t *row_2 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
                                                                       16)][0];

      const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
      const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));

      const __m256i utf8_packed = _mm256_shuffle_epi8(
          utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
      // 5. store bytes
      _mm_storeu_si128((__m128i *)utf8_output,
                       _mm256_castsi256_si128(utf8_packed));
      utf8_output += row[0];
      _mm_storeu_si128((__m128i *)utf8_output,
                       _mm256_extractf128_si256(utf8_packed, 1));
      utf8_output += row_2[0];

      // 6. adjust pointers
      buf += 16;
      continue;
    }
    // 1. Check if there are any surrogate words in the input chunk.
    //    We also have to deal with the situation when there is a surrogate
    //    word at the end of a chunk.
    const __m256i surrogates_bytemask =
        _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);

    // bitmask = 0x00000000 if there are no surrogates
    //         = 0xc0000000 if the last word is a surrogate
    const uint32_t surrogates_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
    // It might seem like checking for surrogates_bitmask == 0xc0000000 could
    // help. However, it is likely an uncommon occurrence.
    if (surrogates_bitmask == 0x00000000) {
      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
      const __m256i dup_even = _mm256_setr_epi16(
          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);

      /* In this branch we handle three cases:
         1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
            single UTF-8 byte
         2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              -
            two UTF-8 bytes
         3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
            three UTF-8 bytes

         We expand the input word (16-bit) into two code units (32-bit), thus
         we have room for four bytes. However, we need five distinct bit
         layouts. Note that the last byte in cases #2 and #3 is the same.

         We precompute byte 1 for case #1 and the common byte for cases #2 & #3
         in register t2.

         We precompute byte 1 for case #3 and -- **conditionally** -- precompute
         either byte 1 for case #2 or byte 2 for case #3. Note that they
         differ by exactly one bit.

         Finally from these two code units we build a proper UTF-8 sequence,
         taking into account the case (i.e., the number of bytes to write).
      */
      /**
       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
       * t2 => [0ccc|cccc] [10cc|cccc]
       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
       */
#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
      const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
      const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
      const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));

      // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
      const __m256i s0 = _mm256_srli_epi16(in, 4);
      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
      const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
      const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
      const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask,
                                             simdutf_vec(0b0100000000000000));
      const __m256i s4 = _mm256_xor_si256(s3, m0);
#undef simdutf_vec

      // 4. expand code units 16-bit => 32-bit
      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);

      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
      // Due to the wider registers, the following path is less likely to be
      // useful.
      /*if(mask == 0) {
        // We only have three-byte code units. Use fast path.
        const __m256i shuffle =
            _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
                             2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
        const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
        const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
        utf8_output += 12;
        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
        utf8_output += 12;
        _mm_storeu_si128((__m128i*)utf8_output,
                         _mm256_extractf128_si256(utf8_0,1));
        utf8_output += 12;
        _mm_storeu_si128((__m128i*)utf8_output,
                         _mm256_extractf128_si256(utf8_1,1));
        utf8_output += 12;
        buf += 16;
        continue;
      }*/
      const uint8_t mask0 = uint8_t(mask);
      const uint8_t *row0 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
      const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
      const __m128i utf8_0 =
          _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);

      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
      const uint8_t *row1 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
      const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
      const __m128i utf8_1 =
          _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);

      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
      const uint8_t *row2 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
      const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
      const __m128i utf8_2 =
          _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);

      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
      const uint8_t *row3 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
      const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
      const __m128i utf8_3 =
          _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);

      _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
      utf8_output += row0[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
      utf8_output += row1[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
      utf8_output += row2[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
      utf8_output += row3[0];
      buf += 16;
      // surrogate pair(s) in a register
    } else {
      // Let us do a scalar fallback.
      // It may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint16_t word = big_endian ? scalar::u16_swap_bytes(buf[k]) : buf[k];
        if ((word & 0xFF80) == 0) {
          *utf8_output++ = char(word);
        } else if ((word & 0xF800) == 0) {
          *utf8_output++ = char((word >> 6) | 0b11000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else if ((word & 0xF800) != 0xD800) {
          *utf8_output++ = char((word >> 12) | 0b11100000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else {
          // must be a surrogate pair
          uint16_t diff = uint16_t(word - 0xD800);
          uint16_t next_word =
              big_endian ? scalar::u16_swap_bytes(buf[k + 1]) : buf[k + 1];
          k++;
          uint16_t diff2 = uint16_t(next_word - 0xDC00);
          if ((diff | diff2) > 0x3FF) {
            return std::make_pair(
                result(error_code::SURROGATE, buf - start + k - 1),
                utf8_output);
          }
          uint32_t value = (diff << 10) + diff2 + 0x10000;
          *utf8_output++ = char((value >> 18) | 0b11110000);
          *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
          *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((value & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while
  return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
}
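/*
  Usage sketch (illustrative only, not part of the library API surface): a
  caller typically finishes the tail with a scalar routine, e.g.

    std::pair<result, char *> r =
        avx2_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(src, n, dst);
    if (r.first.error == error_code::SUCCESS) {
      // convert the remaining code units src + r.first.count .. src + n
      // with a scalar converter
    } else {
      // r.first.count is the index of the offending code unit
    }
*/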
/* end file src/haswell/avx2_convert_utf16_to_utf8.cpp */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
/* begin file src/haswell/avx2_convert_utf16_to_utf32.cpp */
/*
    The vectorized algorithm works on a single SSE register, i.e., it
    loads eight 16-bit code units.

    We consider three cases:
    1. an input register contains no surrogates and each value
       is in range 0x0000 .. 0x07ff.
    2. an input register contains no surrogates and values are
       in range 0x0000 .. 0xffff.
    3. an input register contains surrogates --- i.e. codepoints
       can have 16 or 32 bits.

    Ad 1.

    When values are less than 0x0800, it means that a 16-bit code unit
    can be converted into: 1) a single UTF-8 byte (when it is an ASCII
    char) or 2) two UTF-8 bytes.

    For this case we do only some shuffle to obtain these 2-byte
    codes and finally compress the whole SSE register with a single
    shuffle.

    We need a 256-entry lookup table to get the compression pattern
    and the number of output bytes in the compressed vector register.
    Each entry occupies 17 bytes.

    Ad 2.

    When values fit in 16-bit code units, but are above 0x07ff, then
    a single word may produce one, two or three UTF-8 bytes.

    We prepare data for all these three cases in two registers.
    The first register contains the lower two UTF-8 bytes (used in all
    cases), while the second one contains just the third byte for
    the three-UTF-8-bytes case.

    Finally these two registers are interleaved forming an eight-element
    array of 32-bit values. The array spans two SSE registers.
    The bytes from the registers are compressed using two shuffles.

    We need a 256-entry lookup table to get the compression pattern
    and the number of output bytes in the compressed vector register.
    Each entry occupies 17 bytes.


    To summarize:
    - We need two 256-entry tables that have 8704 bytes in total.
*/

/*
  Returns a pair: the first unprocessed byte from buf and utf32_output
  A scalar routine should carry on the conversion of the tail.
*/
template <endianness big_endian>
std::pair<const char16_t *, char32_t *>
avx2_convert_utf16_to_utf32(const char16_t *buf, size_t len,
                            char32_t *utf32_output) {
  const char16_t *end = buf + len;
  const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
  const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);

  while (end - buf >= 16) {
    __m256i in = _mm256_loadu_si256((__m256i *)buf);
    if (big_endian) {
      const __m256i swap = _mm256_setr_epi8(
          1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
          21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
      in = _mm256_shuffle_epi8(in, swap);
    }

    // 1. Check if there are any surrogate words in the input chunk.
    //    We also have to deal with the situation when there is a surrogate
    //    word at the end of a chunk.
    const __m256i surrogates_bytemask =
        _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);

    // bitmask = 0x00000000 if there are no surrogates
    //         = 0xc0000000 if the last word is a surrogate
    const uint32_t surrogates_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
    // It might seem like checking for surrogates_bitmask == 0xc0000000 could
    // help. However, it is likely an uncommon occurrence.
    if (surrogates_bitmask == 0x00000000) {
      // case: we extend all sixteen 16-bit code units to sixteen 32-bit code
      // units
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output),
                          _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
      _mm256_storeu_si256(
          reinterpret_cast<__m256i *>(utf32_output + 8),
          _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in, 1)));
      utf32_output += 16;
      buf += 16;
      // surrogate pair(s) in a register
    } else {
      // Let us do a scalar fallback.
      // It may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint16_t word = big_endian ? scalar::u16_swap_bytes(buf[k]) : buf[k];
        if ((word & 0xF800) != 0xD800) {
          // No surrogate pair
          *utf32_output++ = char32_t(word);
        } else {
          // must be a surrogate pair
          uint16_t diff = uint16_t(word - 0xD800);
          uint16_t next_word =
              big_endian ? scalar::u16_swap_bytes(buf[k + 1]) : buf[k + 1];
          k++;
          uint16_t diff2 = uint16_t(next_word - 0xDC00);
          if ((diff | diff2) > 0x3FF) {
            return std::make_pair(nullptr, utf32_output);
          }
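          // (diff <= 0x3FF exactly when word is a high surrogate in
          // 0xD800..0xDBFF, and diff2 <= 0x3FF exactly when next_word is a
          // low surrogate in 0xDC00..0xDFFF; out-of-range values wrap around
          // to large 16-bit numbers, so one OR and one compare validate both.)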
          uint32_t value = (diff << 10) + diff2 + 0x10000;
          *utf32_output++ = char32_t(value);
        }
      }
      buf += k;
    }
  } // while
  return std::make_pair(buf, utf32_output);
}

/*
  Returns a pair: a result struct and utf32_output.
  If there is an error, the count field of the result is the position of the
  error. Otherwise, it is the position of the first unprocessed byte in buf
  (even if finished). A scalar routine should carry on the conversion of the
  tail if needed.
*/
template <endianness big_endian>
std::pair<result, char32_t *>
avx2_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len,
                                        char32_t *utf32_output) {
  const char16_t *start = buf;
  const char16_t *end = buf + len;
  const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
  const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);

  while (end - buf >= 16) {
    __m256i in = _mm256_loadu_si256((__m256i *)buf);
    if (big_endian) {
      const __m256i swap = _mm256_setr_epi8(
          1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
          21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
      in = _mm256_shuffle_epi8(in, swap);
    }

    // 1. Check if there are any surrogate words in the input chunk.
    //    We also have to deal with the situation when there is a surrogate
    //    word at the end of a chunk.
    const __m256i surrogates_bytemask =
        _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);

    // bitmask = 0x00000000 if there are no surrogates
    //         = 0xc0000000 if the last word is a surrogate
    const uint32_t surrogates_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
    // It might seem like checking for surrogates_bitmask == 0xc0000000 could
    // help. However, it is likely an uncommon occurrence.
    if (surrogates_bitmask == 0x00000000) {
      // case: we extend all sixteen 16-bit code units to sixteen 32-bit code
      // units
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output),
                          _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
      _mm256_storeu_si256(
          reinterpret_cast<__m256i *>(utf32_output + 8),
          _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in, 1)));
      utf32_output += 16;
      buf += 16;
      // surrogate pair(s) in a register
    } else {
      // Let us do a scalar fallback.
      // It may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint16_t word = big_endian ? scalar::u16_swap_bytes(buf[k]) : buf[k];
        if ((word & 0xF800) != 0xD800) {
          // No surrogate pair
          *utf32_output++ = char32_t(word);
        } else {
          // must be a surrogate pair
          uint16_t diff = uint16_t(word - 0xD800);
          uint16_t next_word =
              big_endian ? scalar::u16_swap_bytes(buf[k + 1]) : buf[k + 1];
          k++;
          uint16_t diff2 = uint16_t(next_word - 0xDC00);
          if ((diff | diff2) > 0x3FF) {
            return std::make_pair(
                result(error_code::SURROGATE, buf - start + k - 1),
                utf32_output);
          }
          uint32_t value = (diff << 10) + diff2 + 0x10000;
          *utf32_output++ = char32_t(value);
        }
      }
      buf += k;
    }
  } // while
  return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output);
}
/* end file src/haswell/avx2_convert_utf16_to_utf32.cpp */
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
/* begin file src/haswell/avx2_convert_utf32_to_latin1.cpp */
std::pair<const char32_t *, char *>
avx2_convert_utf32_to_latin1(const char32_t *buf, size_t len,
                             char *latin1_output) {
  const size_t rounded_len =
      len & ~0x1F; // Round down to nearest multiple of 32

  const __m256i high_bytes_mask = _mm256_set1_epi32(0xFFFFFF00);

  for (size_t i = 0; i < rounded_len; i += 4 * 8) {
    __m256i a = _mm256_loadu_si256((__m256i *)(buf + 0 * 8));
    __m256i b = _mm256_loadu_si256((__m256i *)(buf + 1 * 8));
    __m256i c = _mm256_loadu_si256((__m256i *)(buf + 2 * 8));
    __m256i d = _mm256_loadu_si256((__m256i *)(buf + 3 * 8));

    const __m256i check_combined =
        _mm256_or_si256(_mm256_or_si256(a, b), _mm256_or_si256(c, d));

    if (!_mm256_testz_si256(check_combined, high_bytes_mask)) {
      return std::make_pair(nullptr, latin1_output);
    }

    b = _mm256_slli_epi32(b, 1 * 8);
    c = _mm256_slli_epi32(c, 2 * 8);
    d = _mm256_slli_epi32(d, 3 * 8);

    // clang-format off

    // a = [.. .. .. a7|.. .. .. a6|.. .. .. a5|.. .. .. a4||.. .. .. a3|.. .. .. a2|.. .. .. a1|.. .. .. a0]
    // b = [.. .. b7 ..|.. .. b6 ..|.. .. b5 ..|.. .. b4 ..||.. .. b3 ..|.. .. b2 ..|.. .. b1 ..|.. .. b0 ..]
    // c = [.. c7 .. ..|.. c6 .. ..|.. c5 .. ..|.. c4 .. ..||.. c3 .. ..|.. c2 .. ..|.. c1 .. ..|.. c0 .. ..]
    // d = [d7 .. .. ..|d6 .. .. ..|d5 .. .. ..|d4 .. .. ..||d3 .. .. ..|d2 .. .. ..|d1 .. .. ..|d0 .. .. ..]

    // t0 = [d7 c7 b7 a7|d6 c6 b6 a6|d5 c5 b5 a5|d4 c4 b4 a4||d3 c3 b3 a3|d2 c2 b2 a2|d1 c1 b1 a1|d0 c0 b0 a0]
    const __m256i t0 =
        _mm256_or_si256(_mm256_or_si256(a, b), _mm256_or_si256(c, d));

    // shuffle bytes within 128-bit lanes
    // t1 = [d7 d6 d5 d4|c7 c6 c5 c4|b7 b6 b5 b4|a7 a6 a5 a4||d3 d2 d1 d0|c3 c2 c1 c0|b3 b2 b1 b0|a3 a2 a1 a0]
    const __m256i shuffle_bytes =
        _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                         0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);

    const __m256i t1 = _mm256_shuffle_epi8(t0, shuffle_bytes);

    // reshuffle dwords
    // t2 = [d7 d6 d5 d4|d3 d2 d1 d0|c7 c6 c5 c4|c3 c2 c1 c0||b7 b6 b5 b4|b3 b2 b1 b0|a7 a6 a5 a4|a3 a2 a1 a0]
    const __m256i shuffle_dwords = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
    const __m256i t2 = _mm256_permutevar8x32_epi32(t1, shuffle_dwords);
    // clang-format on
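    // (Net effect, reading the diagrams above: when stored to memory in
    // little-endian order, t2 holds the 32 Latin-1 bytes in source order
    // a0..a7, b0..b7, c0..c7, d0..d7.)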
    _mm256_storeu_si256((__m256i *)latin1_output, t2);

    latin1_output += 32;
    buf += 32;
  }

  return std::make_pair(buf, latin1_output);
}

std::pair<result, char *>
avx2_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
                                         char *latin1_output) {
  const size_t rounded_len =
      len & ~0x1F; // Round down to nearest multiple of 32

  const char32_t *start = buf;

  const __m256i high_bytes_mask = _mm256_set1_epi32(0xFFFFFF00);

  for (size_t i = 0; i < rounded_len; i += 4 * 8) {
    __m256i a = _mm256_loadu_si256((__m256i *)(buf + 0 * 8));
    __m256i b = _mm256_loadu_si256((__m256i *)(buf + 1 * 8));
    __m256i c = _mm256_loadu_si256((__m256i *)(buf + 2 * 8));
    __m256i d = _mm256_loadu_si256((__m256i *)(buf + 3 * 8));

    const __m256i check_combined =
        _mm256_or_si256(_mm256_or_si256(a, b), _mm256_or_si256(c, d));

    if (!_mm256_testz_si256(check_combined, high_bytes_mask)) {
      // Fallback to scalar code for handling errors
      for (int k = 0; k < 4 * 8; k++) {
        char32_t codepoint = buf[k];
        if (codepoint <= 0xFF) {
          *latin1_output++ = static_cast<char>(codepoint);
        } else {
          return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
                                latin1_output);
        }
      }
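      // (Since the vector check failed, at least one of these 32 code points
      // exceeds 0xFF, so the scalar loop above always returns TOO_LARGE; we
      // never fall through to the vector store with unvalidated input.)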
    }

    b = _mm256_slli_epi32(b, 1 * 8);
    c = _mm256_slli_epi32(c, 2 * 8);
    d = _mm256_slli_epi32(d, 3 * 8);

    const __m256i t0 =
        _mm256_or_si256(_mm256_or_si256(a, b), _mm256_or_si256(c, d));

    const __m256i shuffle_bytes =
        _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                         0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);

    const __m256i t1 = _mm256_shuffle_epi8(t0, shuffle_bytes);

    const __m256i shuffle_dwords = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
    const __m256i t2 = _mm256_permutevar8x32_epi32(t1, shuffle_dwords);

    _mm256_storeu_si256((__m256i *)latin1_output, t2);

    latin1_output += 32;
    buf += 32;
  }

  return std::make_pair(result(error_code::SUCCESS, buf - start),
                        latin1_output);
}
/* end file src/haswell/avx2_convert_utf32_to_latin1.cpp */
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
/* begin file src/haswell/avx2_convert_utf32_to_utf8.cpp */
std::pair<const char32_t *, char *>
avx2_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_output) {
  const char32_t *end = buf + len;
  const __m256i v_0000 = _mm256_setzero_si256();
  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
  const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
  const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
  const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
  const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
  __m256i running_max = _mm256_setzero_si256();
  __m256i forbidden_bytemask = _mm256_setzero_si256();

  const size_t safety_margin =
      12; // to avoid overruns, see issue
          // https://github.com/simdutf/simdutf/issues/92

  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
    __m256i in = _mm256_loadu_si256((__m256i *)buf);
    __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1);
    running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);

    // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
    // saturation
    __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff),
                                        _mm256_and_si256(nextin, v_7fffffff));
    in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);

    // Try to apply UTF-16 => UTF-8 routine on 256 bits
    // (haswell/avx2_convert_utf16_to_utf8.cpp)

    if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
      // 1. pack the bytes
      const __m128i utf8_packed = _mm_packus_epi16(
          _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
      // 2. store (16 bytes)
      _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
      // 3. adjust pointers
      buf += 16;
      utf8_output += 16;
      continue; // we are done for this round!
    }
    // no bits set above 7th bit
    const __m256i one_byte_bytemask =
        _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
    const uint32_t one_byte_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));

    // no bits set above 11th bit
    const __m256i one_or_two_bytes_bytemask =
        _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
    const uint32_t one_or_two_bytes_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
    if (one_or_two_bytes_bitmask == 0xffffffff) {
      // 1. prepare 2-byte values
      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
      // expected output   : [110a|aaaa|10bb|bbbb] x 8
      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);

      // t0 = [000a|aaaa|bbbb|bb00]
      const __m256i t0 = _mm256_slli_epi16(in_16, 2);
      // t1 = [000a|aaaa|0000|0000]
      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
      // t2 = [0000|0000|00bb|bbbb]
      const __m256i t2 = _mm256_and_si256(in_16, v_003f);
      // t3 = [000a|aaaa|00bb|bbbb]
      const __m256i t3 = _mm256_or_si256(t1, t2);
      // t4 = [110a|aaaa|10bb|bbbb]
      const __m256i t4 = _mm256_or_si256(t3, v_c080);

      // 2. merge ASCII and 2-byte codewords
      const __m256i utf8_unpacked =
          _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);

      // 3. prepare bitmask for 8-bit lookup
      const uint32_t M0 = one_byte_bitmask & 0x55555555;
      const uint32_t M1 = M0 >> 7;
      const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
      // 4. pack the bytes

      const uint8_t *row =
          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
      const uint8_t *row_2 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
                                                                       16)][0];

      const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
      const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));

      const __m256i utf8_packed = _mm256_shuffle_epi8(
          utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
      // 5. store bytes
      _mm_storeu_si128((__m128i *)utf8_output,
                       _mm256_castsi256_si128(utf8_packed));
      utf8_output += row[0];
      _mm_storeu_si128((__m128i *)utf8_output,
                       _mm256_extractf128_si256(utf8_packed, 1));
      utf8_output += row_2[0];

      // 6. adjust pointers
      buf += 16;
      continue;
    }
    // Must check for overflow in packing
    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(
        _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
    const uint32_t saturation_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
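    // (_mm256_packus_epi32 saturates any value above 0xFFFF to 0xFFFF, which
    // would silently corrupt such code points; the bitmask is all ones only
    // when every dword of both registers fits in 16 bits.)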
    if (saturation_bitmask == 0xffffffff) {
      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
      const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
      forbidden_bytemask = _mm256_or_si256(
          forbidden_bytemask,
          _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800));

      const __m256i dup_even = _mm256_setr_epi16(
          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);

      /* In this branch we handle three cases:
         1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
            single UTF-8 byte
         2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              -
            two UTF-8 bytes
         3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
            three UTF-8 bytes

         We expand the input word (16-bit) into two code units (32-bit), thus
         we have room for four bytes. However, we need five distinct bit
         layouts. Note that the last byte in cases #2 and #3 is the same.

         We precompute byte 1 for case #1 and the common byte for cases #2 & #3
         in register t2.

         We precompute byte 1 for case #3 and -- **conditionally** -- precompute
         either byte 1 for case #2 or byte 2 for case #3. Note that they
         differ by exactly one bit.

         Finally from these two code units we build a proper UTF-8 sequence,
         taking into account the case (i.e., the number of bytes to write).
      */
      /**
       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
       * t2 => [0ccc|cccc] [10cc|cccc]
       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
       */
#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
      const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
      const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
      const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));

      // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
      const __m256i s0 = _mm256_srli_epi16(in_16, 4);
      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
      const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
      const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
      const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask,
                                             simdutf_vec(0b0100000000000000));
      const __m256i s4 = _mm256_xor_si256(s3, m0);
#undef simdutf_vec

      // 4. expand code units 16-bit => 32-bit
      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);

      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
      // Due to the wider registers, the following path is less likely to be
      // useful.
      /*if(mask == 0) {
        // We only have three-byte code units. Use fast path.
        const __m256i shuffle =
            _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
                             2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
        const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
        const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
        utf8_output += 12;
        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
        utf8_output += 12;
        _mm_storeu_si128((__m128i*)utf8_output,
                         _mm256_extractf128_si256(utf8_0,1));
        utf8_output += 12;
        _mm_storeu_si128((__m128i*)utf8_output,
                         _mm256_extractf128_si256(utf8_1,1));
        utf8_output += 12;
        buf += 16;
        continue;
      }*/
      const uint8_t mask0 = uint8_t(mask);
      const uint8_t *row0 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
      const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
      const __m128i utf8_0 =
          _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);

      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
      const uint8_t *row1 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
      const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
      const __m128i utf8_1 =
          _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);

      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
      const uint8_t *row2 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
      const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
      const __m128i utf8_2 =
          _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);

      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
      const uint8_t *row3 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
      const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
      const __m128i utf8_3 =
          _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);

      _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
      utf8_output += row0[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
      utf8_output += row1[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
      utf8_output += row2[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
      utf8_output += row3[0];
      buf += 16;
    } else {
      // case: at least one 32-bit word is larger than 0xFFFF <=> it will
      // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem
      // wasteful to use scalar code, but being efficient with SIMD may
      // require large, non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint32_t word = buf[k];
        if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
          *utf8_output++ = char(word);
        } else if ((word & 0xFFFFF800) == 0) { // 2-byte
          *utf8_output++ = char((word >> 6) | 0b11000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else if ((word & 0xFFFF0000) == 0) { // 3-byte
          if (word >= 0xD800 && word <= 0xDFFF) {
            return std::make_pair(nullptr, utf8_output);
          }
          *utf8_output++ = char((word >> 12) | 0b11100000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else { // 4-byte
          if (word > 0x10FFFF) {
            return std::make_pair(nullptr, utf8_output);
          }
          *utf8_output++ = char((word >> 18) | 0b11110000);
          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while

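  // Validation was deferred: the loop above only accumulated running_max and
  // forbidden_bytemask, so out-of-range values and surrogates are detected
  // here, after the fact, without a position; the _with_errors variant below
  // reports positions instead.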
  // check for invalid input
  const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
  if (static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(
          _mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) {
    return std::make_pair(nullptr, utf8_output);
  }

  if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) {
    return std::make_pair(nullptr, utf8_output);
  }

  return std::make_pair(buf, utf8_output);
}

std::pair<result, char *>
avx2_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
                                       char *utf8_output) {
  const char32_t *end = buf + len;
  const char32_t *start = buf;

  const __m256i v_0000 = _mm256_setzero_si256();
  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
  const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
  const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
  const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
  const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
  const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);

  const size_t safety_margin =
      12; // to avoid overruns, see issue
          // https://github.com/simdutf/simdutf/issues/92

  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
    __m256i in = _mm256_loadu_si256((__m256i *)buf);
    __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1);
    // Check for too large input
    const __m256i max_input =
        _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
    if (static_cast<uint32_t>(_mm256_movemask_epi8(
            _mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) {
      return std::make_pair(result(error_code::TOO_LARGE, buf - start),
                            utf8_output);
    }

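    // (AVX2 has no unsigned 32-bit greater-than compare, so the check relies
    // on max(x, 0x10FFFF) == 0x10FFFF holding exactly when x <= 0x10FFFF.)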
// Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
|
|
// saturation
|
|
__m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff),
|
|
_mm256_and_si256(nextin, v_7fffffff));
|
|
in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
|
|
|
|
// Try to apply UTF-16 => UTF-8 routine on 256 bits
|
|
// (haswell/avx2_convert_utf16_to_utf8.cpp)
|
|
|
|
if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
|
|
// 1. pack the bytes
|
|
const __m128i utf8_packed = _mm_packus_epi16(
|
|
_mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
|
|
// 2. store (16 bytes)
|
|
_mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
|
|
// 3. adjust pointers
|
|
buf += 16;
|
|
utf8_output += 16;
|
|
continue; // we are done for this round!
|
|
}
|
|
// no bits set above 7th bit
|
|
const __m256i one_byte_bytemask =
|
|
_mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
|
|
const uint32_t one_byte_bitmask =
|
|
static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
|
|
|
|
// no bits set above 11th bit
|
|
const __m256i one_or_two_bytes_bytemask =
|
|
_mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
|
|
const uint32_t one_or_two_bytes_bitmask =
|
|
static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
|
|
if (one_or_two_bytes_bitmask == 0xffffffff) {
|
|
// 1. prepare 2-byte values
|
|
// input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
|
|
// expected output : [110a|aaaa|10bb|bbbb] x 8
|
|
const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
|
|
const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
|
|
|
|
// t0 = [000a|aaaa|bbbb|bb00]
|
|
const __m256i t0 = _mm256_slli_epi16(in_16, 2);
|
|
// t1 = [000a|aaaa|0000|0000]
|
|
const __m256i t1 = _mm256_and_si256(t0, v_1f00);
|
|
// t2 = [0000|0000|00bb|bbbb]
|
|
const __m256i t2 = _mm256_and_si256(in_16, v_003f);
|
|
// t3 = [000a|aaaa|00bb|bbbb]
|
|
const __m256i t3 = _mm256_or_si256(t1, t2);
|
|
// t4 = [110a|aaaa|10bb|bbbb]
|
|
const __m256i t4 = _mm256_or_si256(t3, v_c080);
|
|
|
|
// 2. merge ASCII and 2-byte codewords
|
|
const __m256i utf8_unpacked =
|
|
_mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
|
|
|
|
// 3. prepare bitmask for 8-bit lookup
|
|
const uint32_t M0 = one_byte_bitmask & 0x55555555;
|
|
const uint32_t M1 = M0 >> 7;
|
|
const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
|
|
// 4. pack the bytes
|
|
|
|
const uint8_t *row =
|
|
&simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
|
|
const uint8_t *row_2 =
|
|
&simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
|
|
16)][0];
|
|
|
|
const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
|
|
const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
|
|
|
|
const __m256i utf8_packed = _mm256_shuffle_epi8(
|
|
utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
|
|
// 5. store bytes
|
|
_mm_storeu_si128((__m128i *)utf8_output,
|
|
_mm256_castsi256_si128(utf8_packed));
|
|
utf8_output += row[0];
|
|
_mm_storeu_si128((__m128i *)utf8_output,
|
|
_mm256_extractf128_si256(utf8_packed, 1));
|
|
utf8_output += row_2[0];
|
|
|
|
// 6. adjust pointers
|
|
buf += 16;
|
|
continue;
|
|
}
|
|
// Must check for overflow in packing
|
|
const __m256i saturation_bytemask = _mm256_cmpeq_epi32(
|
|
_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
|
|
const uint32_t saturation_bitmask =
|
|
static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
|
|
if (saturation_bitmask == 0xffffffff) {
|
|
// case: code units from register produce either 1, 2 or 3 UTF-8 bytes
|
|
|
|
// Check for illegal surrogate code units
|
|
const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
|
|
const __m256i forbidden_bytemask =
|
|
_mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
|
|
if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) !=
|
|
0x0) {
|
|
return std::make_pair(result(error_code::SURROGATE, buf - start),
|
|
utf8_output);
|
|
}
|
|
|
|
const __m256i dup_even = _mm256_setr_epi16(
|
|
0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
|
|
0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
|
|
|
|
/* In this branch we handle three cases:
|
|
1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
|
|
single UFT-8 byte
|
|
2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two
|
|
UTF-8 bytes
|
|
3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
|
|
three UTF-8 bytes
|
|
|
|
We expand the input word (16-bit) into two code units (32-bit), thus
|
|
we have room for four bytes. However, we need five distinct bit
|
|
layouts. Note that the last byte in cases #2 and #3 is the same.
|
|
|
|
We precompute byte 1 for case #1 and the common byte for cases #2 & #3
|
|
in register t2.
|
|
|
|
We precompute byte 1 for case #3 and -- **conditionally** -- precompute
|
|
either byte 1 for case #2 or byte 2 for case #3. Note that they
|
|
differ by exactly one bit.
|
|
|
|
Finally from these two code units we build proper UTF-8 sequence, taking
|
|
into account the case (i.e, the number of bytes to write).
|
|
*/
|
|
/**
|
|
* Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
|
|
* t2 => [0ccc|cccc] [10cc|cccc]
|
|
* s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
|
|
*/
|
|
#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
|
|
// [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
|
|
const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
|
|
// [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
|
|
const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
|
|
// [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
|
|
const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
|
|
|
|
// [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
|
|
const __m256i s0 = _mm256_srli_epi16(in_16, 4);
|
|
// [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
|
|
const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
|
|
// [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
|
|
const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
|
|
// [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
|
|
const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
|
|
const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask,
|
|
simdutf_vec(0b0100000000000000));
|
|
const __m256i s4 = _mm256_xor_si256(s3, m0);
|
|
#undef simdutf_vec
|
|
|
|
// 4. expand code units 16-bit => 32-bit
|
|
const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
|
|
const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
|
|
|
|
// 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
|
|
const uint32_t mask = (one_byte_bitmask & 0x55555555) |
|
|
(one_or_two_bytes_bitmask & 0xaaaaaaaa);
|
|
// Due to the wider registers, the following path is less likely to be
|
|
// useful.
|
|
/*if(mask == 0) {
|
|
// We only have three-byte code units. Use fast path.
|
|
const __m256i shuffle =
|
|
_mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
|
|
2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 =
|
|
_mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 =
|
|
_mm256_shuffle_epi8(out1, shuffle);
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
|
|
utf8_output += 12;
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
|
|
utf8_output += 12;
|
|
_mm_storeu_si128((__m128i*)utf8_output,
|
|
_mm256_extractf128_si256(utf8_0,1)); utf8_output += 12;
|
|
_mm_storeu_si128((__m128i*)utf8_output,
|
|
_mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16;
|
|
continue;
|
|
}*/
|
|
const uint8_t mask0 = uint8_t(mask);
|
|
const uint8_t *row0 =
|
|
&simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
|
|
const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
|
|
const __m128i utf8_0 =
|
|
_mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
|
|
|
|
const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
|
|
const uint8_t *row1 =
|
|
&simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
|
|
const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
|
|
const __m128i utf8_1 =
|
|
_mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
|
|
|
|
const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
|
|
const uint8_t *row2 =
|
|
&simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
|
|
const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
|
|
const __m128i utf8_2 =
|
|
_mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
|
|
|
|
const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
|
|
const uint8_t *row3 =
|
|
&simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
|
|
const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
|
|
const __m128i utf8_3 =
|
|
_mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
|
|
|
|
_mm_storeu_si128((__m128i *)utf8_output, utf8_0);
|
|
utf8_output += row0[0];
|
|
_mm_storeu_si128((__m128i *)utf8_output, utf8_1);
|
|
utf8_output += row1[0];
|
|
_mm_storeu_si128((__m128i *)utf8_output, utf8_2);
|
|
utf8_output += row2[0];
|
|
_mm_storeu_si128((__m128i *)utf8_output, utf8_3);
|
|
utf8_output += row3[0];
|
|
buf += 16;
|
|
} else {
|
|
      // case: at least one 32-bit word is larger than 0xFFFF <=> it will
      // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem
      // wasteful to use scalar code, but being efficient with SIMD may
      // require large, non-trivial tables.
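      // For reference (illustrative values, not from the original source):
      // U+00E9 encodes as C3 A9 (2 bytes), U+20AC as E2 82 AC (3 bytes), and
      // U+1F600 as F0 9F 98 80 (4 bytes); the branches below emit exactly
      // these byte patterns.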
|
|
size_t forward = 15;
|
|
size_t k = 0;
|
|
if (size_t(end - buf) < forward + 1) {
|
|
forward = size_t(end - buf - 1);
|
|
}
|
|
for (; k < forward; k++) {
|
|
uint32_t word = buf[k];
|
|
if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
|
|
*utf8_output++ = char(word);
|
|
} else if ((word & 0xFFFFF800) == 0) { // 2-byte
|
|
*utf8_output++ = char((word >> 6) | 0b11000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else if ((word & 0xFFFF0000) == 0) { // 3-byte
|
|
if (word >= 0xD800 && word <= 0xDFFF) {
|
|
return std::make_pair(
|
|
result(error_code::SURROGATE, buf - start + k), utf8_output);
|
|
}
|
|
*utf8_output++ = char((word >> 12) | 0b11100000);
|
|
*utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else { // 4-byte
|
|
if (word > 0x10FFFF) {
|
|
return std::make_pair(
|
|
result(error_code::TOO_LARGE, buf - start + k), utf8_output);
|
|
}
|
|
*utf8_output++ = char((word >> 18) | 0b11110000);
|
|
*utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
} // while
|
|
|
|
return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
|
|
}
|
|
/* end file src/haswell/avx2_convert_utf32_to_utf8.cpp */
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
|
|
|
|
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
|
|
/* begin file src/haswell/avx2_convert_utf32_to_utf16.cpp */
|
|
template <endianness big_endian>
|
|
std::pair<const char32_t *, char16_t *>
|
|
avx2_convert_utf32_to_utf16(const char32_t *buf, size_t len,
|
|
char16_t *utf16_output) {
|
|
const char32_t *end = buf + len;
|
|
|
|
const size_t safety_margin =
|
|
12; // to avoid overruns, see issue
|
|
// https://github.com/simdutf/simdutf/issues/92
|
|
__m256i forbidden_bytemask = _mm256_setzero_si256();
|
|
|
|
const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
|
|
const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
|
|
const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
|
|
|
|
while (end - buf >= std::ptrdiff_t(8 + safety_margin)) {
|
|
const __m256i in = _mm256_loadu_si256((__m256i *)buf);
|
|
|
|
if (simdutf_likely(_mm256_testz_si256(in, v_ffff0000))) {
|
|
      // no bits set above the 16th bit <=> can pack to UTF-16
      // without surrogate pairs
|
|
forbidden_bytemask = _mm256_or_si256(
|
|
forbidden_bytemask,
|
|
_mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800));
|
|
|
|
__m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),
|
|
_mm256_extractf128_si256(in, 1));
|
|
if (big_endian) {
|
|
const __m128i swap =
|
|
_mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
|
|
}
|
|
_mm_storeu_si128((__m128i *)utf16_output, utf16_packed);
|
|
utf16_output += 8;
|
|
buf += 8;
|
|
} else {
|
|
size_t forward = 7;
|
|
size_t k = 0;
|
|
if (size_t(end - buf) < forward + 1) {
|
|
forward = size_t(end - buf - 1);
|
|
}
|
|
for (; k < forward; k++) {
|
|
uint32_t word = buf[k];
|
|
if ((word & 0xFFFF0000) == 0) {
|
|
// will not generate a surrogate pair
|
|
if (word >= 0xD800 && word <= 0xDFFF) {
|
|
return std::make_pair(nullptr, utf16_output);
|
|
}
|
|
*utf16_output++ =
|
|
big_endian
|
|
? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8))
|
|
: char16_t(word);
|
|
} else {
|
|
// will generate a surrogate pair
|
|
if (word > 0x10FFFF) {
|
|
return std::make_pair(nullptr, utf16_output);
|
|
}
|
|
word -= 0x10000;
|
|
uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
|
|
uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
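          // Worked example (illustrative): for U+1F600, word - 0x10000 =
          // 0xF600, so the high surrogate is 0xD800 + (0xF600 >> 10) = 0xD83D
          // and the low surrogate is 0xDC00 + (0xF600 & 0x3FF) = 0xDE00.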
|
|
if (big_endian) {
|
|
high_surrogate =
|
|
uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
|
|
low_surrogate =
|
|
uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
|
|
}
|
|
*utf16_output++ = char16_t(high_surrogate);
|
|
*utf16_output++ = char16_t(low_surrogate);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
}
|
|
|
|
// check for invalid input
|
|
if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) {
|
|
return std::make_pair(nullptr, utf16_output);
|
|
}
|
|
|
|
return std::make_pair(buf, utf16_output);
|
|
}
|
|
|
|
template <endianness big_endian>
|
|
std::pair<result, char16_t *>
|
|
avx2_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len,
|
|
char16_t *utf16_output) {
|
|
const char32_t *start = buf;
|
|
const char32_t *end = buf + len;
|
|
|
|
const size_t safety_margin =
|
|
12; // to avoid overruns, see issue
|
|
// https://github.com/simdutf/simdutf/issues/92
|
|
|
|
const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
|
|
const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
|
|
const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
|
|
|
|
while (end - buf >= std::ptrdiff_t(8 + safety_margin)) {
|
|
const __m256i in = _mm256_loadu_si256((__m256i *)buf);
|
|
|
|
if (simdutf_likely(_mm256_testz_si256(in, v_ffff0000))) {
|
|
      // no bits set above the 16th bit <=> can pack to UTF-16 without
      // surrogate pairs
|
|
const __m256i forbidden_bytemask =
|
|
_mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800);
|
|
if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) !=
|
|
0x0) {
|
|
return std::make_pair(result(error_code::SURROGATE, buf - start),
|
|
utf16_output);
|
|
}
|
|
|
|
__m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),
|
|
_mm256_extractf128_si256(in, 1));
|
|
if (big_endian) {
|
|
const __m128i swap =
|
|
_mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
|
|
}
|
|
_mm_storeu_si128((__m128i *)utf16_output, utf16_packed);
|
|
utf16_output += 8;
|
|
buf += 8;
|
|
} else {
|
|
size_t forward = 7;
|
|
size_t k = 0;
|
|
if (size_t(end - buf) < forward + 1) {
|
|
forward = size_t(end - buf - 1);
|
|
}
|
|
for (; k < forward; k++) {
|
|
uint32_t word = buf[k];
|
|
if ((word & 0xFFFF0000) == 0) {
|
|
// will not generate a surrogate pair
|
|
if (word >= 0xD800 && word <= 0xDFFF) {
|
|
return std::make_pair(
|
|
result(error_code::SURROGATE, buf - start + k), utf16_output);
|
|
}
|
|
*utf16_output++ =
|
|
big_endian
|
|
? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8))
|
|
: char16_t(word);
|
|
} else {
|
|
// will generate a surrogate pair
|
|
if (word > 0x10FFFF) {
|
|
return std::make_pair(
|
|
result(error_code::TOO_LARGE, buf - start + k), utf16_output);
|
|
}
|
|
word -= 0x10000;
|
|
uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
|
|
uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
|
|
if (big_endian) {
|
|
high_surrogate =
|
|
uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
|
|
low_surrogate =
|
|
uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
|
|
}
|
|
*utf16_output++ = char16_t(high_surrogate);
|
|
*utf16_output++ = char16_t(low_surrogate);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
}
|
|
|
|
return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
|
|
}
|
|
/* end file src/haswell/avx2_convert_utf32_to_utf16.cpp */
|
|
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
|
|
/* begin file src/haswell/avx2_convert_utf8_to_latin1.cpp */
|
|
// depends on "tables/utf8_to_utf16_tables.h"
|
|
|
|
// Convert up to 12 bytes from utf8 to latin1 using a mask indicating the
|
|
// end of the code points. Only the least significant 12 bits of the mask
|
|
// are accessed.
|
|
// It returns how many bytes were consumed (up to 12).
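// Illustrative example (not from the original source): for the input bytes
// 41 C3 A9 ("A" followed by U+00E9), code points end at byte offsets 0 and
// 2, so bits 0 and 2 of the mask are set (0b101).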
|
|
size_t convert_masked_utf8_to_latin1(const char *input,
|
|
uint64_t utf8_end_of_code_point_mask,
|
|
char *&latin1_output) {
|
|
// we use an approach where we try to process up to 12 input bytes.
|
|
// Why 12 input bytes and not 16? Because we are concerned with the size of
|
|
// the lookup tables. Also 12 is nicely divisible by two and three.
|
|
//
|
|
//
|
|
  // Optimization note: our main path below is load-latency dependent. Thus it
  // may be beneficial to have fast paths that depend on branch prediction but
  // have less latency. This results in more instructions but, potentially,
  // also higher speeds.
|
|
//
|
|
const __m128i in = _mm_loadu_si128((__m128i *)input);
|
|
|
|
const uint16_t input_utf8_end_of_code_point_mask =
|
|
utf8_end_of_code_point_mask &
|
|
0xfff; // we are only processing 12 bytes in case it is not all ASCII
|
|
|
|
if (utf8_end_of_code_point_mask == 0xfff) {
|
|
// We process the data in chunks of 12 bytes.
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output), in);
|
|
latin1_output += 12; // We wrote 12 characters.
|
|
    return 12; // We consumed 12 bytes.
|
|
}
|
|
  // We do not have a fast path available, so we fall back.
|
|
const uint8_t idx =
|
|
tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
|
|
const uint8_t consumed =
|
|
tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
|
|
// this indicates an invalid input:
|
|
if (idx >= 64) {
|
|
return consumed;
|
|
}
|
|
  // Here we should have (idx < 64); if not, there is a bug in the validation
  // or elsewhere. This is a relatively easy scenario: we process SIX (6)
  // input code units. The maximum length in bytes of six code units spanning
  // between 1 and 2 bytes each is 12 bytes. On processors where pdep/pext is
  // fast, we might be able to use a small lookup table.
|
|
const __m128i sh =
|
|
_mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
|
|
const __m128i perm = _mm_shuffle_epi8(in, sh);
|
|
const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
|
|
const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
|
|
__m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
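  // Worked example (our reading of the arithmetic above, which implies the
  // shuffle places the continuation byte in the low position): for U+00E9,
  // UTF-8 C3 A9, the shuffled 16-bit unit is 0xC3A9;
  // (0xC3A9 & 0x1F00) >> 2 = 0x00C0 and 0xC3A9 & 0x7F = 0x29, so
  // composed = 0x00E9, the Latin-1 code for é.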
|
|
const __m128i latin1_packed = _mm_packus_epi16(composed, composed);
|
|
  // Writing 8 bytes even though we only care about the first 6 bytes.
  // Performance note: it might be faster to use _mm_storeu_si128; we should
  // investigate.
|
|
_mm_storel_epi64((__m128i *)latin1_output, latin1_packed);
|
|
latin1_output += 6; // We wrote 6 bytes.
|
|
return consumed;
|
|
}
|
|
/* end file src/haswell/avx2_convert_utf8_to_latin1.cpp */
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
|
|
|
|
#if SIMDUTF_FEATURE_BASE64
|
|
/* begin file src/haswell/avx2_base64.cpp */
|
|
/**
|
|
* References and further reading:
|
|
*
|
|
* Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
|
|
* speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
|
|
* https://arxiv.org/abs/1910.05109
|
|
*
|
|
* Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
|
|
* Instructions, ACM Transactions on the Web 12 (3), 2018.
|
|
* https://arxiv.org/abs/1704.00605
|
|
*
|
|
* Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
|
|
* https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
|
|
* Request for Comments: 4648.
|
|
*
|
|
* Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
|
|
* http://www.alfredklomp.com/programming/sse-base64/. (2014).
|
|
*
|
|
* Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
|
|
* acceleration. https://github.com/aklomp/base64. (2014).
|
|
*
|
|
* Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
|
|
* https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
|
|
*
|
|
* Nick Kopp. 2013. Base64 Encoding on a GPU.
|
|
* https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
|
|
*/
|
|
|
|
template <bool base64_url>
|
|
simdutf_really_inline __m256i lookup_pshufb_improved(const __m256i input) {
|
|
// credit: Wojciech Muła
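  // For reference, the scalar mapping this implements: 6-bit values 0..25
  // map to 'A'..'Z' (add 65), 26..51 to 'a'..'z' (add 71), 52..61 to
  // '0'..'9' (subtract 4), 62 to '+' (or '-' for base64url) and 63 to '/'
  // (or '_'); the code below computes a per-byte index into shift_LUT so
  // that a single pshufb selects the offset to add.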
|
|
__m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51));
|
|
const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input);
|
|
result =
|
|
_mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13)));
|
|
__m256i shift_LUT;
|
|
if (base64_url) {
|
|
shift_LUT = _mm256_setr_epi8(
|
|
'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
|
|
'0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0,
|
|
|
|
'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
|
|
'0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0);
|
|
} else {
|
|
shift_LUT = _mm256_setr_epi8(
|
|
'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
|
|
'0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0,
|
|
|
|
'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
|
|
'0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
|
|
}
|
|
|
|
result = _mm256_shuffle_epi8(shift_LUT, result);
|
|
return _mm256_add_epi8(result, input);
|
|
}
|
|
|
|
template <bool isbase64url>
|
|
size_t encode_base64(char *dst, const char *src, size_t srclen,
|
|
base64_options options) {
|
|
// credit: Wojciech Muła
|
|
const uint8_t *input = (const uint8_t *)src;
|
|
|
|
uint8_t *out = (uint8_t *)dst;
|
|
const __m256i shuf =
|
|
_mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
|
|
|
|
10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
|
|
size_t i = 0;
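  // Each iteration consumes 96 input bytes, but the last 16-byte load starts
  // at offset i + 84 and reads through i + 99, hence the i + 100 bound.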
|
|
for (; i + 100 <= srclen; i += 96) {
|
|
const __m128i lo0 = _mm_loadu_si128(
|
|
reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 0));
|
|
const __m128i hi0 = _mm_loadu_si128(
|
|
reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 1));
|
|
const __m128i lo1 = _mm_loadu_si128(
|
|
reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 2));
|
|
const __m128i hi1 = _mm_loadu_si128(
|
|
reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 3));
|
|
const __m128i lo2 = _mm_loadu_si128(
|
|
reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 4));
|
|
const __m128i hi2 = _mm_loadu_si128(
|
|
reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 5));
|
|
const __m128i lo3 = _mm_loadu_si128(
|
|
reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 6));
|
|
const __m128i hi3 = _mm_loadu_si128(
|
|
reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 7));
|
|
|
|
__m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf);
|
|
__m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf);
|
|
__m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf);
|
|
__m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf);
|
|
|
|
const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00));
|
|
const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00));
|
|
const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00));
|
|
const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00));
|
|
|
|
const __m256i t1_0 =
|
|
_mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040));
|
|
const __m256i t1_1 =
|
|
_mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040));
|
|
const __m256i t1_2 =
|
|
_mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040));
|
|
const __m256i t1_3 =
|
|
_mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040));
|
|
|
|
const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0));
|
|
const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0));
|
|
const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0));
|
|
const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0));
|
|
|
|
const __m256i t3_0 =
|
|
_mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010));
|
|
const __m256i t3_1 =
|
|
_mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010));
|
|
const __m256i t3_2 =
|
|
_mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010));
|
|
const __m256i t3_3 =
|
|
_mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010));
|
|
|
|
const __m256i input0 = _mm256_or_si256(t1_0, t3_0);
|
|
const __m256i input1 = _mm256_or_si256(t1_1, t3_1);
|
|
const __m256i input2 = _mm256_or_si256(t1_2, t3_2);
|
|
const __m256i input3 = _mm256_or_si256(t1_3, t3_3);
|
|
|
|
_mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
|
|
lookup_pshufb_improved<isbase64url>(input0));
|
|
out += 32;
|
|
|
|
_mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
|
|
lookup_pshufb_improved<isbase64url>(input1));
|
|
out += 32;
|
|
|
|
_mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
|
|
lookup_pshufb_improved<isbase64url>(input2));
|
|
out += 32;
|
|
_mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
|
|
lookup_pshufb_improved<isbase64url>(input3));
|
|
out += 32;
|
|
}
|
|
for (; i + 28 <= srclen; i += 24) {
|
|
// lo = [xxxx|DDDC|CCBB|BAAA]
|
|
// hi = [xxxx|HHHG|GGFF|FEEE]
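    // Note: the hi load reads 16 bytes starting at i + 12, so it reaches
    // i + 27; this is why the loop requires i + 28 <= srclen.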
|
|
const __m128i lo =
|
|
_mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i));
|
|
const __m128i hi =
|
|
_mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i + 4 * 3));
|
|
|
|
// bytes from groups A, B and C are needed in separate 32-bit lanes
|
|
    // in = [0HHH|0GGG|0FFF|0EEE|0DDD|0CCC|0BBB|0AAA]
|
|
__m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf);
|
|
|
|
// this part is well commented in encode.sse.cpp
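    // In brief: t0/t1 isolate one 6-bit field per 16-bit word and shift it
    // right via a multiply-high, while t2/t3 isolate the other field and
    // shift it left via a multiply-low; OR-ing the results leaves one 6-bit
    // index in each output byte.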
|
|
|
|
const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
|
|
const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
|
|
const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
|
|
const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
|
|
const __m256i indices = _mm256_or_si256(t1, t3);
|
|
|
|
_mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
|
|
lookup_pshufb_improved<isbase64url>(indices));
|
|
out += 32;
|
|
}
|
|
return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i,
|
|
srclen - i, options);
|
|
}
|
|
|
|
static inline void compress(__m128i data, uint16_t mask, char *output) {
|
|
if (mask == 0) {
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(output), data);
|
|
return;
|
|
}
|
|
// this particular implementation was inspired by work done by @animetosho
|
|
  // we do it in two steps: first the low 8 bytes and then the high 8 bytes
|
|
uint8_t mask1 = uint8_t(mask); // least significant 8 bits
|
|
uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
|
|
// next line just loads the 64-bit values thintable_epi8[mask1] and
|
|
// thintable_epi8[mask2] into a 128-bit register, using only
|
|
// two instructions on most compilers.
|
|
|
|
__m128i shufmask = _mm_set_epi64x(tables::base64::thintable_epi8[mask2],
|
|
tables::base64::thintable_epi8[mask1]);
|
|
  // we increment the second half of the mask by 0x08
|
|
shufmask =
|
|
_mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
|
|
// this is the version "nearly pruned"
|
|
__m128i pruned = _mm_shuffle_epi8(data, shufmask);
|
|
// we still need to put the two halves together.
|
|
// we compute the popcount of the first half:
|
|
int pop1 = tables::base64::BitsSetTable256mul2[mask1];
|
|
  // then we load the corresponding mask: it writes out only the first
  // pop1 bytes from the first 8 bytes, and then fills in with the bytes
  // from the second 8 bytes, plus some filler at the end.
|
|
__m128i compactmask = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
|
|
tables::base64::pshufb_combine_table + pop1 * 8));
|
|
__m128i answer = _mm_shuffle_epi8(pruned, compactmask);
|
|
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer);
|
|
}
|
|
|
|
// --- decoding -----------------------------------------------
|
|
|
|
template <typename = void>
|
|
simdutf_really_inline void compress(__m256i data, uint32_t mask, char *output) {
|
|
if (mask == 0) {
|
|
_mm256_storeu_si256(reinterpret_cast<__m256i *>(output), data);
|
|
return;
|
|
}
|
|
compress(_mm256_castsi256_si128(data), uint16_t(mask), output);
|
|
compress(_mm256_extracti128_si256(data, 1), uint16_t(mask >> 16),
|
|
output + count_ones(~mask & 0xFFFF));
|
|
}
|
|
|
|
template <typename = void>
|
|
simdutf_really_inline void base64_decode(char *out, __m256i str) {
|
|
// credit: aqrit
|
|
const __m256i pack_shuffle =
|
|
_mm256_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
|
|
2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1);
|
|
const __m256i t0 = _mm256_maddubs_epi16(str, _mm256_set1_epi32(0x01400140));
|
|
const __m256i t1 = _mm256_madd_epi16(t0, _mm256_set1_epi32(0x00011000));
|
|
const __m256i t2 = _mm256_shuffle_epi8(t1, pack_shuffle);
|
|
|
|
// Store the output:
|
|
_mm_storeu_si128((__m128i *)out, _mm256_castsi256_si128(t2));
|
|
_mm_storeu_si128((__m128i *)(out + 12), _mm256_extracti128_si256(t2, 1));
|
|
}
|
|
|
|
template <typename = void>
|
|
simdutf_really_inline void base64_decode_block(char *out, const char *src) {
|
|
base64_decode(out,
|
|
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(src)));
|
|
base64_decode(out + 24, _mm256_loadu_si256(
|
|
reinterpret_cast<const __m256i *>(src + 32)));
|
|
}
|
|
|
|
template <typename = void>
|
|
simdutf_really_inline void base64_decode_block_safe(char *out,
|
|
const char *src) {
|
|
base64_decode(out,
|
|
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(src)));
|
|
char buffer[32]; // We enforce safety with a buffer.
|
|
base64_decode(
|
|
buffer, _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32)));
|
|
std::memcpy(out + 24, buffer, 24);
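  // Rationale: each base64_decode call produces 24 useful bytes, but its two
  // 16-byte stores touch 28 bytes of output, so near the end of the buffer
  // we decode into a stack buffer and copy back only the 24 valid bytes.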
|
|
}
|
|
|
|
// --- decoding - base64 class --------------------------------
|
|
|
|
class block64 {
|
|
__m256i chunks[2];
|
|
|
|
public:
|
|
  // The caller of this function is responsible for ensuring that at least 64
  // bytes are readable at src.
|
|
simdutf_really_inline block64(const char *src) {
|
|
chunks[0] = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
|
|
chunks[1] = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32));
|
|
}
|
|
|
|
  // The caller of this function is responsible for ensuring that at least 128
  // bytes (64 char16_t code units) are readable at src.
|
|
simdutf_really_inline block64(const char16_t *src) {
|
|
const auto m1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
|
|
const auto m2 =
|
|
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 16));
|
|
const auto m3 =
|
|
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32));
|
|
const auto m4 =
|
|
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 48));
|
|
|
|
const auto m1p = _mm256_permute2x128_si256(m1, m2, 0x20);
|
|
const auto m2p = _mm256_permute2x128_si256(m1, m2, 0x31);
|
|
const auto m3p = _mm256_permute2x128_si256(m3, m4, 0x20);
|
|
const auto m4p = _mm256_permute2x128_si256(m3, m4, 0x31);
|
|
|
|
chunks[0] = _mm256_packus_epi16(m1p, m2p);
|
|
chunks[1] = _mm256_packus_epi16(m3p, m4p);
|
|
}
|
|
|
|
simdutf_really_inline void copy_block(char *output) {
|
|
_mm256_storeu_si256(reinterpret_cast<__m256i *>(output), chunks[0]);
|
|
_mm256_storeu_si256(reinterpret_cast<__m256i *>(output + 32), chunks[1]);
|
|
}
|
|
|
|
// decode 64 bytes and output 48 bytes
|
|
simdutf_really_inline void base64_decode_block(char *out) {
|
|
base64_decode(out, chunks[0]);
|
|
base64_decode(out + 24, chunks[1]);
|
|
}
|
|
|
|
simdutf_really_inline void base64_decode_block_safe(char *out) {
|
|
base64_decode(out, chunks[0]);
|
|
char buffer[32]; // We enforce safety with a buffer.
|
|
base64_decode(buffer, chunks[1]);
|
|
std::memcpy(out + 24, buffer, 24);
|
|
}
|
|
|
|
template <bool base64_url, bool ignore_garbage>
|
|
simdutf_really_inline uint64_t to_base64_mask(uint64_t *error) {
|
|
uint32_t err0 = 0;
|
|
uint32_t err1 = 0;
|
|
uint64_t m0 = to_base64_mask<base64_url, ignore_garbage>(&chunks[0], &err0);
|
|
uint64_t m1 = to_base64_mask<base64_url, ignore_garbage>(&chunks[1], &err1);
|
|
if (!ignore_garbage) {
|
|
*error = err0 | ((uint64_t)err1 << 32);
|
|
}
|
|
return m0 | (m1 << 32);
|
|
}
|
|
|
|
template <bool base64_url, bool ignore_garbage>
|
|
simdutf_really_inline uint32_t to_base64_mask(__m256i *src, uint32_t *error) {
|
|
const __m256i ascii_space_tbl =
|
|
_mm256_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa,
|
|
0x0, 0xc, 0xd, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0x0, 0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0);
|
|
// credit: aqrit
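    // Sketch of the technique (our reading of aqrit's method, not original
    // commentary): each byte is hashed by combining a pshufb lookup with a
    // shifted copy of itself; delta_values then supplies the amount to add
    // to turn an ASCII base64 character into its 6-bit value, while
    // check_values flags bytes outside the base64 alphabet.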
|
|
__m256i delta_asso;
|
|
if (base64_url) {
|
|
delta_asso = _mm256_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF, 0x1, 0x1,
|
|
0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0,
|
|
0x0, 0x0, 0xF, 0x0, 0xF);
|
|
} else {
|
|
delta_asso = _mm256_setr_epi8(
|
|
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
|
|
0x00, 0x00, 0x0F, 0x00, 0x0F, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
|
0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
|
|
}
|
|
|
|
__m256i delta_values;
|
|
if (base64_url) {
|
|
delta_values = _mm256_setr_epi8(
|
|
0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
|
|
uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF), uint8_t(0xE0),
|
|
uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF),
|
|
uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3),
|
|
uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9));
|
|
} else {
|
|
delta_values = _mm256_setr_epi8(
|
|
int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), int8_t(0x04),
|
|
int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), int8_t(0x00),
|
|
int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
|
|
int8_t(0xB9), int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
|
|
int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9),
|
|
int8_t(0x00), int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF),
|
|
int8_t(0xB9), int8_t(0xB9));
|
|
}
|
|
|
|
__m256i check_asso;
|
|
if (base64_url) {
|
|
check_asso = _mm256_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
|
|
0x1, 0x3, 0x7, 0xB, 0xE, 0xB, 0x6, 0xD, 0x1,
|
|
0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x3,
|
|
0x7, 0xB, 0xE, 0xB, 0x6);
|
|
} else {
|
|
check_asso = _mm256_setr_epi8(
|
|
0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03,
|
|
0x07, 0x0B, 0x0B, 0x0B, 0x0F, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01,
|
|
0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
|
|
}
|
|
__m256i check_values;
|
|
if (base64_url) {
|
|
check_values = _mm256_setr_epi8(
|
|
uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
|
|
uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6), uint8_t(0xA6),
|
|
uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0, uint8_t(0x80),
|
|
0x0, uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
|
|
uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6),
|
|
uint8_t(0xA6), uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0,
|
|
uint8_t(0x80), 0x0, uint8_t(0x80));
|
|
} else {
|
|
check_values = _mm256_setr_epi8(
|
|
int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF),
|
|
int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), int8_t(0x86),
|
|
int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), int8_t(0x91),
|
|
int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
|
|
int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5),
|
|
int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80),
|
|
int8_t(0x91), int8_t(0x80));
|
|
}
|
|
const __m256i shifted = _mm256_srli_epi32(*src, 3);
|
|
const __m256i delta_hash =
|
|
_mm256_avg_epu8(_mm256_shuffle_epi8(delta_asso, *src), shifted);
|
|
const __m256i check_hash =
|
|
_mm256_avg_epu8(_mm256_shuffle_epi8(check_asso, *src), shifted);
|
|
const __m256i out =
|
|
_mm256_adds_epi8(_mm256_shuffle_epi8(delta_values, delta_hash), *src);
|
|
const __m256i chk =
|
|
_mm256_adds_epi8(_mm256_shuffle_epi8(check_values, check_hash), *src);
|
|
const int mask = _mm256_movemask_epi8(chk);
|
|
if (!ignore_garbage && mask) {
|
|
__m256i ascii_space =
|
|
_mm256_cmpeq_epi8(_mm256_shuffle_epi8(ascii_space_tbl, *src), *src);
|
|
*error = (mask ^ _mm256_movemask_epi8(ascii_space));
|
|
}
|
|
*src = out;
|
|
return (uint32_t)mask;
|
|
}
|
|
|
|
simdutf_really_inline uint64_t compress_block(uint64_t mask, char *output) {
|
|
if (is_power_of_two(mask)) {
|
|
return compress_block_single(mask, output);
|
|
}
|
|
|
|
uint64_t nmask = ~mask;
|
|
compress(chunks[0], uint32_t(mask), output);
|
|
compress(chunks[1], uint32_t(mask >> 32),
|
|
output + count_ones(nmask & 0xFFFFFFFF));
|
|
return count_ones(nmask);
|
|
}
|
|
|
|
simdutf_really_inline size_t compress_block_single(uint64_t mask,
|
|
char *output) {
|
|
const size_t pos64 = trailing_zeroes(mask);
|
|
const int8_t pos = pos64 & 0xf;
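    // mask has a single bit set: pos64 >> 4 selects which of the four
    // 16-byte lanes holds the byte to drop, and pos is its offset within
    // that lane; only that lane needs a shuffle, while later lanes are
    // simply stored one byte earlier.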
|
|
switch (pos64 >> 4) {
|
|
case 0b00: {
|
|
const __m128i lane0 = _mm256_extracti128_si256(chunks[0], 0);
|
|
const __m128i lane1 = _mm256_extracti128_si256(chunks[0], 1);
|
|
|
|
const __m128i v0 = _mm_set1_epi8(char(pos - 1));
|
|
const __m128i v1 =
|
|
_mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
|
const __m128i v2 = _mm_cmpgt_epi8(v1, v0);
|
|
const __m128i sh = _mm_sub_epi8(v1, v2);
|
|
const __m128i compressed = _mm_shuffle_epi8(lane0, sh);
|
|
|
|
_mm_storeu_si128((__m128i *)(output + 0 * 16), compressed);
|
|
_mm_storeu_si128((__m128i *)(output + 1 * 16 - 1), lane1);
|
|
_mm256_storeu_si256((__m256i *)(output + 2 * 16 - 1), chunks[1]);
|
|
} break;
|
|
case 0b01: {
|
|
const __m128i lane0 = _mm256_extracti128_si256(chunks[0], 0);
|
|
const __m128i lane1 = _mm256_extracti128_si256(chunks[0], 1);
|
|
_mm_storeu_si128((__m128i *)(output + 0 * 16), lane0);
|
|
|
|
const __m128i v0 = _mm_set1_epi8(char(pos - 1));
|
|
const __m128i v1 =
|
|
_mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
|
const __m128i v2 = _mm_cmpgt_epi8(v1, v0);
|
|
const __m128i sh = _mm_sub_epi8(v1, v2);
|
|
const __m128i compressed = _mm_shuffle_epi8(lane1, sh);
|
|
|
|
_mm_storeu_si128((__m128i *)(output + 1 * 16), compressed);
|
|
_mm256_storeu_si256((__m256i *)(output + 2 * 16 - 1), chunks[1]);
|
|
} break;
|
|
case 0b10: {
|
|
const __m128i lane2 = _mm256_extracti128_si256(chunks[1], 0);
|
|
const __m128i lane3 = _mm256_extracti128_si256(chunks[1], 1);
|
|
|
|
_mm256_storeu_si256((__m256i *)(output + 0 * 16), chunks[0]);
|
|
|
|
const __m128i v0 = _mm_set1_epi8(char(pos - 1));
|
|
const __m128i v1 =
|
|
_mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
|
const __m128i v2 = _mm_cmpgt_epi8(v1, v0);
|
|
const __m128i sh = _mm_sub_epi8(v1, v2);
|
|
const __m128i compressed = _mm_shuffle_epi8(lane2, sh);
|
|
|
|
_mm_storeu_si128((__m128i *)(output + 2 * 16), compressed);
|
|
_mm_storeu_si128((__m128i *)(output + 3 * 16 - 1), lane3);
|
|
} break;
|
|
case 0b11: {
|
|
const __m128i lane2 = _mm256_extracti128_si256(chunks[1], 0);
|
|
const __m128i lane3 = _mm256_extracti128_si256(chunks[1], 1);
|
|
|
|
_mm256_storeu_si256((__m256i *)(output + 0 * 16), chunks[0]);
|
|
_mm_storeu_si128((__m128i *)(output + 2 * 16), lane2);
|
|
|
|
const __m128i v0 = _mm_set1_epi8(char(pos - 1));
|
|
const __m128i v1 =
|
|
_mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
|
const __m128i v2 = _mm_cmpgt_epi8(v1, v0);
|
|
const __m128i sh = _mm_sub_epi8(v1, v2);
|
|
const __m128i compressed = _mm_shuffle_epi8(lane3, sh);
|
|
|
|
_mm_storeu_si128((__m128i *)(output + 3 * 16), compressed);
|
|
} break;
|
|
}
|
|
|
|
return 63;
|
|
}
|
|
};
|
|
/* end file src/haswell/avx2_base64.cpp */
|
|
#endif // SIMDUTF_FEATURE_BASE64
|
|
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdutf
|
|
|
|
/* begin file src/generic/buf_block_reader.h */
|
|
namespace simdutf {
|
|
namespace haswell {
|
|
namespace {
|
|
|
|
// Walks through a buffer in block-sized increments, loading the last part with
|
|
// spaces
|
|
template <size_t STEP_SIZE> struct buf_block_reader {
|
|
public:
|
|
simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
|
|
simdutf_really_inline size_t block_index();
|
|
simdutf_really_inline bool has_full_block() const;
|
|
simdutf_really_inline const uint8_t *full_block() const;
|
|
/**
|
|
* Get the last block, padded with spaces.
|
|
*
|
|
   * There will always be a last block, with at least 1 byte, unless len == 0
   * (in which case this function fills the buffer with spaces and returns 0).
   * In particular, if len == STEP_SIZE there will be 0 full_blocks and 1
   * remainder block with STEP_SIZE bytes and no spaces for padding.
|
|
*
|
|
* @return the number of effective characters in the last block.
|
|
*/
|
|
simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
|
|
simdutf_really_inline void advance();
|
|
|
|
private:
|
|
const uint8_t *buf;
|
|
const size_t len;
|
|
const size_t lenminusstep;
|
|
size_t idx;
|
|
};
|
|
|
|
// Routines to print masks and text for debugging bitmask operations
|
|
simdutf_unused static char *format_input_text_64(const uint8_t *text) {
|
|
static char *buf =
|
|
reinterpret_cast<char *>(malloc(sizeof(simd8x64<uint8_t>) + 1));
|
|
for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
|
|
buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
|
|
}
|
|
buf[sizeof(simd8x64<uint8_t>)] = '\0';
|
|
return buf;
|
|
}
|
|
|
|
// Routines to print masks and text for debugging bitmask operations
|
|
simdutf_unused static char *format_input_text(const simd8x64<uint8_t> &in) {
|
|
static char *buf =
|
|
reinterpret_cast<char *>(malloc(sizeof(simd8x64<uint8_t>) + 1));
|
|
in.store(reinterpret_cast<uint8_t *>(buf));
|
|
for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
|
|
if (buf[i] < ' ') {
|
|
buf[i] = '_';
|
|
}
|
|
}
|
|
buf[sizeof(simd8x64<uint8_t>)] = '\0';
|
|
return buf;
|
|
}
|
|
|
|
simdutf_unused static char *format_mask(uint64_t mask) {
|
|
static char *buf = reinterpret_cast<char *>(malloc(64 + 1));
|
|
for (size_t i = 0; i < 64; i++) {
|
|
buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
|
|
}
|
|
buf[64] = '\0';
|
|
return buf;
|
|
}
|
|
|
|
template <size_t STEP_SIZE>
|
|
simdutf_really_inline
|
|
buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len)
|
|
: buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE},
|
|
idx{0} {}
|
|
|
|
template <size_t STEP_SIZE>
|
|
simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() {
|
|
return idx;
|
|
}
|
|
|
|
template <size_t STEP_SIZE>
|
|
simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
|
|
return idx < lenminusstep;
|
|
}
|
|
|
|
template <size_t STEP_SIZE>
|
|
simdutf_really_inline const uint8_t *
|
|
buf_block_reader<STEP_SIZE>::full_block() const {
|
|
return &buf[idx];
|
|
}
|
|
|
|
template <size_t STEP_SIZE>
|
|
simdutf_really_inline size_t
|
|
buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
|
|
if (len == idx) {
|
|
return 0;
|
|
} // memcpy(dst, null, 0) will trigger an error with some sanitizers
|
|
std::memset(dst, 0x20,
|
|
              STEP_SIZE); // we std::memset the full STEP_SIZE because it is
                          // more efficient to write out 8 or 16 bytes at once.
|
|
std::memcpy(dst, buf + idx, len - idx);
|
|
return len - idx;
|
|
}
|
|
|
|
template <size_t STEP_SIZE>
|
|
simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
|
|
idx += STEP_SIZE;
|
|
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdutf
|
|
/* end file src/generic/buf_block_reader.h */
|
|
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
|
|
/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
|
|
namespace simdutf {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace utf8_validation {
|
|
|
|
using namespace simd;
|
|
|
|
simdutf_really_inline simd8<uint8_t>
|
|
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
|
|
// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
|
|
// Bit 1 = Too Long (ASCII followed by continuation)
|
|
// Bit 2 = Overlong 3-byte
|
|
// Bit 4 = Surrogate
|
|
// Bit 5 = Overlong 2-byte
|
|
// Bit 7 = Two Continuations
|
|
constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
|
|
// 11______ 11______
|
|
constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
|
|
constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
|
|
constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
|
|
constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
|
|
constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
|
|
constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
|
|
// 11110100 101_____
|
|
// 11110101 1001____
|
|
// 11110101 101_____
|
|
// 1111011_ 1001____
|
|
// 1111011_ 101_____
|
|
// 11111___ 1001____
|
|
// 11111___ 101_____
|
|
constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
|
|
// 11110101 1000____
|
|
// 1111011_ 1000____
|
|
// 11111___ 1000____
|
|
constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
|
|
|
|
const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
|
|
// 0_______ ________ <ASCII in byte 1>
|
|
TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
|
|
TOO_LONG,
|
|
// 10______ ________ <continuation in byte 1>
|
|
TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
|
|
// 1100____ ________ <two byte lead in byte 1>
|
|
TOO_SHORT | OVERLONG_2,
|
|
// 1101____ ________ <two byte lead in byte 1>
|
|
TOO_SHORT,
|
|
// 1110____ ________ <three byte lead in byte 1>
|
|
TOO_SHORT | OVERLONG_3 | SURROGATE,
|
|
// 1111____ ________ <four+ byte lead in byte 1>
|
|
TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
|
|
constexpr const uint8_t CARRY =
|
|
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1.
|
|
const simd8<uint8_t> byte_1_low =
|
|
(prev1 & 0x0F)
|
|
.lookup_16<uint8_t>(
|
|
// ____0000 ________
|
|
CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
|
|
// ____0001 ________
|
|
CARRY | OVERLONG_2,
|
|
// ____001_ ________
|
|
CARRY, CARRY,
|
|
|
|
// ____0100 ________
|
|
CARRY | TOO_LARGE,
|
|
// ____0101 ________
|
|
CARRY | TOO_LARGE | TOO_LARGE_1000,
|
|
// ____011_ ________
|
|
CARRY | TOO_LARGE | TOO_LARGE_1000,
|
|
CARRY | TOO_LARGE | TOO_LARGE_1000,
|
|
|
|
// ____1___ ________
|
|
CARRY | TOO_LARGE | TOO_LARGE_1000,
|
|
CARRY | TOO_LARGE | TOO_LARGE_1000,
|
|
CARRY | TOO_LARGE | TOO_LARGE_1000,
|
|
CARRY | TOO_LARGE | TOO_LARGE_1000,
|
|
CARRY | TOO_LARGE | TOO_LARGE_1000,
|
|
// ____1101 ________
|
|
CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
|
|
CARRY | TOO_LARGE | TOO_LARGE_1000,
|
|
CARRY | TOO_LARGE | TOO_LARGE_1000);
|
|
const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
|
|
// ________ 0_______ <ASCII in byte 2>
|
|
TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
|
|
TOO_SHORT, TOO_SHORT,
|
|
|
|
// ________ 1000____
|
|
TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
|
|
OVERLONG_4,
|
|
// ________ 1001____
|
|
TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
|
|
// ________ 101_____
|
|
TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
|
|
TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
|
|
|
|
// ________ 11______
|
|
TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
|
|
return (byte_1_high & byte_1_low & byte_2_high);
|
|
}
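// Worked example (illustrative): for the overlong two-byte sequence C0 80,
// byte_1_high[0xC], byte_1_low[0x0] and byte_2_high[0x8] all contain the
// OVERLONG_2 bit, so the AND in check_special_cases leaves OVERLONG_2 set
// and the sequence is reported as an error.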
|
|
simdutf_really_inline simd8<uint8_t>
|
|
check_multibyte_lengths(const simd8<uint8_t> input,
|
|
const simd8<uint8_t> prev_input,
|
|
const simd8<uint8_t> sc) {
|
|
simd8<uint8_t> prev2 = input.prev<2>(prev_input);
|
|
simd8<uint8_t> prev3 = input.prev<3>(prev_input);
|
|
simd8<uint8_t> must23 =
|
|
simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
|
|
simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
|
|
return must23_80 ^ sc;
|
|
}
|
|
|
|
//
|
|
// Return nonzero if there are incomplete multibyte characters at the end of the
|
|
// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end.
|
|
//
|
|
simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
|
|
// If the previous input's last 3 bytes match this, they're too short (they
|
|
// ended at EOF):
|
|
// ... 1111____ 111_____ 11______
|
|
static const uint8_t max_array[32] = {255,
|
|
255,
|
|
255,
|
|
255,
|
|
255,
|
|
255,
|
|
255,
|
|
255,
|
|
255,
|
|
255,
|
|
255,
|
|
255,
|
|
255,
|
|
255,
|
|
255,
|
|
255,
|
|
255,
|
|
255,
|
|
255,
|
|
255,
|
|
255,
|
|
255,
|
|
255,
|
|
255,
|
|
255,
|
|
255,
|
|
255,
|
|
255,
|
|
255,
|
|
0b11110000u - 1,
|
|
0b11100000u - 1,
|
|
0b11000000u - 1};
|
|
const simd8<uint8_t> max_value(
|
|
&max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]);
|
|
return input.gt_bits(max_value);
|
|
}
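// Example (illustrative): a block ending in E2 82 (a truncated three-byte
// sequence) is flagged by is_incomplete because 0xE2 exceeds the
// 0b11100000 - 1 threshold at the second-to-last position.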
|
|
|
|
struct utf8_checker {
|
|
// If this is nonzero, there has been a UTF-8 error.
|
|
simd8<uint8_t> error;
|
|
// The last input we received
|
|
simd8<uint8_t> prev_input_block;
|
|
// Whether the last input we received was incomplete (used for ASCII fast
|
|
// path)
|
|
simd8<uint8_t> prev_incomplete;
|
|
|
|
//
|
|
// Check whether the current bytes are valid UTF-8.
|
|
//
|
|
simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
|
|
const simd8<uint8_t> prev_input) {
|
|
// Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
|
|
// lead bytes (2, 3, 4-byte leads become large positive numbers instead of
|
|
// small negative numbers)
|
|
simd8<uint8_t> prev1 = input.prev<1>(prev_input);
|
|
simd8<uint8_t> sc = check_special_cases(input, prev1);
|
|
this->error |= check_multibyte_lengths(input, prev_input, sc);
|
|
}
|
|
|
|
// The only problem that can happen at EOF is that a multibyte character is
|
|
// too short or a byte value too large in the last bytes: check_special_cases
|
|
// only checks for bytes too large in the first of two bytes.
|
|
simdutf_really_inline void check_eof() {
|
|
// If the previous block had incomplete UTF-8 characters at the end, an
|
|
// ASCII block can't possibly finish them.
|
|
this->error |= this->prev_incomplete;
|
|
}
|
|
|
|
simdutf_really_inline void check_next_input(const simd8x64<uint8_t> &input) {
|
|
if (simdutf_likely(is_ascii(input))) {
|
|
this->error |= this->prev_incomplete;
|
|
} else {
|
|
// you might think that a for-loop would work, but under Visual Studio, it
|
|
// is not good enough.
|
|
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
|
|
(simd8x64<uint8_t>::NUM_CHUNKS == 4),
|
|
"We support either two or four chunks per 64-byte block.");
|
|
if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
} else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
|
|
this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
|
|
}
|
|
this->prev_incomplete =
|
|
is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]);
|
|
this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1];
|
|
}
|
|
}
|
|
|
|
// do not forget to call check_eof!
|
|
simdutf_really_inline bool errors() const {
|
|
return this->error.any_bits_set_anywhere();
|
|
}
|
|
|
|
}; // struct utf8_checker
|
|
} // namespace utf8_validation
|
|
|
|
using utf8_validation::utf8_checker;
|
|
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
|
|
/* begin file src/generic/utf8_validation/utf8_validator.h */
|
|
namespace simdutf {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace utf8_validation {
|
|
|
|
/**
|
|
* Validates that the string is actual UTF-8.
|
|
*/
|
|
template <class checker>
|
|
bool generic_validate_utf8(const uint8_t *input, size_t length) {
|
|
checker c{};
|
|
buf_block_reader<64> reader(input, length);
|
|
while (reader.has_full_block()) {
|
|
simd::simd8x64<uint8_t> in(reader.full_block());
|
|
c.check_next_input(in);
|
|
reader.advance();
|
|
}
|
|
uint8_t block[64]{};
|
|
reader.get_remainder(block);
|
|
simd::simd8x64<uint8_t> in(block);
|
|
c.check_next_input(in);
|
|
reader.advance();
|
|
c.check_eof();
|
|
return !c.errors();
|
|
}
|
|
|
|
bool generic_validate_utf8(const char *input, size_t length) {
|
|
return generic_validate_utf8<utf8_checker>(
|
|
reinterpret_cast<const uint8_t *>(input), length);
|
|
}
|
|
|
|
/**
|
|
* Validates that the string is actual UTF-8 and stops on errors.
|
|
*/
|
|
template <class checker>
|
|
result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) {
|
|
checker c{};
|
|
buf_block_reader<64> reader(input, length);
|
|
size_t count{0};
|
|
while (reader.has_full_block()) {
|
|
simd::simd8x64<uint8_t> in(reader.full_block());
|
|
c.check_next_input(in);
|
|
if (c.errors()) {
|
|
if (count != 0) {
|
|
count--;
|
|
} // Sometimes the error is only detected in the next chunk
|
|
result res = scalar::utf8::rewind_and_validate_with_errors(
|
|
reinterpret_cast<const char *>(input),
|
|
reinterpret_cast<const char *>(input + count), length - count);
|
|
res.count += count;
|
|
return res;
|
|
}
|
|
reader.advance();
|
|
count += 64;
|
|
}
|
|
uint8_t block[64]{};
|
|
reader.get_remainder(block);
|
|
simd::simd8x64<uint8_t> in(block);
|
|
c.check_next_input(in);
|
|
reader.advance();
|
|
c.check_eof();
|
|
if (c.errors()) {
|
|
if (count != 0) {
|
|
count--;
|
|
} // Sometimes the error is only detected in the next chunk
|
|
result res = scalar::utf8::rewind_and_validate_with_errors(
|
|
reinterpret_cast<const char *>(input),
|
|
reinterpret_cast<const char *>(input) + count, length - count);
|
|
res.count += count;
|
|
return res;
|
|
} else {
|
|
return result(error_code::SUCCESS, length);
|
|
}
|
|
}
|
|
|
|
result generic_validate_utf8_with_errors(const char *input, size_t length) {
|
|
return generic_validate_utf8_with_errors<utf8_checker>(
|
|
reinterpret_cast<const uint8_t *>(input), length);
|
|
}
|
|
|
|
} // namespace utf8_validation
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8_validation/utf8_validator.h */
|
|
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
|
|
|
|
#if SIMDUTF_FEATURE_ASCII
|
|
/* begin file src/generic/ascii_validation.h */
|
|
namespace simdutf {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace ascii_validation {
|
|
|
|
bool generic_validate_ascii(const char *input, size_t length) {
|
|
buf_block_reader<64> reader(reinterpret_cast<const uint8_t *>(input), length);
|
|
uint8_t blocks[64]{};
|
|
simd::simd8x64<uint8_t> running_or(blocks);
|
|
while (reader.has_full_block()) {
|
|
simd::simd8x64<uint8_t> in(reader.full_block());
|
|
running_or |= in;
|
|
reader.advance();
|
|
}
|
|
uint8_t block[64]{};
|
|
reader.get_remainder(block);
|
|
simd::simd8x64<uint8_t> in(block);
|
|
running_or |= in;
|
|
return running_or.is_ascii();
|
|
}
|
|
|
|
result generic_validate_ascii_with_errors(const char *input, size_t length) {
|
|
buf_block_reader<64> reader(reinterpret_cast<const uint8_t *>(input), length);
|
|
size_t count{0};
|
|
while (reader.has_full_block()) {
|
|
simd::simd8x64<uint8_t> in(reader.full_block());
|
|
if (!in.is_ascii()) {
|
|
result res = scalar::ascii::validate_with_errors(
|
|
reinterpret_cast<const char *>(input + count), length - count);
|
|
return result(res.error, count + res.count);
|
|
}
|
|
reader.advance();
|
|
|
|
count += 64;
|
|
}
|
|
uint8_t block[64]{};
|
|
reader.get_remainder(block);
|
|
simd::simd8x64<uint8_t> in(block);
|
|
if (!in.is_ascii()) {
|
|
result res = scalar::ascii::validate_with_errors(
|
|
reinterpret_cast<const char *>(input + count), length - count);
|
|
return result(res.error, count + res.count);
|
|
} else {
|
|
return result(error_code::SUCCESS, length);
|
|
}
|
|
}
|
|
|
|
} // namespace ascii_validation
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdutf
|
|
/* end file src/generic/ascii_validation.h */
|
|
#endif // SIMDUTF_FEATURE_ASCII
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
|
|
// transcoding from UTF-8 to UTF-16
|
|
/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
|
|
namespace simdutf {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace utf8_to_utf16 {
|
|
|
|
using namespace simd;
|
|
|
|
template <endianness endian>
|
|
simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
|
|
char16_t *utf16_output) noexcept {
|
|
// The implementation is not specific to haswell and should be moved to the
|
|
// generic directory.
|
|
size_t pos = 0;
|
|
char16_t *start{utf16_output};
|
|
const size_t safety_margin = 16; // to avoid overruns!
|
|
while (pos + 64 + safety_margin <= size) {
|
|
    // this loop could be unrolled further. For example, we could process
    // the mask over far more than 64 bytes.
|
|
simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
|
|
if (in.is_ascii()) {
|
|
in.store_ascii_as_utf16<endian>(utf16_output);
|
|
utf16_output += 64;
|
|
pos += 64;
|
|
} else {
|
|
// Slow path. We hope that the compiler will recognize that this is a slow
|
|
      // path. Anything that is not a continuation byte is a 'leading byte',
|
|
// that is, the start of a new code point.
|
|
uint64_t utf8_continuation_mask = in.lt(-65 + 1);
|
|
      // -65 is 0b10111111 in two's complement, i.e., the largest possible
      // continuation byte
|
|
uint64_t utf8_leading_mask = ~utf8_continuation_mask;
|
|
      // The *start* of code points is not so useful; rather, we want the *end*
|
|
// of code points.
|
|
uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
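      // Example (illustrative): for input bytes 41 C3 A9 ..., only bit 2 of
      // the continuation mask is set; inverting and shifting right by one
      // marks offsets 0 and 2 as ends of code points.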
|
|
// We process in blocks of up to 12 bytes except possibly
|
|
// for fast paths which may process up to 16 bytes. For the
|
|
// slow path to work, we should have at least 12 input bytes left.
|
|
size_t max_starting_point = (pos + 64) - 12;
|
|
      // The next loop is going to run at least five times when using solely
      // the slow/regular path, and at least four times if there are fast
      // paths.
|
|
while (pos < max_starting_point) {
|
|
// Performance note: our ability to compute 'consumed' and
|
|
// then shift and recompute is critical. If there is a
|
|
// latency of, say, 4 cycles on getting 'consumed', then
|
|
// the inner loop might have a total latency of about 6 cycles.
|
|
        // Yet we process between 6 and 12 input bytes, thus we get
|
|
// a speed limit between 1 cycle/byte and 0.5 cycle/byte
|
|
// for this section of the code. Hence, there is a limit
|
|
// to how much we can further increase this latency before
|
|
// it seriously harms performance.
|
|
//
|
|
// Thus we may allow convert_masked_utf8_to_utf16 to process
|
|
// more bytes at a time under a fast-path mode where 16 bytes
|
|
// are consumed at once (e.g., when encountering ASCII).
|
|
size_t consumed = convert_masked_utf8_to_utf16<endian>(
|
|
input + pos, utf8_end_of_code_point_mask, utf16_output);
|
|
pos += consumed;
|
|
utf8_end_of_code_point_mask >>= consumed;
|
|
}
|
|
// At this point there may remain between 0 and 12 bytes in the
|
|
// 64-byte block. These bytes will be processed again. So we have an
|
|
// 80% efficiency (in the worst case). In practice we expect an
|
|
// 85% to 90% efficiency.
|
|
}
|
|
}
|
|
utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(
|
|
input + pos, size - pos, utf16_output);
|
|
return utf16_output - start;
|
|
}
|
|
|
|
} // namespace utf8_to_utf16
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
|
|
/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
|
|
namespace simdutf {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace utf8_to_utf16 {
|
|
using namespace simd;
|
|
|
|
simdutf_really_inline simd8<uint8_t>
|
|
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
|
|
// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
|
|
// Bit 1 = Too Long (ASCII followed by continuation)
|
|
// Bit 2 = Overlong 3-byte
|
|
// Bit 4 = Surrogate
|
|
// Bit 5 = Overlong 2-byte
|
|
// Bit 7 = Two Continuations
|
|
constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
|
|
// 11______ 11______
|
|
constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
|
|
constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
|
|
constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
|
|
constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
|
|
constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
|
|
constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
|
|
// 11110100 101_____
|
|
// 11110101 1001____
|
|
// 11110101 101_____
|
|
// 1111011_ 1001____
|
|
// 1111011_ 101_____
|
|
// 11111___ 1001____
|
|
// 11111___ 101_____
|
|
constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
|
|
// 11110101 1000____
|
|
// 1111011_ 1000____
|
|
// 11111___ 1000____
|
|
constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
|
|
|
|
const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
|
|
// 0_______ ________ <ASCII in byte 1>
|
|
TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
|
|
TOO_LONG,
|
|
// 10______ ________ <continuation in byte 1>
|
|
TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
|
|
// 1100____ ________ <two byte lead in byte 1>
|
|
TOO_SHORT | OVERLONG_2,
|
|
// 1101____ ________ <two byte lead in byte 1>
|
|
TOO_SHORT,
|
|
// 1110____ ________ <three byte lead in byte 1>
|
|
TOO_SHORT | OVERLONG_3 | SURROGATE,
|
|
// 1111____ ________ <four+ byte lead in byte 1>
|
|
TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
|
|
constexpr const uint8_t CARRY =
|
|
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1.
|
|
const simd8<uint8_t> byte_1_low =
|
|
(prev1 & 0x0F)
|
|
.lookup_16<uint8_t>(
|
|
// ____0000 ________
|
|
CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
|
|
// ____0001 ________
|
|
CARRY | OVERLONG_2,
|
|
// ____001_ ________
|
|
CARRY, CARRY,
|
|
|
|
// ____0100 ________
|
|
CARRY | TOO_LARGE,
|
|
// ____0101 ________
|
|
CARRY | TOO_LARGE | TOO_LARGE_1000,
|
|
// ____011_ ________
|
|
CARRY | TOO_LARGE | TOO_LARGE_1000,
|
|
CARRY | TOO_LARGE | TOO_LARGE_1000,
|
|
|
|
// ____1___ ________
|
|
CARRY | TOO_LARGE | TOO_LARGE_1000,
|
|
CARRY | TOO_LARGE | TOO_LARGE_1000,
|
|
CARRY | TOO_LARGE | TOO_LARGE_1000,
|
|
CARRY | TOO_LARGE | TOO_LARGE_1000,
|
|
CARRY | TOO_LARGE | TOO_LARGE_1000,
|
|
// ____1101 ________
|
|
CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
|
|
CARRY | TOO_LARGE | TOO_LARGE_1000,
|
|
CARRY | TOO_LARGE | TOO_LARGE_1000);
|
|
const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
|
|
// ________ 0_______ <ASCII in byte 2>
|
|
TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
|
|
TOO_SHORT, TOO_SHORT,
|
|
|
|
// ________ 1000____
|
|
TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
|
|
OVERLONG_4,
|
|
// ________ 1001____
|
|
TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
|
|
// ________ 101_____
|
|
TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
|
|
TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
|
|
|
|
// ________ 11______
|
|
TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
|
|
return (byte_1_high & byte_1_low & byte_2_high);
|
|
}
|
|
simdutf_really_inline simd8<uint8_t>
|
|
check_multibyte_lengths(const simd8<uint8_t> input,
|
|
const simd8<uint8_t> prev_input,
|
|
const simd8<uint8_t> sc) {
|
|
simd8<uint8_t> prev2 = input.prev<2>(prev_input);
|
|
simd8<uint8_t> prev3 = input.prev<3>(prev_input);
|
|
simd8<uint8_t> must23 =
|
|
simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
|
|
simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
|
|
return must23_80 ^ sc;
|
|
}

struct validating_transcoder {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;

  validating_transcoder() : error(uint8_t(0)) {}
  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    simd8<uint8_t> sc = check_special_cases(input, prev1);
    this->error |= check_multibyte_lengths(input, prev_input, sc);
  }

  template <endianness endian>
  simdutf_really_inline size_t convert(const char *in, size_t size,
                                       char16_t *utf16_output) {
    size_t pos = 0;
    char16_t *start{utf16_output};
    // In the worst case, we have the haswell kernel which can cause an
    // overflow of 8 bytes when calling convert_masked_utf8_to_utf16. If you
    // skip the last 16 bytes, and if the data is valid, then it is entirely
    // safe because 16 UTF-8 bytes generate much more than 8 bytes. However,
    // you cannot generally assume that you have valid UTF-8 input, so we are
    // going to go back from the end counting 8 leading bytes, to give us a
    // good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then margin - 1 is the position of the
    // eighth-to-last leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf16<endian>(utf16_output);
        utf16_output += 64;
        pos += 64;
      } else {
        // You might think that a for-loop would work, but under Visual
        // Studio, it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (utf8_continuation_mask & 1) {
          return 0; // error
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
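        // For instance, given the bytes 'A' 0xE2 0x82 0xAC ..., the
        // continuation mask has ones at the positions of 0x82 and 0xAC, so
        // the leading mask has ones at 'A' and 0xE2; shifting it right by
        // one sets bit i exactly when byte i + 1 starts a new code point,
        // i.e. when byte i is the last byte of a code point.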
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 and 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf16<endian>(
              in + pos, utf8_end_of_code_point_mask, utf16_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      return 0;
    }
    if (pos < size) {
      size_t howmany = scalar::utf8_to_utf16::convert<endian>(
          in + pos, size - pos, utf16_output);
      if (howmany == 0) {
        return 0;
      }
      utf16_output += howmany;
    }
    return utf16_output - start;
  }

  template <endianness endian>
  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
                                                   char16_t *utf16_output) {
    size_t pos = 0;
    char16_t *start{utf16_output};
    // In the worst case, we have the haswell kernel which can cause an
    // overflow of 8 bytes when calling convert_masked_utf8_to_utf16. If you
    // skip the last 16 bytes, and if the data is valid, then it is entirely
    // safe because 16 UTF-8 bytes generate much more than 8 bytes. However,
    // you cannot generally assume that you have valid UTF-8 input, so we are
    // going to go back from the end counting 8 leading bytes, to give us a
    // good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then margin - 1 is the position of the
    // eighth-to-last leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf16<endian>(utf16_output);
        utf16_output += 64;
        pos += 64;
      } else {
        // You might think that a for-loop would work, but under Visual
        // Studio, it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (errors() || (utf8_continuation_mask & 1)) {
          // rewind_and_convert_with_errors will seek a potential error from
          // in+pos onward, with the ability to go back up to pos bytes, and
          // read size-pos bytes forward.
          result res =
              scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
                  pos, in + pos, size - pos, utf16_output);
          res.count += pos;
          return res;
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 and 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf16<endian>(
              in + pos, utf8_end_of_code_point_mask, utf16_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      // rewind_and_convert_with_errors will seek a potential error from
      // in+pos onward, with the ability to go back up to pos bytes, and read
      // size-pos bytes forward.
      result res =
          scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
              pos, in + pos, size - pos, utf16_output);
      res.count += pos;
      return res;
    }
    if (pos < size) {
      // rewind_and_convert_with_errors will seek a potential error from
      // in+pos onward, with the ability to go back up to pos bytes, and read
      // size-pos bytes forward.
      result res =
          scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
              pos, in + pos, size - pos, utf16_output);
      if (res.error) { // In case of error, we want the error position.
        res.count += pos;
        return res;
      } else { // In case of success, we want the number of words written.
        utf16_output += res.count;
      }
    }
    return result(error_code::SUCCESS, utf16_output - start);
  }

  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct validating_transcoder
} // namespace utf8_to_utf16
} // unnamed namespace
} // namespace haswell
} // namespace simdutf
/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
/* begin file src/generic/utf8/utf16_length_from_utf8_bytemask.h */
namespace simdutf {
namespace haswell {
namespace {
namespace utf8 {

using namespace simd;

simdutf_really_inline size_t utf16_length_from_utf8_bytemask(const char *in,
                                                             size_t size) {
  using vector_i8 = simd8<int8_t>;
  using vector_u8 = simd8<uint8_t>;
  using vector_u64 = simd64<uint64_t>;

  constexpr size_t N = vector_i8::SIZE;
  constexpr size_t max_iterations = 255 / 2;
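  // A note on the bound above: each iteration of the loop below adds at most
  // 2 to every 8-bit lane of `local` (one for a non-continuation byte, one
  // for a 4-byte lead), so flushing after 255 / 2 = 127 iterations keeps the
  // unsigned byte counters from overflowing.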

  auto counters = vector_u64::zero();
  auto local = vector_u8::zero();

  size_t iterations = 0;
  size_t pos = 0;
  size_t count = 0;
  for (; pos + N <= size; pos += N) {
    const auto input =
        vector_i8::load(reinterpret_cast<const int8_t *>(in + pos));

    const auto continuation = input > int8_t(-65);
    const auto utf_4bytes = vector_u8(input.value) >= uint8_t(240);

    local -= vector_u8(continuation);
    local -= vector_u8(utf_4bytes);
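    // Informally: every byte above -65 (i.e. not a 0b10xxxxxx continuation)
    // starts a code point and yields at least one UTF-16 code unit, and
    // every lead byte of 240 or more yields a second unit (a surrogate
    // pair). So 'a' counts 1, the 3-byte U+20AC counts 1, and a 4-byte
    // emoji counts 2.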

    iterations += 1;
    if (iterations == max_iterations) {
      counters += sum_8bytes(local);
      local = vector_u8::zero();
      iterations = 0;
    }
  }

  if (iterations > 0) {
    count += local.sum_bytes();
  }

  count += counters.sum();

  return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
}

} // namespace utf8
} // unnamed namespace
} // namespace haswell
} // namespace simdutf
/* end file src/generic/utf8/utf16_length_from_utf8_bytemask.h */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
// transcoding from UTF-8 to UTF-32
/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
namespace simdutf {
namespace haswell {
namespace {
namespace utf8_to_utf32 {

using namespace simd;

simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
                                         char32_t *utf32_output) noexcept {
  size_t pos = 0;
  char32_t *start{utf32_output};
  const size_t safety_margin = 16; // to avoid overruns!
  while (pos + 64 + safety_margin <= size) {
    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
    if (in.is_ascii()) {
      in.store_ascii_as_utf32(utf32_output);
      utf32_output += 64;
      pos += 64;
    } else {
      // -65 is 0b10111111 in two's complement, so it is the largest possible
      // continuation byte.
      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
      size_t max_starting_point = (pos + 64) - 12;
      while (pos < max_starting_point) {
        size_t consumed = convert_masked_utf8_to_utf32(
            input + pos, utf8_end_of_code_point_mask, utf32_output);
        pos += consumed;
        utf8_end_of_code_point_mask >>= consumed;
      }
    }
  }
  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos,
                                                       utf32_output);
  return utf32_output - start;
}

} // namespace utf8_to_utf32
} // unnamed namespace
} // namespace haswell
} // namespace simdutf
/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
namespace simdutf {
namespace haswell {
namespace {
namespace utf8_to_utf32 {
using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1.
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              CARRY | TOO_LARGE,
              // ____0101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____011_ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,

              // ____1___ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____1101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
simdutf_really_inline simd8<uint8_t>
check_multibyte_lengths(const simd8<uint8_t> input,
                        const simd8<uint8_t> prev_input,
                        const simd8<uint8_t> sc) {
  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
  simd8<uint8_t> must23 =
      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
  return must23_80 ^ sc;
}

struct validating_transcoder {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;

  validating_transcoder() : error(uint8_t(0)) {}
  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    simd8<uint8_t> sc = check_special_cases(input, prev1);
    this->error |= check_multibyte_lengths(input, prev_input, sc);
  }

  simdutf_really_inline size_t convert(const char *in, size_t size,
                                       char32_t *utf32_output) {
    size_t pos = 0;
    char32_t *start{utf32_output};
    // In the worst case, we have the haswell kernel which can cause an
    // overflow of 8 words when calling convert_masked_utf8_to_utf32. If you
    // skip the last 16 bytes, and if the data is valid, then it is entirely
    // safe because 16 UTF-8 bytes generate much more than 8 bytes. However,
    // you cannot generally assume that you have valid UTF-8 input, so we are
    // going to go back from the end counting 8 leading bytes, to give us a
    // good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then margin - 1 is the position of the
    // eighth-to-last leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf32(utf32_output);
        utf32_output += 64;
        pos += 64;
      } else {
        // You might think that a for-loop would work, but under Visual
        // Studio, it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (utf8_continuation_mask & 1) {
          return 0; // we have an error
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 and 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf32(
              in + pos, utf8_end_of_code_point_mask, utf32_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      return 0;
    }
    if (pos < size) {
      size_t howmany =
          scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
      if (howmany == 0) {
        return 0;
      }
      utf32_output += howmany;
    }
    return utf32_output - start;
  }

  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
                                                   char32_t *utf32_output) {
    size_t pos = 0;
    char32_t *start{utf32_output};
    // In the worst case, we have the haswell kernel which can cause an
    // overflow of 8 bytes when calling convert_masked_utf8_to_utf32. If you
    // skip the last 16 bytes, and if the data is valid, then it is entirely
    // safe because 16 UTF-8 bytes generate much more than 8 bytes. However,
    // you cannot generally assume that you have valid UTF-8 input, so we are
    // going to go back from the end counting 8 leading bytes, to give us a
    // good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then margin - 1 is the position of the
    // eighth-to-last leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf32(utf32_output);
        utf32_output += 64;
        pos += 64;
      } else {
        // You might think that a for-loop would work, but under Visual
        // Studio, it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (errors() || (utf8_continuation_mask & 1)) {
          result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
              pos, in + pos, size - pos, utf32_output);
          res.count += pos;
          return res;
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 and 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf32(
              in + pos, utf8_end_of_code_point_mask, utf32_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, utf32_output);
      res.count += pos;
      return res;
    }
    if (pos < size) {
      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, utf32_output);
      if (res.error) { // In case of error, we want the error position.
        res.count += pos;
        return res;
      } else { // In case of success, we want the number of words written.
        utf32_output += res.count;
      }
    }
    return result(error_code::SUCCESS, utf32_output - start);
  }

  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct validating_transcoder
} // namespace utf8_to_utf32
} // unnamed namespace
} // namespace haswell
} // namespace simdutf
/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
/* begin file src/generic/utf32.h */
#include <limits>

namespace simdutf {
namespace haswell {
namespace {
namespace utf32 {

template <typename T> T min(T a, T b) { return a <= b ? a : b; }

simdutf_really_inline size_t utf8_length_from_utf32(const char32_t *input,
                                                    size_t length) {
  using vector_u32 = simd32<uint32_t>;

  const char32_t *start = input;

  // we add up to three ones in a single iteration (see the vectorized loop in
  // section #2 below)
  const size_t max_increment = 3;

  const size_t N = vector_u32::ELEMENTS;

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
  const auto v_0000007f = vector_u32::splat(0x0000007f);
  const auto v_000007ff = vector_u32::splat(0x000007ff);
  const auto v_0000ffff = vector_u32::splat(0x0000ffff);
#else
  const auto v_ffffff80 = vector_u32::splat(0xffffff80);
  const auto v_fffff800 = vector_u32::splat(0xfffff800);
  const auto v_ffff0000 = vector_u32::splat(0xffff0000);
  const auto one = vector_u32::splat(1);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

  size_t counter = 0;

  // 1. vectorized loop unrolled 4 times
  {
    // we use a vector of uint32 counters, this is why this limit is used
    const size_t max_iterations =
        std::numeric_limits<uint32_t>::max() / (max_increment * 4);
    size_t blocks = length / (N * 4);
    length -= blocks * (N * 4);
    while (blocks != 0) {
      const size_t iterations = min(blocks, max_iterations);
      blocks -= iterations;

      simd32<uint32_t> acc = vector_u32::zero();
      for (size_t i = 0; i < iterations; i++) {
        const auto in0 = vector_u32(input + 0 * N);
        const auto in1 = vector_u32(input + 1 * N);
        const auto in2 = vector_u32(input + 2 * N);
        const auto in3 = vector_u32(input + 3 * N);

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
        acc -= as_vector_u32(in0 > v_0000007f);
        acc -= as_vector_u32(in1 > v_0000007f);
        acc -= as_vector_u32(in2 > v_0000007f);
        acc -= as_vector_u32(in3 > v_0000007f);

        acc -= as_vector_u32(in0 > v_000007ff);
        acc -= as_vector_u32(in1 > v_000007ff);
        acc -= as_vector_u32(in2 > v_000007ff);
        acc -= as_vector_u32(in3 > v_000007ff);

        acc -= as_vector_u32(in0 > v_0000ffff);
        acc -= as_vector_u32(in1 > v_0000ffff);
        acc -= as_vector_u32(in2 > v_0000ffff);
        acc -= as_vector_u32(in3 > v_0000ffff);
#else
        acc += min(one, in0 & v_ffffff80);
        acc += min(one, in1 & v_ffffff80);
        acc += min(one, in2 & v_ffffff80);
        acc += min(one, in3 & v_ffffff80);

        acc += min(one, in0 & v_fffff800);
        acc += min(one, in1 & v_fffff800);
        acc += min(one, in2 & v_fffff800);
        acc += min(one, in3 & v_fffff800);

        acc += min(one, in0 & v_ffff0000);
        acc += min(one, in1 & v_ffff0000);
        acc += min(one, in2 & v_ffff0000);
        acc += min(one, in3 & v_ffff0000);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

        input += 4 * N;
      }

      counter += acc.sum();
    }
  }

  // 2. vectorized loop for tail
  {
    const size_t max_iterations =
        std::numeric_limits<uint32_t>::max() / max_increment;
    size_t blocks = length / N;
    length -= blocks * N;
    while (blocks != 0) {
      const size_t iterations = min(blocks, max_iterations);
      blocks -= iterations;

      auto acc = vector_u32::zero();
      for (size_t i = 0; i < iterations; i++) {
        const auto in = vector_u32(input);

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
        acc -= as_vector_u32(in > v_0000007f);
        acc -= as_vector_u32(in > v_000007ff);
        acc -= as_vector_u32(in > v_0000ffff);
#else
        acc += min(one, in & v_ffffff80);
        acc += min(one, in & v_fffff800);
        acc += min(one, in & v_ffff0000);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

        input += N;
      }

      counter += acc.sum();
    }
  }

  const size_t consumed = input - start;
  if (consumed != 0) {
    // The vectorized loops above count only the extra bytes beyond the first
    // byte of each code point, which is why we add one byte per consumed
    // element at the end.
    counter += consumed;
  }
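  // Taken together, each element is charged one byte here plus zero to three
  // extra bytes from the loops: e.g. U+0041 totals 1 byte, U+00E9 totals 2,
  // U+20AC totals 3, and U+1F600 totals 4.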

  return counter + scalar::utf32::utf8_length_from_utf32(input, length);
}

} // namespace utf32
} // unnamed namespace
} // namespace haswell
} // namespace simdutf
/* end file src/generic/utf32.h */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

// other functions
#if SIMDUTF_FEATURE_UTF8
/* begin file src/generic/utf8.h */
namespace simdutf {
namespace haswell {
namespace {
namespace utf8 {

using namespace simd;

simdutf_really_inline size_t count_code_points(const char *in, size_t size) {
  size_t pos = 0;
  size_t count = 0;
  for (; pos + 64 <= size; pos += 64) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    uint64_t utf8_continuation_mask = input.gt(-65);
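    // Note: despite its name, this mask actually selects the bytes that are
    // *not* continuations (ASCII and lead bytes, i.e. anything other than
    // 0b10xxxxxx). Each code point has exactly one such byte, so counting
    // them counts code points: a 2-byte sequence contributes one lead and
    // one continuation, and only the lead is counted.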
    count += count_ones(utf8_continuation_mask);
  }
  return count + scalar::utf8::count_code_points(in + pos, size - pos);
}

#ifdef SIMDUTF_SIMD_HAS_BYTEMASK
simdutf_really_inline size_t count_code_points_bytemask(const char *in,
                                                        size_t size) {
  using vector_i8 = simd8<int8_t>;
  using vector_u8 = simd8<uint8_t>;
  using vector_u64 = simd64<uint64_t>;

  constexpr size_t N = vector_i8::SIZE;
  constexpr size_t max_iterations = 255 / 4;
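  // A note on the bound above: each of the four masks below can add at most
  // 1 to every 8-bit lane of `local` per iteration, so a lane grows by at
  // most 4; flushing after 255 / 4 = 63 iterations keeps the unsigned byte
  // counters from overflowing.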

  size_t pos = 0;
  size_t count = 0;

  auto counters = vector_u64::zero();
  auto local = vector_u8::zero();
  size_t iterations = 0;
  for (; pos + 4 * N <= size; pos += 4 * N) {
    const auto input0 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 0 * N));
    const auto input1 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 1 * N));
    const auto input2 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 2 * N));
    const auto input3 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 3 * N));
    const auto mask0 = input0 > int8_t(-65);
    const auto mask1 = input1 > int8_t(-65);
    const auto mask2 = input2 > int8_t(-65);
    const auto mask3 = input3 > int8_t(-65);

    local -= vector_u8(mask0);
    local -= vector_u8(mask1);
    local -= vector_u8(mask2);
    local -= vector_u8(mask3);

    iterations += 1;
    if (iterations == max_iterations) {
      counters += sum_8bytes(local);
      local = vector_u8::zero();
      iterations = 0;
    }
  }

  if (iterations > 0) {
    count += local.sum_bytes();
  }

  count += counters.sum();

  return count + scalar::utf8::count_code_points(in + pos, size - pos);
}
#endif // SIMDUTF_SIMD_HAS_BYTEMASK

simdutf_really_inline size_t utf16_length_from_utf8(const char *in,
                                                    size_t size) {
  size_t pos = 0;
  size_t count = 0;
  // This algorithm could no doubt be improved!
  for (; pos + 64 <= size; pos += 64) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    uint64_t utf8_continuation_mask = input.lt(-65 + 1);
    // We count one word for anything that is not a continuation (so
    // leading bytes).
    count += 64 - count_ones(utf8_continuation_mask);
    int64_t utf8_4byte = input.gteq_unsigned(240);
    count += count_ones(utf8_4byte);
  }
  return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
}

} // namespace utf8
} // unnamed namespace
} // namespace haswell
} // namespace simdutf
/* end file src/generic/utf8.h */
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF16
/* begin file src/generic/utf16.h */
namespace simdutf {
namespace haswell {
namespace {
namespace utf16 {

template <endianness big_endian>
simdutf_really_inline size_t count_code_points(const char16_t *in,
                                               size_t size) {
  size_t pos = 0;
  size_t count = 0;
  for (; pos < size / 32 * 32; pos += 32) {
    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
    if (!match_system(big_endian)) {
      input.swap_bytes();
    }
    uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
    count += count_ones(not_pair) / 2;
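    // Why this works: each 16-bit unit occupies two bits of the bitmask,
    // hence the division by two, and the units outside 0xDC00..0xDFFF are
    // exactly the standalone units plus the leading halves of surrogate
    // pairs, i.e. one per code point.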
  }
  return count +
         scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
}

template <endianness big_endian>
simdutf_really_inline size_t utf8_length_from_utf16(const char16_t *in,
                                                    size_t size) {
  size_t pos = 0;
  size_t count = 0;
  // This algorithm could no doubt be improved!
  for (; pos < size / 32 * 32; pos += 32) {
    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
    if (!match_system(big_endian)) {
      input.swap_bytes();
    }
    uint64_t ascii_mask = input.lteq(0x7F);
    uint64_t twobyte_mask = input.lteq(0x7FF);
    uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);

    size_t ascii_count = count_ones(ascii_mask) / 2;
    size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2;
    size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2;
    size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
    count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count +
             ascii_count;
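    // Per 16-bit unit, this charges 1 byte for ASCII, 2 bytes for the
    // 0x80..0x7FF range, 3 bytes for the remaining non-surrogates, and
    // 2 bytes per surrogate unit, so a full surrogate pair is charged the
    // expected 4 bytes.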
  }
  return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos,
                                                                   size - pos);
}

template <endianness big_endian>
simdutf_really_inline size_t utf32_length_from_utf16(const char16_t *in,
                                                     size_t size) {
  return count_code_points<big_endian>(in, size);
}

simdutf_really_inline void
change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) {
  size_t pos = 0;

  while (pos < size / 32 * 32) {
    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
    input.swap_bytes();
    input.store(reinterpret_cast<uint16_t *>(output));
    pos += 32;
    output += 32;
  }

  scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
}

} // namespace utf16
} // unnamed namespace
} // namespace haswell
} // namespace simdutf
/* end file src/generic/utf16.h */
/* begin file src/generic/utf16/utf8_length_from_utf16_bytemask.h */
namespace simdutf {
namespace haswell {
namespace {
namespace utf16 {

using namespace simd;

template <endianness big_endian>
simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in,
                                                             size_t size) {
  size_t pos = 0;

  using vector_u16 = simd16<uint16_t>;
  constexpr size_t N = vector_u16::ELEMENTS;

  const auto one = vector_u16::splat(1);

  auto v_count = vector_u16::zero();

  // each char16 yields at least one byte
  size_t count = size / N * N;

  // In a single iteration the per-lane increment is 0, 1 or 2, even though
  // we perform three additions.
  constexpr size_t max_iterations = 65535 / 2;
  size_t iteration = max_iterations;

  for (; pos < size / N * N; pos += N) {
    auto input = vector_u16::load(reinterpret_cast<const uint16_t *>(in + pos));
    if (!match_system(big_endian)) {
      input = input.swap_bytes();
    }
    // 0xd800 .. 0xdbff - low surrogate
    // 0xdc00 .. 0xdfff - high surrogate
    const auto is_surrogate = ((input & uint16_t(0xf800)) == uint16_t(0xd800));

    // c0 - chars that yield 2- or 3-byte UTF-8 codes
    const auto c0 = min(input & uint16_t(0xff80), one);

    // c1 - chars that yield 3-byte UTF-8 codes (including surrogates)
    const auto c1 = min(input & uint16_t(0xf800), one);

    /*
        Explanation of how the counting works.

        In the case of a non-surrogate character we count:
        * always 1 -- see how `count` is initialized above;
        * c0 = 1 if the current char yields 2 or 3 bytes;
        * c1 = 1 if the current char yields 3 bytes.

        Thus, we always have the correct count for the current char:
        1, 2 or 3 bytes.

        A trickier part is how we count surrogate pairs. Whenever we
        encounter a surrogate (low or high), we count it as 3 and then
        subtract 1 (`is_surrogate` is -1 or 0), so each surrogate char
        yields 2. A surrogate pair, that is a low surrogate followed by
        a high one, thus yields the expected 4 bytes.

        This also correctly handles the case where a low surrogate is
        processed by this loop while the high surrogate is counted by the
        scalar procedure. The scalar procedure uses exactly the approach
        described here, so for valid UTF-16 strings the total is always
        correct.
    */
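    // Numerically: 'A' counts 1 + 0 + 0 = 1; U+00E9 counts 1 + 1 + 0 = 2;
    // U+20AC counts 1 + 1 + 1 = 3; and each half of a surrogate pair counts
    // 1 + 1 + 1 - 1 = 2, giving 4 bytes for the pair.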
    v_count += c0;
    v_count += c1;
    v_count += vector_u16(is_surrogate);

    iteration -= 1;
    if (iteration == 0) {
      count += v_count.sum();
      v_count = vector_u16::zero();
      iteration = max_iterations;
    }
  }

  if (iteration > 0) {
    count += v_count.sum();
  }

  return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos,
                                                                   size - pos);
}

} // namespace utf16
} // unnamed namespace
} // namespace haswell
} // namespace simdutf
/* end file src/generic/utf16/utf8_length_from_utf16_bytemask.h */
#endif // SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
/* begin file src/generic/validate_utf16.h */
namespace simdutf {
namespace haswell {
namespace {
namespace utf16 {
/*
    UTF-16 validation
    --------------------------------------------------

    In UTF-16, code units in the range 0xD800 to 0xDFFF have special meaning.

    In a vectorized algorithm we want to examine the most significant
    nibble in order to select a fast path. If none of the highest nibbles
    is 0xD (13), then we are sure that the UTF-16 chunk in a vector
    register is valid.

    Let us analyze what we need to check if the nibble is 0xD. The
    value of the next nibble determines what we have:

    0xd000 .. 0xd7ff - a valid word
    0xd800 .. 0xdbff - low surrogate
    0xdc00 .. 0xdfff - high surrogate

    Other constraints we have to consider:
    - there must not be two consecutive low surrogates (0xd800 .. 0xdbff)
    - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff)
    - there must not be a sole low surrogate nor a sole high surrogate

    We are going to build three bitmasks based on the 3rd nibble:
    - V = valid word,
    - L = low surrogate (0xd800 .. 0xdbff)
    - H = high surrogate (0xdc00 .. 0xdfff)

      0   1   2   3   4   5   6   7    <--- word index
    [ V | L | H | L | H | V | V | L ]
      1   0   0   0   0   1   1   0    - V = valid masks
      0   1   0   1   0   0   0   1    - L = low surrogate
      0   0   1   0   1   0   0   0    - H = high surrogate

      1   0   0   0   0   1   1   0    V = valid masks
      0   1   0   1   0   0   0   0    a = L & (H >> 1)
      0   0   1   0   1   0   0   0    b = a << 1
      1   1   1   1   1   1   1   0    c = V | a | b
                                  ^
                                  the last bit can be zero; we just consume 7
                                  code units and recheck this word in the next
                                  iteration
*/
template <endianness big_endian>
const result validate_utf16_with_errors(const char16_t *input, size_t size) {
  if (simdutf_unlikely(size == 0)) {
    return result(error_code::SUCCESS, 0);
  }

  const char16_t *start = input;
  const char16_t *end = input + size;

  const auto v_d8 = simd8<uint8_t>::splat(0xd8);
  const auto v_f8 = simd8<uint8_t>::splat(0xf8);
  const auto v_fc = simd8<uint8_t>::splat(0xfc);
  const auto v_dc = simd8<uint8_t>::splat(0xdc);

  while (input + simd16<uint16_t>::SIZE * 2 < end) {
    // 0. Load data: since the validation takes into account only the higher
    // byte of each word, we compress the two vectors into one which
    // consists only of the higher bytes.
    auto in0 = simd16<uint16_t>(input);
    auto in1 =
        simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));

    // Function `utf16_gather_high_bytes` consumes two vectors of UTF-16
    // and yields a single vector having only the higher bytes of the
    // characters.
    const auto in = utf16_gather_high_bytes<big_endian>(in0, in1);

    // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
    const auto surrogates_wordmask = (in & v_f8) == v_d8;
    const uint16_t surrogates_bitmask =
        static_cast<uint16_t>(surrogates_wordmask.to_bitmask());
    if (surrogates_bitmask == 0x0000) {
      input += 16;
    } else {
      // 2. We have some surrogates that have to be distinguished:
      //    - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
      //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
      //
      // Fact: a high surrogate has the 11th bit set (3rd bit in the higher
      // byte).

      // V - non-surrogate code units
      //     V = not surrogates_wordmask
      const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);

      // H - word-mask for high surrogates: the six highest bits are 0b1101'11
      const auto vH = (in & v_fc) == v_dc;
      const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());

      // L - word mask for low surrogates
      //     L = not H and surrogates_wordmask
      const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);

      const uint16_t a = static_cast<uint16_t>(
          L & (H >> 1)); // A low surrogate must be followed by a high one.
                         // (A low surrogate placed in the last word of the
                         // register is an exception we handle.)
      const uint16_t b = static_cast<uint16_t>(
          a << 1); // Just mark that the opposite fact holds; thanks to that
                   // we have only two masks for the valid case.
      const uint16_t c = static_cast<uint16_t>(
          V | a | b); // Combine all the masks into the final one.

      if (c == 0xffff) {
        // The whole input register contains valid UTF-16, i.e.,
        // either single code units or proper surrogate pairs.
        input += 16;
      } else if (c == 0x7fff) {
        // The 15 lower code units of the input register contain valid
        // UTF-16. The 15th word may be either a low or a high surrogate.
        // In the next iteration we 1) check whether the low surrogate is
        // followed by a high one, and 2) reject a sole high surrogate.
        input += 15;
      } else {
        return result(error_code::SURROGATE, input - start);
      }
    }
  }

  return result(error_code::SUCCESS, input - start);
}

} // namespace utf16
} // unnamed namespace
} // namespace haswell
} // namespace simdutf
/* end file src/generic/validate_utf16.h */
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
// transcoding from UTF-8 to Latin 1
/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */
namespace simdutf {
namespace haswell {
namespace {
namespace utf8_to_latin1 {
using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // For UTF-8 to Latin 1, we can allow any ASCII character, and any
  // continuation byte, but the non-ASCII leading bytes must be 0b11000011 or
  // 0b11000010 and nothing else.
  //
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
  constexpr const uint8_t FORBIDDEN = 0xff;

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      FORBIDDEN,
      // 1110____ ________ <three byte lead in byte 1>
      FORBIDDEN,
      // 1111____ ________ <four+ byte lead in byte 1>
      FORBIDDEN);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1.
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              FORBIDDEN,
              // ____0101 ________
              FORBIDDEN,
              // ____011_ ________
              FORBIDDEN, FORBIDDEN,

              // ____1___ ________
              FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN,
              // ____1101 ________
              FORBIDDEN, FORBIDDEN, FORBIDDEN);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
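// As an informal check: the only multi-byte sequences that pass all three
// lookups are 0xC2 or 0xC3 followed by one continuation byte, i.e. exactly
// U+0080..U+00FF. For instance, 0xC4 0x80 (U+0100) is valid UTF-8 yet is
// rejected here: byte_1_low maps the low nibble 0x4 to FORBIDDEN (0xff), so
// the OVERLONG_2 bit survives the AND, correctly flagging a code point that
// falls outside Latin 1.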

struct validating_transcoder {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;

  validating_transcoder() : error(uint8_t(0)) {}
  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    this->error |= check_special_cases(input, prev1);
  }

  simdutf_really_inline size_t convert(const char *in, size_t size,
                                       char *latin1_output) {
    size_t pos = 0;
    char *start{latin1_output};
    // In the worst case, we have the haswell kernel which can cause an
    // overflow of 8 bytes when calling convert_masked_utf8_to_latin1. If you
    // skip the last 16 bytes, and if the data is valid, then it is entirely
    // safe because 16 UTF-8 bytes generate much more than 8 bytes. However,
    // you cannot generally assume that you have valid UTF-8 input, so we are
    // going to go back from the end counting 16 leading bytes, to give us a
    // good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 16; margin--) {
      leading_byte += (int8_t(in[margin - 1]) >
                       -65); // two's complement of -65 is 1011 1111 ...
    }
    // If the input is long enough, then margin - 1 is the position of the
    // sixteenth-to-last leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store((int8_t *)latin1_output);
        latin1_output += 64;
        pos += 64;
      } else {
        // You might think that a for-loop would work, but under Visual
        // Studio, it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask =
            input.lt(-65 + 1); // -64 is 1100 0000 in two's complement. Note:
                               // in this case, we also have ASCII to account
                               // for.
        if (utf8_continuation_mask & 1) {
          return 0; // error
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 and 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_latin1(
              in + pos, utf8_end_of_code_point_mask, latin1_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      return 0;
    }
    if (pos < size) {
      size_t howmany =
          scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output);
      if (howmany == 0) {
        return 0;
      }
      latin1_output += howmany;
    }
    return latin1_output - start;
  }

  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
                                                   char *latin1_output) {
    size_t pos = 0;
    char *start{latin1_output};
    // In the worst case, we have the haswell kernel which can cause an
    // overflow of 8 bytes when calling convert_masked_utf8_to_latin1. If you
    // skip the last 16 bytes, and if the data is valid, then it is entirely
    // safe because 16 UTF-8 bytes generate much more than 8 bytes. However,
    // you cannot generally assume that you have valid UTF-8 input, so we are
    // going to go back from the end counting 8 leading bytes, to give us a
    // good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then margin - 1 is the position of the
    // eighth-to-last leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store((int8_t *)latin1_output);
        latin1_output += 64;
        pos += 64;
      } else {
        // You might think that a for-loop would work, but under Visual
        // Studio, it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        if (errors()) {
          // rewind_and_convert_with_errors will seek a potential error from
          // in+pos onward, with the ability to go back up to pos bytes, and
          // read size-pos bytes forward.
          result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
              pos, in + pos, size - pos, latin1_output);
          res.count += pos;
          return res;
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 and 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_latin1(
              in + pos, utf8_end_of_code_point_mask, latin1_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      // rewind_and_convert_with_errors will seek a potential error from
      // in+pos onward, with the ability to go back up to pos bytes, and read
      // size-pos bytes forward.
      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, latin1_output);
      res.count += pos;
      return res;
    }
    if (pos < size) {
      // rewind_and_convert_with_errors will seek a potential error from
      // in+pos onward, with the ability to go back up to pos bytes, and read
      // size-pos bytes forward.
      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, latin1_output);
      if (res.error) { // In case of error, we want the error position.
        res.count += pos;
        return res;
      } else { // In case of success, we want the number of words written.
        latin1_output += res.count;
      }
    }
    return result(error_code::SUCCESS, latin1_output - start);
  }
|
|
|
|
simdutf_really_inline bool errors() const {
|
|
return this->error.any_bits_set_anywhere();
|
|
}
|
|
|
|
}; // struct utf8_checker
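// Usage sketch (illustrative only): the checker accumulates error bits as
// 64-byte blocks are validated, e.g.
//
//   utf8_checker checker{};
//   checker.check_utf8_bytes(chunk, previous_chunk);
//   if (checker.errors()) { /* fall back to the scalar path */ }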
} // namespace utf8_to_latin1
} // unnamed namespace
} // namespace haswell
} // namespace simdutf
/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */
/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
namespace simdutf {
namespace haswell {
namespace {
namespace utf8_to_latin1 {
using namespace simd;

simdutf_really_inline size_t convert_valid(const char *in, size_t size,
                                           char *latin1_output) {
  size_t pos = 0;
  char *start{latin1_output};
  // In the worst case, we have the haswell kernel which can cause an overflow
  // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
  // last 16 bytes, and if the data is valid, then it is entirely safe because
  // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
  // generally assume that you have valid UTF-8 input, so we are going to go
  // back from the end counting 8 leading bytes, to give us a good margin.
  size_t leading_byte = 0;
  size_t margin = size;
  for (; margin > 0 && leading_byte < 8; margin--) {
    leading_byte += (int8_t(in[margin - 1]) >
                     -65); // the two's complement of -65 is 1011 1111 ...
  }
  // If the input is long enough, then margin - 1 is the position of the
  // eighth-to-last leading byte.
  const size_t safety_margin = size - margin + 1; // to avoid overruns!
  while (pos + 64 + safety_margin <= size) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    if (input.is_ascii()) {
      input.store((int8_t *)latin1_output);
      latin1_output += 64;
      pos += 64;
    } else {
      // you might think that a for-loop would work, but under Visual Studio,
      // it is not good enough.
      uint64_t utf8_continuation_mask =
          input.lt(-65 + 1); // -64 is 1100 0000 in two's complement. Note: in
                             // this case, we also have ASCII to account for.
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
      // We process in blocks of up to 12 bytes except possibly
      // for fast paths which may process up to 16 bytes. For the
      // slow path to work, we should have at least 12 input bytes left.
      size_t max_starting_point = (pos + 64) - 12;
      // Next loop is going to run at least five times.
      while (pos < max_starting_point) {
        // Performance note: our ability to compute 'consumed' and
        // then shift and recompute is critical. If there is a
        // latency of, say, 4 cycles on getting 'consumed', then
        // the inner loop might have a total latency of about 6 cycles.
        // Yet we process between 6 and 12 input bytes, thus we get
        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
        // for this section of the code. Hence, there is a limit
        // to how much we can further increase this latency before
        // it seriously harms performance.
        size_t consumed = convert_masked_utf8_to_latin1(
            in + pos, utf8_end_of_code_point_mask, latin1_output);
        pos += consumed;
        utf8_end_of_code_point_mask >>= consumed;
      }
      // At this point there may remain between 0 and 12 bytes in the
      // 64-byte block. These bytes will be processed again. So we have an
      // 80% efficiency (in the worst case). In practice we expect an
      // 85% to 90% efficiency.
    }
  }
  if (pos < size) {
    size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos,
                                                           latin1_output);
    latin1_output += howmany;
  }
  return latin1_output - start;
}

} // namespace utf8_to_latin1
} // namespace
} // namespace haswell
} // namespace simdutf
/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
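// Usage sketch (illustrative only, not from the original source): converting
// UTF-8 that is already known to be valid, with the output sized by an upper
// bound.
//
//   std::string utf8 = "caf\xC3\xA9";                 // "café", 5 bytes
//   std::vector<char> out(utf8.size());               // Latin-1 never grows
//   size_t written = utf8_to_latin1::convert_valid(
//       utf8.data(), utf8.size(), out.data());
//   // written == 4 and out now begins with "caf\xE9"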
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
/* begin file src/generic/validate_utf32.h */
namespace simdutf {
namespace haswell {
namespace {
namespace utf32 {

simdutf_really_inline bool validate(const char32_t *input, size_t size) {
  if (simdutf_unlikely(size == 0)) {
    // empty input is valid UTF-32. protect the implementation from
    // handling nullptr
    return true;
  }

  const char32_t *end = input + size;

  using vector_u32 = simd32<uint32_t>;

  const auto standardmax = vector_u32::splat(0x10ffff);
  const auto offset = vector_u32::splat(0xffff2000);
  const auto standardoffsetmax = vector_u32::splat(0xfffff7ff);
  auto currentmax = vector_u32::zero();
  auto currentoffsetmax = vector_u32::zero();

  constexpr size_t N = vector_u32::ELEMENTS;

  while (input + N < end) {
    auto in = vector_u32(input);
    if (!match_system(endianness::BIG)) {
      in.swap_bytes();
    }

    currentmax = max(currentmax, in);
    currentoffsetmax = max(currentoffsetmax, in + offset);
    input += N;
  }

  const auto too_large = currentmax > standardmax;
  if (too_large.any()) {
    return false;
  }

  const auto surrogate = currentoffsetmax > standardoffsetmax;
  if (surrogate.any()) {
    return false;
  }

  return scalar::utf32::validate(input, end - input);
}
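// How the surrogate test above works (worked example, not part of the
// original comments): adding the offset 0xffff2000 wraps exactly the
// surrogate range into the top of the 32-bit space:
//   0xd800 + 0xffff2000 == 0xfffff800 > 0xfffff7ff (flagged)
//   0xdfff + 0xffff2000 == 0xffffffff > 0xfffff7ff (flagged)
//   0xd7ff + 0xffff2000 == 0xfffff7ff              (not flagged)
//   0xe000 + 0xffff2000 wraps to 0x00000000        (not flagged)
// so tracking the maximum of (in + offset) detects any surrogate code point.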

simdutf_really_inline result validate_with_errors(const char32_t *input,
                                                  size_t size) {
  if (simdutf_unlikely(size == 0)) {
    // empty input is valid UTF-32. protect the implementation from
    // handling nullptr
    return result(error_code::SUCCESS, 0);
  }

  const char32_t *start = input;
  const char32_t *end = input + size;

  using vector_u32 = simd32<uint32_t>;

  const auto standardmax = vector_u32::splat(0x10ffff + 1);
  const auto surrogate_mask = vector_u32::splat(0xfffff800);
  const auto surrogate_byte = vector_u32::splat(0x0000d800);

  constexpr size_t N = vector_u32::ELEMENTS;

  while (input + N < end) {
    auto in = vector_u32(input);
    if (!match_system(endianness::BIG)) {
      in.swap_bytes();
    }

    const auto too_large = in >= standardmax;
    const auto surrogate = (in & surrogate_mask) == surrogate_byte;

    const auto combined = too_large | surrogate;
    if (simdutf_unlikely(combined.any())) {
      const size_t consumed = input - start;
      auto sr = scalar::utf32::validate_with_errors(input, end - input);
      sr.count += consumed;

      return sr;
    }

    input += N;
  }

  const size_t consumed = input - start;
  auto sr = scalar::utf32::validate_with_errors(input, end - input);
  sr.count += consumed;

  return sr;
}

} // namespace utf32
} // unnamed namespace
} // namespace haswell
} // namespace simdutf
/* end file src/generic/validate_utf32.h */
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_BASE64
/* begin file src/generic/base64.h */
/**
 * References and further reading:
 *
 * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
 * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
 * https://arxiv.org/abs/1910.05109
 *
 * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
 * Instructions, ACM Transactions on the Web 12 (3), 2018.
 * https://arxiv.org/abs/1704.00605
 *
 * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
 * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
 * Request for Comments: 4648.
 *
 * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
 * http://www.alfredklomp.com/programming/sse-base64/. (2014).
 *
 * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
 * acceleration. https://github.com/aklomp/base64. (2014).
 *
 * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
 * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
 *
 * Nick Kopp. 2013. Base64 Encoding on a GPU.
 * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
 */
namespace simdutf {
namespace haswell {
namespace {
namespace base64 {

/*
  The following template function implements the API for Base64 decoding.

  An implementation is responsible for providing the `block64` type and
  associated methods that perform the actual conversion. Please refer
  to any vectorized implementation to learn the API of these procedures.
*/
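// Sketch of the contract assumed of `block64` (the names come from the calls
// below; the signatures here are illustrative, not normative):
//
//   struct block64 {
//     block64(const chartype *src);              // load 64 input characters
//     template <bool url, bool ignore_garbage>
//     uint64_t to_base64_mask(uint64_t *error);  // map to 6-bit values and
//                                                // flag invalid characters
//     size_t compress_block(uint64_t mask, char *out); // drop flagged chars
//     void copy_block(char *out);
//     void base64_decode_block(char *out);       // 64 chars -> 48 bytes
//     void base64_decode_block_safe(char *out);  // bounds-careful variant
//   };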
template <bool base64_url, bool ignore_garbage, typename chartype>
full_result
compress_decode_base64(char *dst, const chartype *src, size_t srclen,
                       base64_options options,
                       last_chunk_handling_options last_chunk_options) {
  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
                                        : tables::base64::to_base64_value;
  size_t equallocation =
      srclen; // location of the first padding character if any
  // skip trailing spaces
  while (!ignore_garbage && srclen > 0 &&
         scalar::base64::is_eight_byte(src[srclen - 1]) &&
         to_base64[uint8_t(src[srclen - 1])] == 64) {
    srclen--;
  }
  size_t equalsigns = 0;
  if (!ignore_garbage && srclen > 0 && src[srclen - 1] == '=') {
    equallocation = srclen - 1;
    srclen--;
    equalsigns = 1;
    // skip trailing spaces
    while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
           to_base64[uint8_t(src[srclen - 1])] == 64) {
      srclen--;
    }
    if (srclen > 0 && src[srclen - 1] == '=') {
      equallocation = srclen - 1;
      srclen--;
      equalsigns = 2;
    }
  }
  if (srclen == 0) {
    if (!ignore_garbage && equalsigns > 0) {
      if (last_chunk_options == last_chunk_handling_options::strict) {
        return {BASE64_INPUT_REMAINDER, 0, 0};
      } else if (last_chunk_options ==
                 last_chunk_handling_options::stop_before_partial) {
        return {SUCCESS, 0, 0};
      }
      return {INVALID_BASE64_CHARACTER, equallocation, 0};
    }
    return {SUCCESS, 0, 0};
  }
  char *end_of_safe_64byte_zone =
      (srclen + 3) / 4 * 3 >= 63 ? dst + (srclen + 3) / 4 * 3 - 63 : dst;

  const chartype *const srcinit = src;
  const char *const dstinit = dst;
  const chartype *const srcend = src + srclen;

  constexpr size_t block_size = 6;
  static_assert(block_size >= 2, "block_size must be at least two");
  char buffer[block_size * 64];
  char *bufferptr = buffer;
  if (srclen >= 64) {
    const chartype *const srcend64 = src + srclen - 64;
    while (src <= srcend64) {
      block64 b(src);
      src += 64;
      uint64_t error = 0;
      const uint64_t badcharmask =
          b.to_base64_mask<base64_url, ignore_garbage>(&error);
      if (!ignore_garbage && error) {
        src -= 64;
        const size_t error_offset = trailing_zeroes(error);
        return {error_code::INVALID_BASE64_CHARACTER,
                size_t(src - srcinit + error_offset), size_t(dst - dstinit)};
      }
      if (badcharmask != 0) {
        bufferptr += b.compress_block(badcharmask, bufferptr);
      } else if (bufferptr != buffer) {
        b.copy_block(bufferptr);
        bufferptr += 64;
      } else {
        if (dst >= end_of_safe_64byte_zone) {
          b.base64_decode_block_safe(dst);
        } else {
          b.base64_decode_block(dst);
        }
        dst += 48;
      }
      if (bufferptr >= (block_size - 1) * 64 + buffer) {
        for (size_t i = 0; i < (block_size - 2); i++) {
          base64_decode_block(dst, buffer + i * 64);
          dst += 48;
        }
        if (dst >= end_of_safe_64byte_zone) {
          base64_decode_block_safe(dst, buffer + (block_size - 2) * 64);
        } else {
          base64_decode_block(dst, buffer + (block_size - 2) * 64);
        }
        dst += 48;
        std::memcpy(buffer, buffer + (block_size - 1) * 64,
                    64); // 64 might be too much
        bufferptr -= (block_size - 1) * 64;
      }
    }
  }
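  // Why end_of_safe_64byte_zone is safe (informal sketch, not from the
  // original comments): (srclen + 3) / 4 * 3 bounds the decoded size, which
  // the caller must be able to receive. A full 64-character block emits 48
  // bytes but may store a little past them, so while dst is at least 63
  // bytes short of that bound the full-width store is fine, and later blocks
  // switch to the *_safe variant.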

  char *buffer_start = buffer;
  // Optimization note: if this is almost full, then it is worth our
  // time, otherwise, we should just decode directly.
  int last_block = (int)((bufferptr - buffer_start) % 64);
  if (last_block != 0 && srcend - src + last_block >= 64) {

    while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
      uint8_t val = to_base64[uint8_t(*src)];
      *bufferptr = char(val);
      if (!ignore_garbage &&
          (!scalar::base64::is_eight_byte(*src) || val > 64)) {
        return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
                size_t(dst - dstinit)};
      }
      bufferptr += (val <= 63);
      src++;
    }
  }

  for (; buffer_start + 64 <= bufferptr; buffer_start += 64) {
    if (dst >= end_of_safe_64byte_zone) {
      base64_decode_block_safe(dst, buffer_start);
    } else {
      base64_decode_block(dst, buffer_start);
    }
    dst += 48;
  }
  if ((bufferptr - buffer_start) % 64 != 0) {
    while (buffer_start + 4 < bufferptr) {
      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
                        << 8;
#if !SIMDUTF_IS_BIG_ENDIAN
      triple = scalar::u32_swap_bytes(triple);
#endif
      std::memcpy(dst, &triple, 3);

      dst += 3;
      buffer_start += 4;
    }
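    // Worked example of the packing above (illustrative): the sextets of
    // "TWFu" are 19, 22, 5 and 46, so the 24-bit word is
    // (19 << 18) | (22 << 12) | (5 << 6) | 46 == 0x4D616E, stored through
    // the byte swap as the three bytes 'M', 'a', 'n'.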
    if (buffer_start + 4 <= bufferptr) {
      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
                        << 8;
#if !SIMDUTF_IS_BIG_ENDIAN
      triple = scalar::u32_swap_bytes(triple);
#endif
      std::memcpy(dst, &triple, 3);

      dst += 3;
      buffer_start += 4;
    }
    // we may have 1, 2 or 3 bytes left and we need to decode them so let us
    // backtrack
    int leftover = int(bufferptr - buffer_start);
    while (leftover > 0) {
      if (!ignore_garbage) {
        while (to_base64[uint8_t(*(src - 1))] == 64) {
          src--;
        }
      } else {
        while (to_base64[uint8_t(*(src - 1))] >= 64) {
          src--;
        }
      }
      src--;
      leftover--;
    }
  }
  if (src < srcend + equalsigns) {
    full_result r = scalar::base64::base64_tail_decode(
        dst, src, srcend - src, equalsigns, options, last_chunk_options);
    r.input_count += size_t(src - srcinit);
    if (r.error == error_code::INVALID_BASE64_CHARACTER ||
        r.error == error_code::BASE64_EXTRA_BITS) {
      return r;
    } else {
      r.output_count += size_t(dst - dstinit);
    }
    if (!ignore_garbage && last_chunk_options != stop_before_partial &&
        r.error == error_code::SUCCESS && equalsigns > 0) {
      // additional checks
      if ((r.output_count % 3 == 0) ||
          ((r.output_count % 3) + 1 + equalsigns != 4)) {
        r.error = error_code::INVALID_BASE64_CHARACTER;
        r.input_count = equallocation;
      }
    }
    return r;
  }
  if (!ignore_garbage && equalsigns > 0) {
    if ((size_t(dst - dstinit) % 3 == 0) ||
        ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
      return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)};
    }
  }
  return {SUCCESS, srclen, size_t(dst - dstinit)};
}

} // namespace base64
} // unnamed namespace
} // namespace haswell
} // namespace simdutf
/* end file src/generic/base64.h */
#endif // SIMDUTF_FEATURE_BASE64
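// Padding consistency rule enforced above (worked examples, not from the
// original comments): a final quantum of 2 characters yields 1 byte and
// needs two '=' (output % 3 == 1, equalsigns == 2, e.g. "QQ==" -> "A");
// 3 characters yield 2 bytes and need one '=' (output % 3 == 2,
// equalsigns == 1, e.g. "QUI=" -> "AB"); any '=' after a complete quantum
// (output % 3 == 0) is rejected.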

namespace simdutf {
namespace haswell {

#if SIMDUTF_FEATURE_DETECT_ENCODING
simdutf_warn_unused int
implementation::detect_encodings(const char *input,
                                 size_t length) const noexcept {
  // If there is a BOM, then we trust it.
  auto bom_encoding = simdutf::BOM::check_bom(input, length);
  if (bom_encoding != encoding_type::unspecified) {
    return bom_encoding;
  }

  int out = 0;
  uint32_t utf16_err = (length % 2);
  uint32_t utf32_err = (length % 4);
  uint32_t ends_with_high = 0;
  const auto v_d8 = simd8<uint8_t>::splat(0xd8);
  const auto v_f8 = simd8<uint8_t>::splat(0xf8);
  const auto v_fc = simd8<uint8_t>::splat(0xfc);
  const auto v_dc = simd8<uint8_t>::splat(0xdc);
  const __m256i standardmax = _mm256_set1_epi32(0x10ffff);
  const __m256i offset = _mm256_set1_epi32(0xffff2000);
  const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff);
  __m256i currentmax = _mm256_setzero_si256();
  __m256i currentoffsetmax = _mm256_setzero_si256();

  utf8_checker c{};
  buf_block_reader<64> reader(reinterpret_cast<const uint8_t *>(input), length);
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    // utf8 checks
    c.check_next_input(in);

    // utf16le checks
    auto in0 = simd16<uint16_t>(in.chunks[0]);
    auto in1 = simd16<uint16_t>(in.chunks[1]);
    const auto t0 = in0.shr<8>();
    const auto t1 = in1.shr<8>();
    const auto in2 = simd16<uint16_t>::pack(t0, t1);
    const auto surrogates_wordmask = (in2 & v_f8) == v_d8;
    const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask();
    const auto vL = (in2 & v_fc) == v_dc;
    const uint32_t L = vL.to_bitmask();
    const uint32_t H = L ^ surrogates_bitmask;
    utf16_err |= (((H << 1) | ends_with_high) != L);
    ends_with_high = (H & 0x80000000) != 0;

    // utf32le checks
    currentmax = _mm256_max_epu32(in.chunks[0], currentmax);
    currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in.chunks[0], offset),
                                        currentoffsetmax);
    currentmax = _mm256_max_epu32(in.chunks[1], currentmax);
    currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in.chunks[1], offset),
                                        currentoffsetmax);

    reader.advance();
  }
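  // Pairing check in the loop above (informal sketch): among the packed high
  // bytes, surrogates_bitmask marks all surrogates (0xd800-0xdfff) and L
  // marks low surrogates (0xdc00-0xdfff), so H = L ^ surrogates_bitmask
  // marks high surrogates. Requiring ((H << 1) | ends_with_high) == L forces
  // every high surrogate to be immediately followed by a low one, with
  // ends_with_high carrying the obligation across 64-byte blocks.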

  uint8_t block[64]{};
  size_t idx = reader.block_index();
  std::memcpy(block, &input[idx], length - idx);
  simd::simd8x64<uint8_t> in(block);
  c.check_next_input(in);

  // utf16le last block check
  auto in0 = simd16<uint16_t>(in.chunks[0]);
  auto in1 = simd16<uint16_t>(in.chunks[1]);
  const auto t0 = in0.shr<8>();
  const auto t1 = in1.shr<8>();
  const auto in2 = simd16<uint16_t>::pack(t0, t1);
  const auto surrogates_wordmask = (in2 & v_f8) == v_d8;
  const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask();
  const auto vL = (in2 & v_fc) == v_dc;
  const uint32_t L = vL.to_bitmask();
  const uint32_t H = L ^ surrogates_bitmask;
  utf16_err |= (((H << 1) | ends_with_high) != L);
  // this is required to catch an input that ends with an unpaired high
  // surrogate once the end of the input is reached
  ends_with_high = (H & 0x80000000) != 0;
  utf16_err |= ends_with_high;

  // utf32le last block check
  currentmax = _mm256_max_epu32(in.chunks[0], currentmax);
  currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in.chunks[0], offset),
                                      currentoffsetmax);
  currentmax = _mm256_max_epu32(in.chunks[1], currentmax);
  currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in.chunks[1], offset),
                                      currentoffsetmax);

  reader.advance();

  c.check_eof();
  bool is_valid_utf8 = !c.errors();
  __m256i is_zero =
      _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax);
  utf32_err |= (_mm256_testz_si256(is_zero, is_zero) == 0);

  is_zero = _mm256_xor_si256(
      _mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
  utf32_err |= (_mm256_testz_si256(is_zero, is_zero) == 0);
  if (is_valid_utf8) {
    out |= encoding_type::UTF8;
  }
  if (utf16_err == 0) {
    out |= encoding_type::UTF16_LE;
  }
  if (utf32_err == 0) {
    out |= encoding_type::UTF32_LE;
  }
  return out;
}
#endif // SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
simdutf_warn_unused bool
implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  return haswell::utf8_validation::generic_validate_utf8(buf, len);
}
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF8
simdutf_warn_unused result implementation::validate_utf8_with_errors(
    const char *buf, size_t len) const noexcept {
  return haswell::utf8_validation::generic_validate_utf8_with_errors(buf, len);
}
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_ASCII
simdutf_warn_unused bool
implementation::validate_ascii(const char *buf, size_t len) const noexcept {
  return haswell::ascii_validation::generic_validate_ascii(buf, len);
}

simdutf_warn_unused result implementation::validate_ascii_with_errors(
    const char *buf, size_t len) const noexcept {
  return haswell::ascii_validation::generic_validate_ascii_with_errors(buf,
                                                                       len);
}
#endif // SIMDUTF_FEATURE_ASCII

#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
simdutf_warn_unused bool
implementation::validate_utf16le(const char16_t *buf,
                                 size_t len) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    // empty input is valid UTF-16. protect the implementation from
    // handling nullptr
    return true;
  }
  const auto res =
      haswell::utf16::validate_utf16_with_errors<endianness::LITTLE>(buf, len);
  if (res.is_err()) {
    return false;
  }

  if (res.count == len) {
    return true;
  }

  return scalar::utf16::validate<endianness::LITTLE>(buf + res.count,
                                                     len - res.count);
}
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF16
simdutf_warn_unused bool
implementation::validate_utf16be(const char16_t *buf,
                                 size_t len) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    // empty input is valid UTF-16. protect the implementation from
    // handling nullptr
    return true;
  }
  const auto res =
      haswell::utf16::validate_utf16_with_errors<endianness::BIG>(buf, len);
  if (res.is_err()) {
    return false;
  }

  if (res.count == len) {
    return true;
  }

  return scalar::utf16::validate<endianness::BIG>(buf + res.count,
                                                  len - res.count);
}

simdutf_warn_unused result implementation::validate_utf16le_with_errors(
    const char16_t *buf, size_t len) const noexcept {
  const result res =
      haswell::utf16::validate_utf16_with_errors<endianness::LITTLE>(buf, len);
  if (res.count != len) {
    const result scalar_res =
        scalar::utf16::validate_with_errors<endianness::LITTLE>(
            buf + res.count, len - res.count);
    return result(scalar_res.error, res.count + scalar_res.count);
  } else {
    return res;
  }
}

simdutf_warn_unused result implementation::validate_utf16be_with_errors(
    const char16_t *buf, size_t len) const noexcept {
  const result res =
      haswell::utf16::validate_utf16_with_errors<endianness::BIG>(buf, len);
  if (res.count != len) {
    const result scalar_res =
        scalar::utf16::validate_with_errors<endianness::BIG>(buf + res.count,
                                                             len - res.count);
    return result(scalar_res.error, res.count + scalar_res.count);
  } else {
    return res;
  }
}

void implementation::to_well_formed_utf16le(const char16_t *input, size_t len,
                                            char16_t *output) const noexcept {
  return utf16fix_avx<endianness::LITTLE>(input, len, output);
}

void implementation::to_well_formed_utf16be(const char16_t *input, size_t len,
                                            char16_t *output) const noexcept {
  return utf16fix_avx<endianness::BIG>(input, len, output);
}
#endif // SIMDUTF_FEATURE_UTF16
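// Usage sketch for the *_with_errors validators (illustrative only; `impl`
// stands for any implementation handle, which is not defined here):
//
//   const char16_t text[] = {u'a', 0xd800, u'b'}; // unpaired high surrogate
//   result r = impl.validate_utf16le_with_errors(text, 3);
//   // r.error reports a surrogate error and r.count points at the
//   // offending code unit (index 1).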

#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
simdutf_warn_unused bool
implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
  return utf32::validate(buf, len);
}
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF32
simdutf_warn_unused result implementation::validate_utf32_with_errors(
    const char32_t *buf, size_t len) const noexcept {
  return utf32::validate_with_errors(buf, len);
}
#endif // SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
    const char *buf, size_t len, char *utf8_output) const noexcept {
  std::pair<const char *, char *> ret =
      avx2_convert_latin1_to_utf8(buf, len, utf8_output);
  size_t converted_chars = ret.second - utf8_output;

  if (ret.first != buf + len) {
    const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert(
        ret.first, len - (ret.first - buf), ret.second);
    converted_chars += scalar_converted_chars;
  }

  return converted_chars;
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
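// The shape above recurs throughout this file: the AVX2 kernel returns a
// pair (first unprocessed input position, next output position), and the
// scalar routine finishes whatever tail the vector code left over. A usage
// sketch (illustrative only; `impl` stands for any implementation handle):
//
//   std::string latin1 = "caf\xE9";                  // "café" in Latin-1
//   std::vector<char> utf8(latin1.size() * 2);       // worst case: 2x growth
//   size_t n = impl.convert_latin1_to_utf8(latin1.data(), latin1.size(),
//                                          utf8.data());
//   // n == 5 and utf8 now begins with "caf\xC3\xA9"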

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
  std::pair<const char *, char16_t *> ret =
      avx2_convert_latin1_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t converted_chars = ret.second - utf16_output;
  if (ret.first != buf + len) {
    const size_t scalar_converted_chars =
        scalar::latin1_to_utf16::convert<endianness::LITTLE>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_converted_chars == 0) {
      return 0;
    }
    converted_chars += scalar_converted_chars;
  }
  return converted_chars;
}

simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
  std::pair<const char *, char16_t *> ret =
      avx2_convert_latin1_to_utf16<endianness::BIG>(buf, len, utf16_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t converted_chars = ret.second - utf16_output;
  if (ret.first != buf + len) {
    const size_t scalar_converted_chars =
        scalar::latin1_to_utf16::convert<endianness::BIG>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_converted_chars == 0) {
      return 0;
    }
    converted_chars += scalar_converted_chars;
  }
  return converted_chars;
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  std::pair<const char *, char32_t *> ret =
      avx2_convert_latin1_to_utf32(buf, len, utf32_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t converted_chars = ret.second - utf32_output;
  if (ret.first != buf + len) {
    const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert(
        ret.first, len - (ret.first - buf), ret.second);
    if (scalar_converted_chars == 0) {
      return 0;
    }
    converted_chars += scalar_converted_chars;
  }
  return converted_chars;
}
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  utf8_to_latin1::validating_transcoder converter;
  return converter.convert(buf, len, latin1_output);
}

simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  utf8_to_latin1::validating_transcoder converter;
  return converter.convert_with_errors(buf, len, latin1_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
    const char *input, size_t size, char *latin1_output) const noexcept {
  return utf8_to_latin1::convert_valid(input, size, latin1_output);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
  utf8_to_utf16::validating_transcoder converter;
  return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
}

simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
  utf8_to_utf16::validating_transcoder converter;
  return converter.convert<endianness::BIG>(buf, len, utf16_output);
}

simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
  utf8_to_utf16::validating_transcoder converter;
  return converter.convert_with_errors<endianness::LITTLE>(buf, len,
                                                           utf16_output);
}

simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
  utf8_to_utf16::validating_transcoder converter;
  return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
    const char *input, size_t size, char16_t *utf16_output) const noexcept {
  return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size,
                                                          utf16_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
    const char *input, size_t size, char16_t *utf16_output) const noexcept {
  return utf8_to_utf16::convert_valid<endianness::BIG>(input, size,
                                                       utf16_output);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  utf8_to_utf32::validating_transcoder converter;
  return converter.convert(buf, len, utf32_output);
}

simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  utf8_to_utf32::validating_transcoder converter;
  return converter.convert_with_errors(buf, len, utf32_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
    const char *input, size_t size, char32_t *utf32_output) const noexcept {
  return utf8_to_utf32::convert_valid(input, size, utf32_output);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<const char16_t *, char *> ret =
      haswell::avx2_convert_utf16_to_latin1<endianness::LITTLE>(buf, len,
                                                                latin1_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - latin1_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf16_to_latin1::convert<endianness::LITTLE>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<const char16_t *, char *> ret =
      haswell::avx2_convert_utf16_to_latin1<endianness::BIG>(buf, len,
                                                             latin1_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - latin1_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf16_to_latin1::convert<endianness::BIG>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result
implementation::convert_utf16le_to_latin1_with_errors(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<result, char *> ret =
      avx2_convert_utf16_to_latin1_with_errors<endianness::LITTLE>(
          buf, len, latin1_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly since the scalar fallback already found the
    // correct ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res =
        scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      latin1_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused result
implementation::convert_utf16be_to_latin1_with_errors(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<result, char *> ret =
      avx2_convert_utf16_to_latin1_with_errors<endianness::BIG>(buf, len,
                                                                latin1_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly since the scalar fallback already found the
    // correct ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res =
        scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      latin1_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
  // optimization opportunity: implement a custom function
  return convert_utf16be_to_latin1(buf, len, latin1_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
  // optimization opportunity: implement a custom function
  return convert_utf16le_to_latin1(buf, len, latin1_output);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  std::pair<const char16_t *, char *> ret =
      haswell::avx2_convert_utf16_to_utf8<endianness::LITTLE>(buf, len,
                                                              utf8_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf8_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf16_to_utf8::convert<endianness::LITTLE>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  std::pair<const char16_t *, char *> ret =
      haswell::avx2_convert_utf16_to_utf8<endianness::BIG>(buf, len,
                                                           utf8_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf8_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf16_to_utf8::convert<endianness::BIG>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char *> ret =
      haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(
          buf, len, utf8_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly since the scalar fallback already found the
    // correct ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res =
        scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf8_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char *> ret =
      haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::BIG>(
          buf, len, utf8_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly since the scalar fallback already found the
    // correct ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res =
        scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf8_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  return convert_utf16le_to_utf8(buf, len, utf8_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  return convert_utf16be_to_utf8(buf, len, utf8_output);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
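// Usage sketch (illustrative only; `impl` stands for any implementation
// handle): on success, the *_with_errors converters report the number of
// output code units written; on failure, the input position of the error.
//
//   const char16_t hello[] = {0x0068, 0x00e9};       // "hé"
//   char out[8];
//   result r = impl.convert_utf16le_to_utf8_with_errors(hello, 2, out);
//   // r.error == error_code::SUCCESS and r.count == 3 ("h" plus 2-byte "é")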

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  std::pair<const char32_t *, char *> ret =
      avx2_convert_utf32_to_utf8(buf, len, utf8_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf8_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
        ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<const char32_t *, char *> ret =
      avx2_convert_utf32_to_latin1(buf, len, latin1_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - latin1_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert(
        ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char *> ret =
      avx2_convert_utf32_to_latin1_with_errors(buf, len, latin1_output);
  if (ret.first.count != len) {
    result scalar_res = scalar::utf32_to_latin1::convert_with_errors(
        buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      latin1_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  return convert_utf32_to_latin1(buf, len, latin1_output);
}
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char *> ret =
      haswell::avx2_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
  if (ret.first.count != len) {
    result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
        buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf8_output; // Set count to the number of 8-bit code units written
  return ret.first;
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  std::pair<const char16_t *, char32_t *> ret =
      haswell::avx2_convert_utf16_to_utf32<endianness::LITTLE>(buf, len,
                                                               utf32_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf32_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf16_to_utf32::convert<endianness::LITTLE>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  std::pair<const char16_t *, char32_t *> ret =
      haswell::avx2_convert_utf16_to_utf32<endianness::BIG>(buf, len,
                                                            utf32_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf32_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf16_to_utf32::convert<endianness::BIG>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char32_t *> ret =
      haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(
          buf, len, utf32_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly since the scalar fallback already found the
    // correct ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res =
        scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf32_output; // Set count to the number of 32-bit code units written
  return ret.first;
}

simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char32_t *> ret =
      haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::BIG>(
          buf, len, utf32_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly since the scalar fallback already found the
    // correct ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res =
        scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf32_output; // Set count to the number of 32-bit code units written
  return ret.first;
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  return convert_utf32_to_utf8(buf, len, utf8_output);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  std::pair<const char32_t *, char16_t *> ret =
      avx2_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf16_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf32_to_utf16::convert<endianness::LITTLE>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  std::pair<const char32_t *, char16_t *> ret =
      avx2_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf16_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf32_to_utf16::convert<endianness::BIG>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char16_t *> ret =
      haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(
          buf, len, utf16_output);
  if (ret.first.count != len) {
    result scalar_res =
        scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf16_output; // Set count to the number of 16-bit code units written
  return ret.first;
}

simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char16_t *> ret =
      haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::BIG>(
          buf, len, utf16_output);
  if (ret.first.count != len) {
    result scalar_res =
        scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf16_output; // Set count to the number of 16-bit code units written
  return ret.first;
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  return convert_utf32_to_utf16le(buf, len, utf16_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  return convert_utf32_to_utf16be(buf, len, utf16_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  return convert_utf16le_to_utf32(buf, len, utf32_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  return convert_utf16be_to_utf32(buf, len, utf32_output);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16
void implementation::change_endianness_utf16(const char16_t *input,
                                             size_t length,
                                             char16_t *output) const noexcept {
  utf16::change_endianness_utf16(input, length, output);
}

simdutf_warn_unused size_t implementation::count_utf16le(
    const char16_t *input, size_t length) const noexcept {
  return utf16::count_code_points<endianness::LITTLE>(input, length);
}

simdutf_warn_unused size_t implementation::count_utf16be(
    const char16_t *input, size_t length) const noexcept {
  return utf16::count_code_points<endianness::BIG>(input, length);
}
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8
simdutf_warn_unused size_t
implementation::count_utf8(const char *in, size_t size) const noexcept {
  return utf8::count_code_points_bytemask(in, size);
}
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
    const char *buf, size_t len) const noexcept {
  return count_utf8(buf, len);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
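// Note (not from the original comments): the Latin-1 length of valid UTF-8
// equals its code-point count, since every representable character is
// exactly one Latin-1 byte; hence the reuse of count_utf8 above.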

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
    const char16_t *input, size_t length) const noexcept {
  return utf16::utf8_length_from_utf16_bytemask<endianness::LITTLE>(input,
                                                                    length);
}

simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
    const char16_t *input, size_t length) const noexcept {
  return utf16::utf8_length_from_utf16_bytemask<endianness::BIG>(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
    const char16_t *input, size_t length) const noexcept {
  return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
}

simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
    const char16_t *input, size_t length) const noexcept {
  return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
    const char *input, size_t length) const noexcept {
  return utf8::utf16_length_from_utf8_bytemask(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
    const char *input, size_t len) const noexcept {
  const uint8_t *data = reinterpret_cast<const uint8_t *>(input);
  size_t answer = len / sizeof(__m256i) * sizeof(__m256i);
  size_t i = 0;
  if (answer >= 2048) { // long strings optimization
    __m256i four_64bits = _mm256_setzero_si256();
    while (i + sizeof(__m256i) <= len) {
      __m256i runner = _mm256_setzero_si256();
      // We can do up to 255 loops without overflow.
      size_t iterations = (len - i) / sizeof(__m256i);
      if (iterations > 255) {
        iterations = 255;
      }
      size_t max_i = i + iterations * sizeof(__m256i) - sizeof(__m256i);
      for (; i + 4 * sizeof(__m256i) <= max_i; i += 4 * sizeof(__m256i)) {
        __m256i input1 = _mm256_loadu_si256((const __m256i *)(data + i));
        __m256i input2 =
            _mm256_loadu_si256((const __m256i *)(data + i + sizeof(__m256i)));
        __m256i input3 = _mm256_loadu_si256(
            (const __m256i *)(data + i + 2 * sizeof(__m256i)));
        __m256i input4 = _mm256_loadu_si256(
            (const __m256i *)(data + i + 3 * sizeof(__m256i)));
        __m256i input12 =
            _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input1),
                            _mm256_cmpgt_epi8(_mm256_setzero_si256(), input2));
        __m256i input23 =
            _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input3),
                            _mm256_cmpgt_epi8(_mm256_setzero_si256(), input4));
        __m256i input1234 = _mm256_add_epi8(input12, input23);
        runner = _mm256_sub_epi8(runner, input1234);
      }
      for (; i <= max_i; i += sizeof(__m256i)) {
        __m256i input_256_chunk =
            _mm256_loadu_si256((const __m256i *)(data + i));
        runner = _mm256_sub_epi8(
            runner, _mm256_cmpgt_epi8(_mm256_setzero_si256(), input_256_chunk));
      }
      four_64bits = _mm256_add_epi64(
          four_64bits, _mm256_sad_epu8(runner, _mm256_setzero_si256()));
    }
    answer += _mm256_extract_epi64(four_64bits, 0) +
              _mm256_extract_epi64(four_64bits, 1) +
              _mm256_extract_epi64(four_64bits, 2) +
              _mm256_extract_epi64(four_64bits, 3);
  } else if (answer > 0) {
    for (; i + sizeof(__m256i) <= len; i += sizeof(__m256i)) {
      __m256i latin = _mm256_loadu_si256((const __m256i *)(data + i));
      uint32_t non_ascii = _mm256_movemask_epi8(latin);
      answer += count_ones(non_ascii);
    }
  }
  return answer + scalar::latin1::utf8_length_from_latin1(
                      reinterpret_cast<const char *>(data + i), len - i);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
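// Arithmetic behind utf8_length_from_latin1 (informal note): every Latin-1
// byte costs one UTF-8 byte, plus one extra byte when it is >= 0x80. The
// vector code therefore starts from the byte count it covers and adds the
// number of high bits it observes; e.g. "caf\xE9" costs 4 + 1 = 5 UTF-8
// bytes. The scalar routine finishes whatever the vector loop did not cover.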

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
    const char32_t *input, size_t length) const noexcept {
  return utf32::utf8_length_from_utf32(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
    const char32_t *input, size_t length) const noexcept {
  const __m256i v_00000000 = _mm256_setzero_si256();
  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
  size_t pos = 0;
  size_t count = 0;
  for (; pos + 8 <= length; pos += 8) {
    __m256i in = _mm256_loadu_si256((__m256i *)(input + pos));
    const __m256i surrogate_bytemask =
        _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
    const uint32_t surrogate_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(surrogate_bytemask));
    size_t surrogate_count = (32 - count_ones(surrogate_bitmask)) / 4;
    count += 8 + surrogate_count;
  }
  return count +
         scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
|
|
|
|
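// In the loop above, surrogate_bytemask marks the lanes whose upper 16 bits
// are zero, i.e. BMP code points needing a single UTF-16 unit. Each 32-bit
// lane contributes 4 bits to the movemask, so 32 - count_ones(bitmask) is the
// number of bytes in non-BMP lanes, and dividing by 4 recovers the number of
// code points that need a surrogate pair. Assuming valid UTF-32, a scalar
// sketch of the same count (illustrative only, not part of the library):
//
//   size_t utf16_length_from_utf32_scalar(const char32_t *p, size_t n) {
//     size_t units = 0;
//     for (size_t j = 0; j < n; j++) {
//       units += (p[j] <= 0xFFFF) ? 1 : 2; // supplementary -> surrogate pair
//     }
//     return units;
//   }
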
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
    const char *input, size_t length) const noexcept {
  return utf8::count_code_points(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

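// Assuming valid UTF-8, every code point contributes exactly one leading byte
// (any byte not of the form 10xxxxxx), so the UTF-32 length equals the
// code-point count computed above. A scalar sketch (illustrative only, not
// part of the library):
//
//   size_t count_code_points_scalar(const uint8_t *p, size_t n) {
//     size_t count = 0;
//     for (size_t j = 0; j < n; j++) {
//       count += (p[j] & 0xC0) != 0x80; // count non-continuation bytes
//     }
//     return count;
//   }
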
#if SIMDUTF_FEATURE_BASE64
simdutf_warn_unused result implementation::base64_to_binary(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return base64::compress_decode_base64<true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<true, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return base64::compress_decode_base64<false, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return base64::compress_decode_base64<true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<true, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return base64::compress_decode_base64<false, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

simdutf_warn_unused result implementation::base64_to_binary(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return base64::compress_decode_base64<true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<true, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return base64::compress_decode_base64<false, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return base64::compress_decode_base64<true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<true, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return base64::compress_decode_base64<false, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

size_t implementation::binary_to_base64(const char *input, size_t length,
                                        char *output,
                                        base64_options options) const noexcept {
  if (options & base64_url) {
    return encode_base64<true>(output, input, length, options);
  } else {
    return encode_base64<false>(output, input, length, options);
  }
}
#endif // SIMDUTF_FEATURE_BASE64
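
// All four decoding overloads above dispatch to the same kernel,
// base64::compress_decode_base64<base64_url, ignore_garbage>, choosing the
// URL alphabet from (options & base64_url) and garbage tolerance from the
// *_accept_garbage option values. A hedged caller-side sketch using the
// simdutf front-end API (the sizing helper and result semantics are assumed
// to match the public headers; illustrative only):
//
//   std::vector<char> decode(const std::string &b64) {
//     std::vector<char> out(
//         simdutf::maximal_binary_length_from_base64(b64.data(), b64.size()));
//     simdutf::result r = simdutf::base64_to_binary(
//         b64.data(), b64.size(), out.data(), simdutf::base64_default);
//     if (r.error != simdutf::error_code::SUCCESS) {
//       out.clear(); // decoding failed
//     } else {
//       out.resize(r.count); // on success, count is the bytes written
//     }
//     return out;
//   }
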
} // namespace haswell
} // namespace simdutf

/* begin file src/simdutf/haswell/end.h */
#if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
// nothing needed.
#else
SIMDUTF_UNTARGET_REGION
#endif

#undef SIMDUTF_SIMD_HAS_BYTEMASK

#if SIMDUTF_GCC11ORMORE // workaround for
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
SIMDUTF_POP_DISABLE_WARNINGS
#endif // end of workaround
/* end file src/simdutf/haswell/end.h */
/* end file src/haswell/implementation.cpp */
#endif

#if SIMDUTF_IMPLEMENTATION_PPC64
/* begin file src/ppc64/implementation.cpp */
/* begin file src/simdutf/ppc64/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "ppc64"
// #define SIMDUTF_IMPLEMENTATION ppc64
/* end file src/simdutf/ppc64/begin.h */

/* begin file src/ppc64/ppc64_utf16_to_utf8_tables.h */
// Code generated automatically; DO NOT EDIT
// file generated by scripts/ppc64_convert_utf16_to_utf8.py
#ifndef PPC64_SIMDUTF_UTF16_TO_UTF8_TABLES_H
#define PPC64_SIMDUTF_UTF16_TO_UTF8_TABLES_H

namespace simdutf {
namespace {
namespace tables {
namespace ppc64_utf16_to_utf8 {

#if SIMDUTF_IS_BIG_ENDIAN
// 1 byte for length, 16 bytes for mask
const uint8_t pack_1_2_3_utf8_bytes[256][17] = {
{12, 1, 0, 16, 3, 2, 18, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80},
|
|
{9, 3, 2, 18, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 0, 16, 3, 2, 18, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{10, 17, 3, 2, 18, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{9, 1, 0, 16, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{6, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{8, 0, 16, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{7, 17, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{11, 1, 0, 16, 2, 18, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{8, 2, 18, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{10, 0, 16, 2, 18, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{9, 17, 2, 18, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{10, 1, 0, 16, 19, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{7, 19, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{9, 0, 16, 19, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{8, 17, 19, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{9, 1, 0, 16, 3, 2, 18, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{6, 3, 2, 18, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{8, 0, 16, 3, 2, 18, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{7, 17, 3, 2, 18, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 1, 0, 16, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{3, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{5, 0, 16, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{4, 17, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{8, 1, 0, 16, 2, 18, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{5, 2, 18, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 0, 16, 2, 18, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 17, 2, 18, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 1, 0, 16, 19, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{4, 19, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{6, 0, 16, 19, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{5, 17, 19, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{11, 1, 0, 16, 3, 2, 18, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{8, 3, 2, 18, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{10, 0, 16, 3, 2, 18, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{9, 17, 3, 2, 18, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{8, 1, 0, 16, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 0, 16, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 17, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{10, 1, 0, 16, 2, 18, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{7, 2, 18, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{9, 0, 16, 2, 18, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{8, 17, 2, 18, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{9, 1, 0, 16, 19, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 19, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{8, 0, 16, 19, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{7, 17, 19, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{10, 1, 0, 16, 3, 2, 18, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{7, 3, 2, 18, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{9, 0, 16, 3, 2, 18, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{8, 17, 3, 2, 18, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{7, 1, 0, 16, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{4, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{6, 0, 16, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{5, 17, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{9, 1, 0, 16, 2, 18, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 2, 18, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{8, 0, 16, 2, 18, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{7, 17, 2, 18, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{8, 1, 0, 16, 19, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{5, 19, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 0, 16, 19, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 17, 19, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{9, 1, 0, 16, 3, 2, 18, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{6, 3, 2, 18, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{8, 0, 16, 3, 2, 18, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{7, 17, 3, 2, 18, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 1, 0, 16, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{3, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{5, 0, 16, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{4, 17, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{8, 1, 0, 16, 2, 18, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{5, 2, 18, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 0, 16, 2, 18, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 17, 2, 18, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 1, 0, 16, 19, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{4, 19, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{6, 0, 16, 19, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{5, 17, 19, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{6, 1, 0, 16, 3, 2, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{3, 3, 2, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{5, 0, 16, 3, 2, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{4, 17, 3, 2, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{3, 1, 0, 16, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80, 0x80},
|
|
{2, 0, 16, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{1, 17, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80, 0x80},
|
|
{5, 1, 0, 16, 2, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{2, 2, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{4, 0, 16, 2, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{3, 17, 2, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{4, 1, 0, 16, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{1, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80, 0x80},
|
|
{3, 0, 16, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{2, 17, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80, 0x80},
|
|
{8, 1, 0, 16, 3, 2, 18, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{5, 3, 2, 18, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 0, 16, 3, 2, 18, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 17, 3, 2, 18, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{5, 1, 0, 16, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{2, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{4, 0, 16, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{3, 17, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{7, 1, 0, 16, 2, 18, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{4, 2, 18, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{6, 0, 16, 2, 18, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{5, 17, 2, 18, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{6, 1, 0, 16, 19, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{3, 19, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{5, 0, 16, 19, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{4, 17, 19, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{7, 1, 0, 16, 3, 2, 18, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{4, 3, 2, 18, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{6, 0, 16, 3, 2, 18, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{5, 17, 3, 2, 18, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{4, 1, 0, 16, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{1, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80, 0x80},
|
|
{3, 0, 16, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{2, 17, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80, 0x80},
|
|
{6, 1, 0, 16, 2, 18, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{3, 2, 18, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{5, 0, 16, 2, 18, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{4, 17, 2, 18, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{5, 1, 0, 16, 19, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{2, 19, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80, 0x80},
|
|
{4, 0, 16, 19, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{3, 17, 19, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{11, 1, 0, 16, 3, 2, 18, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{8, 3, 2, 18, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{10, 0, 16, 3, 2, 18, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{9, 17, 3, 2, 18, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{8, 1, 0, 16, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{5, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 0, 16, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 17, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{10, 1, 0, 16, 2, 18, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{7, 2, 18, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{9, 0, 16, 2, 18, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{8, 17, 2, 18, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{9, 1, 0, 16, 19, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 19, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{8, 0, 16, 19, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{7, 17, 19, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{8, 1, 0, 16, 3, 2, 18, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{5, 3, 2, 18, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 0, 16, 3, 2, 18, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 17, 3, 2, 18, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{5, 1, 0, 16, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{2, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{4, 0, 16, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{3, 17, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{7, 1, 0, 16, 2, 18, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{4, 2, 18, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{6, 0, 16, 2, 18, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{5, 17, 2, 18, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{6, 1, 0, 16, 19, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{3, 19, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{5, 0, 16, 19, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{4, 17, 19, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{10, 1, 0, 16, 3, 2, 18, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{7, 3, 2, 18, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{9, 0, 16, 3, 2, 18, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{8, 17, 3, 2, 18, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{7, 1, 0, 16, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{4, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{6, 0, 16, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{5, 17, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{9, 1, 0, 16, 2, 18, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 2, 18, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{8, 0, 16, 2, 18, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{7, 17, 2, 18, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{8, 1, 0, 16, 19, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{5, 19, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 0, 16, 19, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 17, 19, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{9, 1, 0, 16, 3, 2, 18, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 3, 2, 18, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{8, 0, 16, 3, 2, 18, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{7, 17, 3, 2, 18, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 1, 0, 16, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{3, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{5, 0, 16, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{4, 17, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{8, 1, 0, 16, 2, 18, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{5, 2, 18, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 0, 16, 2, 18, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 17, 2, 18, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 1, 0, 16, 19, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{4, 19, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{6, 0, 16, 19, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{5, 17, 19, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{10, 1, 0, 16, 3, 2, 18, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{7, 3, 2, 18, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{9, 0, 16, 3, 2, 18, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{8, 17, 3, 2, 18, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{7, 1, 0, 16, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{4, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{6, 0, 16, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{5, 17, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{9, 1, 0, 16, 2, 18, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 2, 18, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{8, 0, 16, 2, 18, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{7, 17, 2, 18, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{8, 1, 0, 16, 19, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{5, 19, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 0, 16, 19, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 17, 19, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 1, 0, 16, 3, 2, 18, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{4, 3, 2, 18, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{6, 0, 16, 3, 2, 18, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{5, 17, 3, 2, 18, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{4, 1, 0, 16, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{1, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80, 0x80},
|
|
{3, 0, 16, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{2, 17, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80, 0x80},
|
|
{6, 1, 0, 16, 2, 18, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{3, 2, 18, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{5, 0, 16, 2, 18, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{4, 17, 2, 18, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{5, 1, 0, 16, 19, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{2, 19, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80, 0x80},
|
|
{4, 0, 16, 19, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{3, 17, 19, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{9, 1, 0, 16, 3, 2, 18, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 3, 2, 18, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{8, 0, 16, 3, 2, 18, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{7, 17, 3, 2, 18, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 1, 0, 16, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{3, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{5, 0, 16, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{4, 17, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{8, 1, 0, 16, 2, 18, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{5, 2, 18, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 0, 16, 2, 18, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 17, 2, 18, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 1, 0, 16, 19, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{4, 19, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{6, 0, 16, 19, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{5, 17, 19, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{8, 1, 0, 16, 3, 2, 18, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{5, 3, 2, 18, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 0, 16, 3, 2, 18, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 17, 3, 2, 18, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{5, 1, 0, 16, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{2, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80, 0x80},
|
|
{4, 0, 16, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{3, 17, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{7, 1, 0, 16, 2, 18, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{4, 2, 18, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{6, 0, 16, 2, 18, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{5, 17, 2, 18, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{6, 1, 0, 16, 19, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{3, 19, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{5, 0, 16, 19, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{4, 17, 19, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
};
#else
// 1 byte for length, 16 bytes for mask
const uint8_t pack_1_2_3_utf8_bytes[256][17] = {
{12, 0, 1, 17, 2, 3, 19, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80},
|
|
{9, 2, 3, 19, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{11, 1, 17, 2, 3, 19, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{10, 16, 2, 3, 19, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{9, 0, 1, 17, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{6, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{8, 1, 17, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{7, 16, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{11, 0, 1, 17, 3, 19, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{8, 3, 19, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{10, 1, 17, 3, 19, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{9, 16, 3, 19, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{10, 0, 1, 17, 18, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{7, 18, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{9, 1, 17, 18, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{8, 16, 18, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{9, 0, 1, 17, 2, 3, 19, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{6, 2, 3, 19, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{8, 1, 17, 2, 3, 19, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{7, 16, 2, 3, 19, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 0, 1, 17, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{3, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{5, 1, 17, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{4, 16, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{8, 0, 1, 17, 3, 19, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{5, 3, 19, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 1, 17, 3, 19, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 16, 3, 19, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 0, 1, 17, 18, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{4, 18, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{6, 1, 17, 18, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{5, 16, 18, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{11, 0, 1, 17, 2, 3, 19, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{8, 2, 3, 19, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{10, 1, 17, 2, 3, 19, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{9, 16, 2, 3, 19, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{8, 0, 1, 17, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{5, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 1, 17, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 16, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{10, 0, 1, 17, 3, 19, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{7, 3, 19, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{9, 1, 17, 3, 19, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{8, 16, 3, 19, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{9, 0, 1, 17, 18, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 18, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{8, 1, 17, 18, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{7, 16, 18, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{10, 0, 1, 17, 2, 3, 19, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{7, 2, 3, 19, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{9, 1, 17, 2, 3, 19, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{8, 16, 2, 3, 19, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{7, 0, 1, 17, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{4, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{6, 1, 17, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{5, 16, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{9, 0, 1, 17, 3, 19, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 3, 19, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{8, 1, 17, 3, 19, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{7, 16, 3, 19, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{8, 0, 1, 17, 18, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{5, 18, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 1, 17, 18, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 16, 18, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{9, 0, 1, 17, 2, 3, 19, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{6, 2, 3, 19, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{8, 1, 17, 2, 3, 19, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{7, 16, 2, 3, 19, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 0, 1, 17, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{3, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{5, 1, 17, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{4, 16, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{8, 0, 1, 17, 3, 19, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{5, 3, 19, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 1, 17, 3, 19, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 16, 3, 19, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 0, 1, 17, 18, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{4, 18, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{6, 1, 17, 18, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{5, 16, 18, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{6, 0, 1, 17, 2, 3, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{3, 2, 3, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{5, 1, 17, 2, 3, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{4, 16, 2, 3, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{3, 0, 1, 17, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80, 0x80},
|
|
{2, 1, 17, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{1, 16, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80, 0x80},
|
|
{5, 0, 1, 17, 3, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{2, 3, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{4, 1, 17, 3, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{3, 16, 3, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{4, 0, 1, 17, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{1, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80, 0x80},
|
|
{3, 1, 17, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{2, 16, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80, 0x80},
|
|
{8, 0, 1, 17, 2, 3, 19, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{5, 2, 3, 19, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 1, 17, 2, 3, 19, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 16, 2, 3, 19, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{5, 0, 1, 17, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{2, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{4, 1, 17, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{3, 16, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{7, 0, 1, 17, 3, 19, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{4, 3, 19, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{6, 1, 17, 3, 19, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{5, 16, 3, 19, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{6, 0, 1, 17, 18, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{3, 18, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{5, 1, 17, 18, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{4, 16, 18, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{7, 0, 1, 17, 2, 3, 19, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{4, 2, 3, 19, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{6, 1, 17, 2, 3, 19, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{5, 16, 2, 3, 19, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{4, 0, 1, 17, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{1, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80, 0x80},
|
|
{3, 1, 17, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{2, 16, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80, 0x80},
|
|
{6, 0, 1, 17, 3, 19, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{3, 3, 19, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{5, 1, 17, 3, 19, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{4, 16, 3, 19, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{5, 0, 1, 17, 18, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{2, 18, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80, 0x80},
|
|
{4, 1, 17, 18, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{3, 16, 18, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{11, 0, 1, 17, 2, 3, 19, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{8, 2, 3, 19, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{10, 1, 17, 2, 3, 19, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{9, 16, 2, 3, 19, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{8, 0, 1, 17, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{5, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 1, 17, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 16, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{10, 0, 1, 17, 3, 19, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{7, 3, 19, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{9, 1, 17, 3, 19, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{8, 16, 3, 19, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{9, 0, 1, 17, 18, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 18, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{8, 1, 17, 18, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{7, 16, 18, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{8, 0, 1, 17, 2, 3, 19, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{5, 2, 3, 19, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 1, 17, 2, 3, 19, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 16, 2, 3, 19, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{5, 0, 1, 17, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{2, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{4, 1, 17, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{3, 16, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{7, 0, 1, 17, 3, 19, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{4, 3, 19, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{6, 1, 17, 3, 19, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{5, 16, 3, 19, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{6, 0, 1, 17, 18, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{3, 18, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{5, 1, 17, 18, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{4, 16, 18, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{10, 0, 1, 17, 2, 3, 19, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{7, 2, 3, 19, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{9, 1, 17, 2, 3, 19, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{8, 16, 2, 3, 19, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{7, 0, 1, 17, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{6, 1, 17, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{5, 16, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{9, 0, 1, 17, 3, 19, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 3, 19, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{8, 1, 17, 3, 19, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{7, 16, 3, 19, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{8, 0, 1, 17, 18, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{5, 18, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 1, 17, 18, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 16, 18, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{9, 0, 1, 17, 2, 3, 19, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 2, 3, 19, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{8, 1, 17, 2, 3, 19, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{7, 16, 2, 3, 19, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 0, 1, 17, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{3, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{5, 1, 17, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{4, 16, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{8, 0, 1, 17, 3, 19, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{5, 3, 19, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 1, 17, 3, 19, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 16, 3, 19, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 0, 1, 17, 18, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{4, 18, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{6, 1, 17, 18, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{5, 16, 18, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{10, 0, 1, 17, 2, 3, 19, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
|
|
{7, 2, 3, 19, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{9, 1, 17, 2, 3, 19, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{8, 16, 2, 3, 19, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{7, 0, 1, 17, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{4, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{6, 1, 17, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{5, 16, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{9, 0, 1, 17, 3, 19, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 3, 19, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{8, 1, 17, 3, 19, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{7, 16, 3, 19, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{8, 0, 1, 17, 18, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{5, 18, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 1, 17, 18, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 16, 18, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 0, 1, 17, 2, 3, 19, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{4, 2, 3, 19, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{6, 1, 17, 2, 3, 19, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{5, 16, 2, 3, 19, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{4, 0, 1, 17, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{1, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80, 0x80},
|
|
{3, 1, 17, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{2, 16, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80, 0x80},
|
|
{6, 0, 1, 17, 3, 19, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{3, 3, 19, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{5, 1, 17, 3, 19, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{4, 16, 3, 19, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{5, 0, 1, 17, 18, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{2, 18, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80, 0x80},
|
|
{4, 1, 17, 18, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{3, 16, 18, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{9, 0, 1, 17, 2, 3, 19, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 2, 3, 19, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{8, 1, 17, 2, 3, 19, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{7, 16, 2, 3, 19, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 0, 1, 17, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{3, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{5, 1, 17, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{4, 16, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{8, 0, 1, 17, 3, 19, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{5, 3, 19, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 1, 17, 3, 19, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 16, 3, 19, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 0, 1, 17, 18, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{4, 18, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{6, 1, 17, 18, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{5, 16, 18, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{8, 0, 1, 17, 2, 3, 19, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{5, 2, 3, 19, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{7, 1, 17, 2, 3, 19, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{6, 16, 2, 3, 19, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{5, 0, 1, 17, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{2, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80, 0x80},
|
|
{4, 1, 17, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{3, 16, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{7, 0, 1, 17, 3, 19, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80},
|
|
{4, 3, 19, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{6, 1, 17, 3, 19, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{5, 16, 3, 19, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{6, 0, 1, 17, 18, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{3, 18, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
{5, 1, 17, 18, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80},
|
|
{4, 16, 18, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80},
|
|
};
#endif // SIMDUTF_IS_BIG_ENDIAN
} // namespace ppc64_utf16_to_utf8
} // namespace tables
} // unnamed namespace
} // namespace simdutf

#endif // PPC64_SIMDUTF_UTF16_TO_UTF8_TABLES_H
/* end file src/ppc64/ppc64_utf16_to_utf8_tables.h */
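
// Layout of the rows above: row[0] is the number of meaningful output bytes
// and row[1..16] is a vec_perm pattern gathering UTF-8 bytes from two input
// vectors (indices 0..15 select the first vector, 16..31 the second). The
// 0x80 entries pad lanes past the meaningful length; those lanes may still be
// written, but the output pointer advances by only row[0] bytes, so the next
// store overwrites them. A hedged sketch of how a consumer might use one row
// (variable names hypothetical, illustrative only):
//
//   const uint8_t *row =
//       &tables::ppc64_utf16_to_utf8::pack_1_2_3_utf8_bytes[mask][0];
//   const vec_u8_t perm = vec_xl(1, row); // the 16-byte pattern at row + 1
//   const vec_u8_t packed = vec_perm(lo, hi, perm);
//   vec_xst(packed, 0, reinterpret_cast<uint8_t *>(utf8_output));
//   utf8_output += row[0]; // only the first row[0] bytes are meaningful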

namespace simdutf {
namespace ppc64 {
namespace {
#ifndef SIMDUTF_PPC64_H
  #error "ppc64.h must be included"
#endif
using namespace simd;

simdutf_really_inline bool is_ascii(const simd8x64<uint8_t> &input) {
  // careful: 0x80 is not ascii.
  return input.reduce_or().saturating_sub(0b01111111u).bits_not_set_anywhere();
}
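
// reduce_or folds all 64 input bytes into a single 16-byte vector, and
// saturating_sub(0x7f) leaves a nonzero lane exactly where a byte was >= 0x80.
// A scalar equivalent (illustrative only, not part of the library):
//
//   bool is_ascii_scalar(const uint8_t *p, size_t n) {
//     uint8_t folded = 0;
//     for (size_t j = 0; j < n; j++) {
//       folded |= p[j];
//     }
//     return folded < 0x80; // no byte had its most significant bit set
//   }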

simdutf_really_inline simd8<bool>
must_be_2_3_continuation(const simd8<uint8_t> prev2,
                         const simd8<uint8_t> prev3) {
  simd8<uint8_t> is_third_byte =
      prev2.saturating_sub(0xe0u - 0x80); // Only 111_____ will be >= 0x80
  simd8<uint8_t> is_fourth_byte =
      prev3.saturating_sub(0xf0u - 0x80); // Only 1111____ will be >= 0x80
  // Caller requires a bool (all 1's). All values resulting from the
  // subtraction will be <= 64, so signed comparison is fine.
  return simd8<bool>(is_third_byte | is_fourth_byte);
}

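// Worked example (illustrative): if the byte two positions back is the lead
// byte 0xE8, then 0xE8 - (0xE0 - 0x80) = 0x88, whose sign bit is set, so the
// current byte is flagged as the third byte of a 3-byte sequence; a 2-byte
// lead such as 0xC3 saturates to 0x63 (< 0x80) and is not flagged. Likewise,
// 0xF1 three positions back gives 0xF1 - (0xF0 - 0x80) = 0x81, flagging a
// fourth byte.
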
/// ErrorReporting describes the behaviour of a vectorized procedure regarding
/// error checking
enum class ErrorReporting {
  precise,    // the procedure reports an error position (approximate or
              // precise)
  at_the_end, // the procedure only informs about an error after scanning
              // the whole input (or a significant portion of it)
  none,       // no error checking is done; we assume valid inputs
};

#if SIMDUTF_FEATURE_UTF16
/* begin file src/ppc64/ppc64_validate_utf16.cpp */
template <endianness big_endian>
simd8<uint8_t> utf16_gather_high_bytes(const simd16<uint16_t> in0,
                                       const simd16<uint16_t> in1) {
  if (big_endian) {
    const vec_u8_t pack_high = {
        0, 2, 4, 6, 8, 10, 12, 14,     // in0
        16, 18, 20, 22, 24, 26, 28, 30 // in1
    };

    return vec_perm(vec_u8_t(in0.value), vec_u8_t(in1.value), pack_high);
  } else {
    const vec_u8_t pack_high = {
        1, 3, 5, 7, 9, 11, 13, 15,     // in0
        17, 19, 21, 23, 25, 27, 29, 31 // in1
    };

    return vec_perm(vec_u8_t(in0.value), vec_u8_t(in1.value), pack_high);
  }
}
/* end file src/ppc64/ppc64_validate_utf16.cpp */
#endif // SIMDUTF_FEATURE_UTF16

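// UTF-16 validation only needs the high byte of each 16-bit unit: surrogates
// are exactly the units whose high byte lies in [0xD8, 0xDF]. The vec_perm
// patterns above pick byte 0 or byte 1 of every unit depending on which end
// holds the high byte. A scalar sketch of the classification that the
// gathered bytes enable (illustrative only, not part of the library):
//
//   bool is_surrogate(uint8_t high) { return (high & 0xF8) == 0xD8; }
//   bool is_low_surrogate(uint8_t high) { return (high & 0xFC) == 0xDC; }
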
#if SIMDUTF_FEATURE_LATIN1 && SIMDUTF_FEATURE_UTF8
|
|
/* begin file src/ppc64/ppc64_convert_latin1_to_utf8.cpp */
|
|
/*
|
|
* reads a vector of uint16 values
|
|
* bits after 11th are ignored
|
|
* first 11 bits are encoded into utf8
|
|
* !important! utf8_output must have at least 16 writable bytes
|
|
*/
|
|
simdutf_really_inline void
|
|
write_v_u16_11bits_to_utf8(const vector_u16 v_u16, char *&utf8_output,
|
|
const vector_u8 one_byte_bytemask,
|
|
const uint16_t one_byte_bitmask) {
|
|
|
|
// 0b1100_0000_1000_0000
|
|
const auto v_c080 = vector_u16(0xc080);
|
|
// 0b0011_1111_0000_0000
|
|
const auto v_1f00 = vector_u16(0x1f00);
|
|
// 0b0000_0000_0011_1111
|
|
const auto v_003f = vector_u16(0x003f);
|
|
|
|
// 1. prepare 2-byte values
|
|
// input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
|
|
// expected output : [110a|aaaa|10bb|bbbb] x 8
|
|
|
|
// t0 = [0000|0000|00bb|bbbb]
|
|
const auto t0 = v_u16 & v_003f;
|
|
// t1 = [000a|aaaa|bbbb|bb00]
|
|
const auto t1 = v_u16.shl<2>();
|
|
// t2 = [000a|aaaa|00bb|bbbb]
|
|
const auto t2 = select(v_1f00, t1, t0);
|
|
// t3 = [110a|aaaa|10bb|bbbb]
|
|
const auto t3 = t2 | v_c080;
|
|
|
|
// 2. merge ASCII and 2-byte codewords
|
|
const auto utf8_unpacked1 =
|
|
select(one_byte_bytemask, as_vector_u8(v_u16), as_vector_u8(t3));
|
|
|
|
#if SIMDUTF_IS_BIG_ENDIAN
|
|
const auto tmp = as_vector_u16(utf8_unpacked1).swap_bytes();
|
|
#else
|
|
const auto tmp = as_vector_u16(utf8_unpacked1);
|
|
#endif // SIMDUTF_IS_BIG_ENDIAN
|
|
const auto utf8_unpacked = as_vector_u8(tmp);
|
|
|
|
// 3. prepare bitmask for 8-bit lookup
|
|
// one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a
|
|
// - LSB)
|
|
const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a
|
|
const uint16_t m1 = static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
|
|
const uint8_t m2 = static_cast<uint8_t>((m0 | m1) & 0xff); // m2 = hdgcfbea
|
|
// 4. pack the bytes
|
|
const uint8_t *row =
|
|
&simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
|
|
const auto shuffle = vector_u8::load(row + 1);
|
|
const auto utf8_packed = shuffle.lookup_16(utf8_unpacked);
|
|
|
|
// 5. store bytes
|
|
utf8_packed.store(utf8_output);
|
|
|
|
// 6. adjust pointers
|
|
utf8_output += row[0];
|
|
}
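
// Worked example of step 3: with all eight words ASCII, one_byte_bitmask is
// 0xffff, so m0 = 0x5555, m1 = 0x5555 >> 7 = 0x00aa, and
// m2 = (0x5555 | 0x00aa) & 0xff = 0xff, which indexes the row of
// pack_1_2_utf8_bytes that compresses the register to eight single bytes.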

inline void write_v_u16_11bits_to_utf8(const vector_u16 v_u16,
                                       char *&utf8_output,
                                       const vector_u16 v_0000,
                                       const vector_u16 v_ff80) {
  // no bits set above 7th bit
  const auto one_byte_bytemask = (v_u16 & v_ff80) == v_0000;
  const uint16_t one_byte_bitmask = one_byte_bytemask.to_bitmask();

  write_v_u16_11bits_to_utf8(v_u16, utf8_output,
                             as_vector_u8(one_byte_bytemask), one_byte_bitmask);
}

std::pair<const char *const, char *const>
ppc64_convert_latin1_to_utf8(const char *latin_input,
                             const size_t latin_input_length,
                             char *utf8_output) {
  const char *end = latin_input + latin_input_length;

  const auto v_0000 = vector_u16::zero();
  const auto v_00 = vector_u8::zero();

  // 0b1111_1111_1000_0000
  const auto v_ff80 = vector_u16(0xff80);

#if SIMDUTF_IS_BIG_ENDIAN
  const auto latin_1_half_into_u16_byte_mask =
      vector_u8(16, 0, 16, 1, 16, 2, 16, 3, 16, 4, 16, 5, 16, 6, 16, 7);
  const auto latin_2_half_into_u16_byte_mask =
      vector_u8(16, 8, 16, 9, 16, 10, 16, 11, 16, 12, 16, 13, 16, 14, 16, 15);
#else
  const auto latin_1_half_into_u16_byte_mask =
      vector_u8(0, 16, 1, 16, 2, 16, 3, 16, 4, 16, 5, 16, 6, 16, 7, 16);
  const auto latin_2_half_into_u16_byte_mask =
      vector_u8(8, 16, 9, 16, 10, 16, 11, 16, 12, 16, 13, 16, 14, 16, 15, 16);
#endif // SIMDUTF_IS_BIG_ENDIAN

  // Each Latin1 character takes 1-2 UTF-8 bytes. The slow path eagerly writes
  // 16 bytes and then adjusts the pointer, keeping only 8-15 useful bytes, so
  // a store may overshoot the useful output by up to 8 bytes. By reserving 8
  // extra input bytes, we guarantee that the output has 8-16 bytes of
  // headroom.
  while (end - latin_input >= 16 + 8) {
    // Load 16 Latin1 characters (16 bytes) into a 128-bit register
    const auto v_latin = vector_u8::load(latin_input);

    if (v_latin.is_ascii()) { // ASCII fast path!!!!
      v_latin.store(utf8_output);
      latin_input += 16;
      utf8_output += 16;
      continue;
    }

    // assuming a/b are bytes and A/B are uint16 of the same value
    // aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA
    const vector_u16 v_u16_latin_1_half =
        as_vector_u16(latin_1_half_into_u16_byte_mask.lookup_32(v_latin, v_00));

    // aaaa_aaaa_bbbb_bbbb -> BBBB_BBBB
    const vector_u16 v_u16_latin_2_half =
        as_vector_u16(latin_2_half_into_u16_byte_mask.lookup_32(v_latin, v_00));

    write_v_u16_11bits_to_utf8(v_u16_latin_1_half, utf8_output, v_0000, v_ff80);
    write_v_u16_11bits_to_utf8(v_u16_latin_2_half, utf8_output, v_0000, v_ff80);
    latin_input += 16;
  }

  if (end - latin_input >= 16) {
    // Load 16 Latin1 characters (16 bytes) into a 128-bit register
    const auto v_latin = vector_u8::load(latin_input);

    if (v_latin.is_ascii()) { // ASCII fast path!!!!
      v_latin.store(utf8_output);
      latin_input += 16;
      utf8_output += 16;
    } else {
      // assuming a/b are bytes and A/B are uint16 of the same value
      // aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA
      const auto v_u16_latin_1_half = as_vector_u16(
          latin_1_half_into_u16_byte_mask.lookup_32(v_latin, v_00));

      write_v_u16_11bits_to_utf8(v_u16_latin_1_half, utf8_output, v_0000,
                                 v_ff80);
      latin_input += 8;
    }
  }

  return std::make_pair(latin_input, utf8_output);
}
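
// Example of the expansion performed here: the Latin1 byte 0xe9 ('é') has
// aaaaa = 00011 and bbbbbb = 101001, so it becomes the two UTF-8 bytes
// 0xc3 0xa9, while bytes below 0x80 are copied through unchanged.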
/* end file src/ppc64/ppc64_convert_latin1_to_utf8.cpp */
#endif // SIMDUTF_FEATURE_LATIN1 && SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_LATIN1 && SIMDUTF_FEATURE_UTF16
/* begin file src/ppc64/ppc64_convert_latin1_to_utf16.cpp */
template <endianness big_endian>
size_t ppc64_convert_latin1_to_utf16(const char *latin1_input, size_t len,
                                     char16_t *utf16_output) {
  const size_t rounded_len = align_down<vector_u8::ELEMENTS>(len);

  for (size_t i = 0; i < rounded_len; i += vector_u8::ELEMENTS) {
    const auto in = vector_u8::load(&latin1_input[i]);
    in.store_bytes_as_utf16<big_endian>(&utf16_output[i]);
  }

  return rounded_len;
}
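
// The function returns the number of characters it converted (a multiple of
// vector_u8::ELEMENTS); the caller is expected to finish the remaining tail,
// typically with scalar code, since Latin1 to UTF-16 is a pure widening with
// no error cases.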
/* end file src/ppc64/ppc64_convert_latin1_to_utf16.cpp */
#endif // SIMDUTF_FEATURE_LATIN1 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_LATIN1 && SIMDUTF_FEATURE_UTF32
/* begin file src/ppc64/ppc64_convert_latin1_to_utf32.cpp */
std::pair<const char *, char32_t *>
ppc64_convert_latin1_to_utf32(const char *buf, size_t len,
                              char32_t *utf32_output) {
  const size_t rounded_len = align_down<vector_u8::ELEMENTS>(len);

  for (size_t i = 0; i < rounded_len; i += vector_u8::ELEMENTS) {
    const auto in = vector_u8::load(&buf[i]);
    in.store_bytes_as_utf32(&utf32_output[i]);
  }

  return std::make_pair(buf + rounded_len, utf32_output + rounded_len);
}
/* end file src/ppc64/ppc64_convert_latin1_to_utf32.cpp */
#endif // SIMDUTF_FEATURE_LATIN1 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
/* begin file src/ppc64/ppc64_convert_utf8_to_latin1.cpp */
// depends on "tables/utf8_to_utf16_tables.h"

// Convert up to 12 bytes from utf8 to latin1 using a mask indicating the
// end of the code points. Only the least significant 12 bits of the mask
// are accessed.
// It returns how many bytes were consumed (up to 12).
size_t convert_masked_utf8_to_latin1(const char *input,
                                     uint64_t utf8_end_of_code_point_mask,
                                     char *&latin1_output) {
  // We use an approach where we try to process up to 12 input bytes.
  // Why 12 input bytes and not 16? Because we are concerned with the size of
  // the lookup tables. Also 12 is nicely divisible by two and three.
  //
  // Optimization note: our main path below is load-latency dependent. Thus it
  // is maybe beneficial to have fast paths that depend on branch prediction
  // but have less latency. This results in more instructions but,
  // potentially, also higher speeds.
  //
  const auto in = vector_u8::load(input);
  const uint16_t input_utf8_end_of_code_point_mask =
      utf8_end_of_code_point_mask &
      0xfff; // we are only processing 12 bytes in case it is not all ASCII
  if (utf8_end_of_code_point_mask == 0xfff) {
    // We process the data in chunks of 12 bytes.
    in.store(latin1_output);
    latin1_output += 12; // We wrote 12 characters.
    return 12;           // We consumed 12 bytes.
  }
  /// We do not have a fast path available, so we fallback.
  const uint8_t idx =
      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
  const uint8_t consumed =
      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
  // this indicates an invalid input:
  if (idx >= 64) {
    return consumed;
  }
  // Here we must have idx < 64; if not, there is a bug in the validation or
  // elsewhere. We process SIX (6) input code units: this is a relatively easy
  // scenario, as the max length in bytes of six code units spanning between 1
  // and 2 bytes each is 12 bytes. On processors where pdep/pext is fast, we
  // might be able to use a small lookup table.

  const auto reshuffle = vector_u8::load(&tables::utf8_to_utf16::shufutf8[idx]);
  const auto perm8 = reshuffle.lookup_32(in, vector_u8::zero());
#if SIMDUTF_IS_BIG_ENDIAN
  const auto perm16 = as_vector_u16(perm8).swap_bytes();
#else
  const auto perm16 = as_vector_u16(perm8);
#endif // SIMDUTF_IS_BIG_ENDIAN
  const auto ascii = perm16 & uint16_t(0x7f);
  const auto highbyte = perm16 & uint16_t(0x1f00);
  const auto composed = ascii | highbyte.shr<2>();

  const auto latin1_packed = vector_u16::pack(composed, composed);
#if defined(__clang__)
  __attribute__((aligned(16))) char buf[16];
  latin1_packed.store(buf);
  memcpy(latin1_output, buf, 6);
#else
  // writing 8 bytes even though we only care about the first 6 bytes.
  const auto tmp = vec_u64_t(latin1_packed.value);
  memcpy(latin1_output, &tmp[0], 8);
#endif
  latin1_output += 6; // We wrote 6 bytes.
  return consumed;
}
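
// Worked example for the decode above: the two UTF-8 bytes 0xc3 0xa9 ('é')
// land in one 16-bit lane as 0xc3a9; ascii = 0xc3a9 & 0x007f = 0x0029,
// highbyte = 0xc3a9 & 0x1f00 = 0x0300, and 0x0029 | (0x0300 >> 2) = 0x00e9,
// the expected Latin1 byte.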
/* end file src/ppc64/ppc64_convert_utf8_to_latin1.cpp */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
/* begin file src/ppc64/ppc64_convert_utf8_to_utf16.cpp */
// depends on "tables/utf8_to_utf16_tables.h"

// Convert up to 12 bytes from utf8 to utf16 using a mask indicating the
// end of the code points. Only the least significant 12 bits of the mask
// are accessed.
// It returns how many bytes were consumed (up to 12).
template <endianness big_endian>
size_t convert_masked_utf8_to_utf16(const char *input,
                                    uint64_t utf8_end_of_code_point_mask,
                                    char16_t *&utf16_output) {
  // We use an approach where we try to process up to 12 input bytes.
  // Why 12 input bytes and not 16? Because we are concerned with the size of
  // the lookup tables. Also 12 is nicely divisible by two and three.
  //
  // Optimization note: our main path below is load-latency dependent. Thus it
  // is maybe beneficial to have fast paths that depend on branch prediction
  // but have less latency. This results in more instructions but,
  // potentially, also higher speeds.
  //
  // We first try a few fast paths.
  const auto in = vector_u8::load(input);
  const uint16_t input_utf8_end_of_code_point_mask =
      utf8_end_of_code_point_mask & 0xfff;
  if (utf8_end_of_code_point_mask == 0xfff) {
    // We process the data in chunks of 12 bytes.
    // Note: using 16 bytes is unsafe, see issue_ossfuzz_71218
    in.store_bytes_as_utf16<big_endian>(utf16_output);
    utf16_output += 12; // We wrote 12 16-bit characters.
    return 12;          // We consumed 12 bytes.
  }
  if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xaaaa) {
    // We want to take 8 2-byte UTF-8 code units and turn them into 8 2-byte
    // UTF-16 code units.
#if SIMDUTF_IS_BIG_ENDIAN
    const auto in16 = as_vector_u16(in);
#else
    const auto in16 = as_vector_u16(in).swap_bytes();
#endif // SIMDUTF_IS_BIG_ENDIAN
    const auto lo = in16 & uint16_t(0x007f);
    const auto hi = in16.shr<2>();

    auto composed = select(uint16_t(0x1f00 >> 2), hi, lo);
    if (!match_system(big_endian)) {
      composed = composed.swap_bytes();
    }

    composed.store(utf16_output);
    utf16_output += 8; // We wrote 16 bytes, 8 code points.
    return 16;
  }
  if (input_utf8_end_of_code_point_mask == 0x924) {
    // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte
    // UTF-16 code units. There is probably a more efficient sequence, but the
    // following might do.

    // AltiVec: it might be done better, for now SSE translation

    const auto sh =
        vector_u8(2, 1, 0, 16, 5, 4, 3, 16, 8, 7, 6, 16, 11, 10, 9, 16);
#if SIMDUTF_IS_BIG_ENDIAN
    const auto perm =
        as_vector_u32(sh.lookup_32(in, vector_u8::zero())).swap_bytes();
#else
    const auto perm = as_vector_u32(sh.lookup_32(in, vector_u8::zero()));
#endif // SIMDUTF_IS_BIG_ENDIAN
    const auto b0 = perm & uint32_t(0x0000007f);
    const auto b1 = select(uint32_t(0x00003f00 >> 2), perm.shr<2>(), b0);
    const auto b2 = select(uint32_t(0x000f0000 >> 4), perm.shr<4>(), b1);
    const auto composed = b2;
    auto packed = vector_u32::pack(composed, composed);

    if (!match_system(big_endian)) {
      packed = packed.swap_bytes();
    }

    packed.store(utf16_output);
    utf16_output += 4;
    return 12;
  }
  /// We do not have a fast path available, so we fallback.

  const uint8_t idx =
      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
  const uint8_t consumed =
      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];

  if (idx < 64) {
    // SIX (6) input code units: this is a relatively easy scenario, as the
    // max length in bytes of six code units spanning between 1 and 2 bytes
    // each is 12 bytes. On processors where pdep/pext is fast, we might be
    // able to use a small lookup table.
    const auto sh = vector_u8::load(&tables::utf8_to_utf16::shufutf8[idx]);
#if SIMDUTF_IS_BIG_ENDIAN
    const auto perm =
        as_vector_u16(sh.lookup_32(in, vector_u8::zero())).swap_bytes();
#else
    const auto perm = as_vector_u16(sh.lookup_32(in, vector_u8::zero()));
#endif // SIMDUTF_IS_BIG_ENDIAN
    const auto b0 = perm & uint16_t(0x007f);
    const auto b1 = perm & uint16_t(0x1f00);

    auto composed = b0 | b1.shr<2>();

    if (!match_system(big_endian)) {
      composed = composed.swap_bytes();
    }

    composed.store(utf16_output);
    utf16_output += 6; // We wrote 12 bytes, 6 code points.
  } else if (idx < 145) {
    // FOUR (4) input code units
    const auto sh = vector_u8::load(&tables::utf8_to_utf16::shufutf8[idx]);
#if SIMDUTF_IS_BIG_ENDIAN
    const auto perm =
        as_vector_u32(sh.lookup_32(in, vector_u8::zero())).swap_bytes();
#else
    const auto perm = as_vector_u32(sh.lookup_32(in, vector_u8::zero()));
#endif // SIMDUTF_IS_BIG_ENDIAN
    const auto b0 = perm & uint32_t(0x0000007f);
    const auto b1 = perm & uint32_t(0x00003f00);
    const auto b2 = perm & uint32_t(0x000f0000);

    const auto composed = b0 | b1.shr<2>() | b2.shr<4>();

    auto packed = vector_u32::pack(composed, composed);

    if (!match_system(big_endian)) {
      packed = packed.swap_bytes();
    }

    packed.store(utf16_output);
    utf16_output += 4;
  } else if (idx < 209) {
    // TWO (2) input code units
    //////////////
    // There might be garbage inputs where a leading byte masquerades as a
    // four-byte leading byte (by being followed by 3 continuation bytes), but
    // is not greater than 0xf0. This could trigger a buffer overflow if we
    // only counted leading bytes of the form 0xf0 as generating surrogate
    // pairs, without further UTF-8 validation. Thus we must be careful to
    // ensure that only leading bytes at least as large as 0xf0 generate
    // surrogate pairs. We do so at the cost of an extra mask.
    /////////////
    const auto sh = vector_u8::load(&tables::utf8_to_utf16::shufutf8[idx]);
#if SIMDUTF_IS_BIG_ENDIAN
    const auto perm =
        as_vector_u32(sh.lookup_32(in, vector_u8::zero())).swap_bytes();
#else
    const auto perm = as_vector_u32(sh.lookup_32(in, vector_u8::zero()));
#endif // SIMDUTF_IS_BIG_ENDIAN
    const auto ascii = perm & uint32_t(0x0000007f);
    const auto middlebyte = perm & uint32_t(0x00003f00);
    const auto middlebyte_shifted = middlebyte.shr<2>();

    auto middlehighbyte = perm & uint32_t(0x003f0000);
    // correct for spurious high bit
    const auto correct = (perm & uint32_t(0x00400000)).shr<1>();
    middlehighbyte = correct ^ middlehighbyte;
    const auto middlehighbyte_shifted = middlehighbyte.shr<4>();
    // We deliberately carry the leading four bits in highbyte if they are
    // present; we remove them later when computing hightenbits.
    const auto highbyte = perm & uint32_t(0xff000000);
    const auto highbyte_shifted = highbyte.shr<6>();
    // When we need to generate a surrogate pair (leading byte >= 0xF0), the
    // corresponding 32-bit value in 'composed' will be greater than 0x3c00000
    // (that is, 0xf0000000 >> 6). This can be used later to identify the
    // location of the surrogate pairs.
    const auto composed =
        ascii | middlebyte_shifted | highbyte_shifted | middlehighbyte_shifted;

    const auto composedminus = composed - uint32_t(0x10000);
    const auto lowtenbits = composedminus & uint32_t(0x3ff);
    // Notice the 0x3ff mask:
    const auto hightenbits = composedminus.shr<10>() & uint32_t(0x3ff);
    const auto lowtenbitsadd = lowtenbits + uint32_t(0xDC00);
    const auto hightenbitsadd = hightenbits + uint32_t(0xD800);
    const auto lowtenbitsaddshifted = lowtenbitsadd.shl<16>();
    auto surrogates = hightenbitsadd | lowtenbitsaddshifted;

    uint32_t basic_buffer[4];
    composed.store(basic_buffer);
    uint32_t surrogate_buffer[4];
    surrogates.swap_bytes().store(surrogate_buffer);

    for (size_t i = 0; i < 3; i++) {
      if (basic_buffer[i] > 0x3c00000) {
        const auto ch0 = uint16_t(surrogate_buffer[i] & 0xffff);
        const auto ch1 = uint16_t(surrogate_buffer[i] >> 16);
        if (match_system(big_endian)) {
          utf16_output[1] = scalar::u16_swap_bytes(ch0);
          utf16_output[0] = scalar::u16_swap_bytes(ch1);
        } else {
          utf16_output[1] = ch0;
          utf16_output[0] = ch1;
        }
        utf16_output += 2;
      } else {
        const auto chr = uint16_t(basic_buffer[i]);
        if (match_system(big_endian)) {
          utf16_output[0] = chr;
        } else {
          utf16_output[0] = scalar::u16_swap_bytes(chr);
        }

        utf16_output++;
      }
    }
  } else {
    // here we know that there is an error but we do not handle errors
  }
  return consumed;
}
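
// Worked example of the surrogate arithmetic above: for U+1F600, composed
// holds 0x1f600; subtracting 0x10000 gives 0xf600, whose high ten bits (0x3d)
// plus 0xd800 give 0xd83d, and whose low ten bits (0x200) plus 0xdc00 give
// 0xde00 -- i.e., the surrogate pair D83D DE00.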
/* end file src/ppc64/ppc64_convert_utf8_to_utf16.cpp */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
/* begin file src/ppc64/ppc64_convert_utf8_to_utf32.cpp */
// depends on "tables/utf8_to_utf16_tables.h"

// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
// end of the code points. Only the least significant 12 bits of the mask
// are accessed.
// It returns how many bytes were consumed (up to 12).
size_t convert_masked_utf8_to_utf32(const char *input,
                                    uint64_t utf8_end_of_code_point_mask,
                                    char32_t *&utf32_output) {
  // We use an approach where we try to process up to 12 input bytes.
  // Why 12 input bytes and not 16? Because we are concerned with the size of
  // the lookup tables. Also 12 is nicely divisible by two and three.
  //
  // Optimization note: our main path below is load-latency dependent. Thus it
  // is maybe beneficial to have fast paths that depend on branch prediction
  // but have less latency. This results in more instructions but,
  // potentially, also higher speeds.
  //
  // We first try a few fast paths.
  const auto in = vector_u8::load(input);
  const uint16_t input_utf8_end_of_code_point_mask =
      utf8_end_of_code_point_mask & 0xfff;
  if (utf8_end_of_code_point_mask == 0xfff) {
    // We process the data in chunks of 12 bytes.
    in.store_bytes_as_utf32(utf32_output);
    utf32_output += 12; // We wrote 12 32-bit characters.
    return 12;          // We consumed 12 bytes.
  }
  if ((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa) {
    // We want to take 8 2-byte UTF-8 code units and turn them into 8 4-byte
    // UTF-32 code units.
#if SIMDUTF_IS_BIG_ENDIAN
    const auto perm = as_vector_u16(in);
#else
    const auto perm = as_vector_u16(in).swap_bytes();
#endif // SIMDUTF_IS_BIG_ENDIAN
    // in = [110aaaaa|10bbbbbb]
    // t0 = [00000000|00bbbbbb]
    const auto t0 = perm & uint16_t(0x007f);

    // t1 = [00110aaa|aabbbbbb]
    const auto t1 = perm.shr<2>();
    const auto composed = select(uint16_t(0x1f00 >> 2), t1, t0);

    const auto composed8 = as_vector_u8(composed);
    composed8.store_words_as_utf32(utf32_output);

    utf32_output += 8; // We wrote 32 bytes, 8 code points.
    return 16;
  }
  if (input_utf8_end_of_code_point_mask == 0x924) {
    // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte
    // UTF-32 code units.
#if SIMDUTF_IS_BIG_ENDIAN
    const auto sh =
        vector_u8(-1, 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11);
#else
    const auto sh =
        vector_u8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
#endif // SIMDUTF_IS_BIG_ENDIAN
    const auto perm = as_vector_u32(sh.lookup_32(in, vector_u8::zero()));

    // in = [1110aaaa|10bbbbbb|10cccccc]

    // t0 = [00000000|00000000|00cccccc]
    const auto t0 = perm & uint32_t(0x0000007f);

    // t2 = [00000000|0000bbbb|bbcccccc]
    const auto t1 = perm.shr<2>();
    const auto t2 = select(uint32_t(0x00003f00 >> 2), t1, t0);

    // t4 = [00000000|aaaabbbb|bbcccccc]
    const auto t3 = perm.shr<4>();
    const auto t4 = select(uint32_t(0x0f0000 >> 4), t3, t2);

    t4.store(utf32_output);
    utf32_output += 4;
    return 12;
  }
  /// We do not have a fast path available, so we fallback.

  const uint8_t idx =
      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
  const uint8_t consumed =
      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
  if (idx < 64) {
    // SIX (6) input code units: this is a relatively easy scenario, as the
    // max length in bytes of six code units spanning between 1 and 2 bytes
    // each is 12 bytes. On processors where pdep/pext is fast, we might be
    // able to use a small lookup table.
    const auto sh = vector_u8::load(&tables::utf8_to_utf16::shufutf8[idx]);
#if SIMDUTF_IS_BIG_ENDIAN
    const auto perm =
        as_vector_u16(sh.lookup_32(in, vector_u8::zero())).swap_bytes();
#else
    const auto perm = as_vector_u16(sh.lookup_32(in, vector_u8::zero()));
#endif // SIMDUTF_IS_BIG_ENDIAN
    const auto ascii = perm & uint16_t(0x7f);
    const auto highbyte = perm & uint16_t(0x1f00);
    const auto composed = ascii | highbyte.shr<2>();

    as_vector_u8(composed).store_words_as_utf32(utf32_output);
    utf32_output += 6; // We wrote 12 bytes, 6 code points.
  } else if (idx < 145) {
    // FOUR (4) input code units
    const auto sh = vector_u8::load(&tables::utf8_to_utf16::shufutf8[idx]);
#if SIMDUTF_IS_BIG_ENDIAN
    const auto perm =
        as_vector_u32(sh.lookup_32(in, vector_u8::zero())).swap_bytes();
#else
    const auto perm = as_vector_u32(sh.lookup_32(in, vector_u8::zero()));
#endif // SIMDUTF_IS_BIG_ENDIAN
    const auto ascii = perm & uint32_t(0x7f);
    const auto middlebyte = perm & uint32_t(0x3f00);
    const auto middlebyte_shifted = middlebyte.shr<2>();
    const auto highbyte = perm & uint32_t(0x0f0000);
    const auto highbyte_shifted = highbyte.shr<4>();
    const auto composed = ascii | middlebyte_shifted | highbyte_shifted;

    composed.store(utf32_output);
    utf32_output += 4;
  } else if (idx < 209) {
    // TWO (2) input code units
    const auto sh = vector_u8::load(&tables::utf8_to_utf16::shufutf8[idx]);
#if SIMDUTF_IS_BIG_ENDIAN
    const auto perm =
        as_vector_u32(sh.lookup_32(in, vector_u8::zero())).swap_bytes();
#else
    const auto perm = as_vector_u32(sh.lookup_32(in, vector_u8::zero()));
#endif // SIMDUTF_IS_BIG_ENDIAN
    const auto ascii = perm & uint32_t(0x0000007f);
    const auto middlebyte = perm & uint32_t(0x3f00);
    const auto middlebyte_shifted = middlebyte.shr<2>();
    auto middlehighbyte = perm & uint32_t(0x003f0000);
    // correct for spurious high bit
    const auto correct0 = perm & uint32_t(0x00400000);
    const auto correct = correct0.shr<1>();
    middlehighbyte = correct ^ middlehighbyte;
    const auto middlehighbyte_shifted = middlehighbyte.shr<4>();
    const auto highbyte = perm & uint32_t(0x07000000);
    const auto highbyte_shifted = highbyte.shr<6>();
    const auto composed =
        ascii | middlebyte_shifted | highbyte_shifted | middlehighbyte_shifted;
    composed.store(utf32_output);
    utf32_output += 3;
  } else {
    // here we know that there is an error but we do not handle errors
  }
  return consumed;
}
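
// Worked example of the three-byte path: 0xe2 0x82 0xac (U+20AC, '€') is
// shuffled into a 32-bit lane as 0x00e282ac; masking and shifting yield
// cccccc = 0x2c, bbbbbb << 6 = 0x080, and aaaa << 12 = 0x2000, which combine
// to 0x20ac.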
/* end file src/ppc64/ppc64_convert_utf8_to_utf32.cpp */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
/* begin file src/ppc64/ppc64_convert_utf16_to_latin1.cpp */
struct utf16_to_latin1_t {
  error_code err;
  const char16_t *input;
  char *output;
};

template <endianness big_endian>
utf16_to_latin1_t ppc64_convert_utf16_to_latin1(const char16_t *buf, size_t len,
                                                char *latin1_output) {
  const char16_t *end = buf + len;
  while (end - buf >= 8) {
    // Load 8 x UTF-16 characters
    auto in = vector_u8::load(buf);

    // Move low bytes of UTF-16 chars to lower half of `in`
    // and upper bytes to upper half of `in`.
    if (!match_system(big_endian)) {
      const auto perm =
          vector_u8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
      in = perm.lookup_16(in);
    } else {
      const auto perm =
          vector_u8(1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14);
      in = perm.lookup_16(in);
    }

    // AltiVec-specific
#if defined(__clang__)
    __attribute__((aligned(16))) uint64_t tmp[8];
    in.store(tmp);
  #if SIMDUTF_IS_BIG_ENDIAN
    memcpy(latin1_output, &tmp[0], 8);
    const uint64_t upper = tmp[1];
  #else
    memcpy(latin1_output, &tmp[1], 8);
    const uint64_t upper = tmp[0];
  #endif // SIMDUTF_IS_BIG_ENDIAN
#else
    const auto tmp = vec_u64_t(in.value);
  #if SIMDUTF_IS_BIG_ENDIAN
    memcpy(latin1_output, &tmp[0], 8);
    const uint64_t upper = tmp[1];
  #else
    memcpy(latin1_output, &tmp[1], 8);
    const uint64_t upper = tmp[0];
  #endif // SIMDUTF_IS_BIG_ENDIAN
#endif // defined(__clang__)
    // AltiVec

    if (simdutf_unlikely(upper)) {
      uint8_t bytes[8];
      memcpy(bytes, &upper, 8);
      for (size_t k = 0; k < 8; k++) {
        if (bytes[k] != 0) {
          return utf16_to_latin1_t{error_code::TOO_LARGE, buf + k,
                                   latin1_output};
        }
      }
    } else {
      // Adjust pointers for next iteration
      buf += 8;
      latin1_output += 8;
    }
  } // while

  return utf16_to_latin1_t{error_code::SUCCESS, buf, latin1_output};
}
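
// Each iteration handles eight code units: the shuffle splits low and high
// bytes into the two halves of the register, the low half is stored as eight
// Latin1 bytes, and any nonzero byte among the gathered high bytes pinpoints
// a code unit above U+00FF, reported as TOO_LARGE.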
/* end file src/ppc64/ppc64_convert_utf16_to_latin1.cpp */
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF8
/* begin file src/ppc64/ppc64_convert_utf16_to_utf8.cpp */
/*
  The vectorized algorithm works on a single SSE register, i.e., it
  loads eight 16-bit code units.

  We consider three cases:
  1. an input register contains no surrogates and each value
     is in range 0x0000 .. 0x07ff.
  2. an input register contains no surrogates and values are
     in range 0x0000 .. 0xffff.
  3. an input register contains surrogates --- i.e. codepoints
     can have 16 or 32 bits.

  Ad 1.

  When values are less than 0x0800, it means that a 16-bit code unit
  can be converted into: 1) a single UTF8 byte (when it is an ASCII
  char) or 2) two UTF8 bytes.

  For this case we do only some shuffle to obtain these 2-byte
  codes and finally compress the whole SSE register with a single
  shuffle.

  We need a 256-entry lookup table to get a compression pattern
  and the number of output bytes in the compressed vector register.
  Each entry occupies 17 bytes.

  Ad 2.

  When values fit in 16-bit code units, but are above 0x07ff, then
  a single word may produce one, two or three UTF8 bytes.

  We prepare data for all these three cases in two registers.
  The first register contains the lower two UTF8 bytes (used in all
  cases), while the second one contains just the third byte for
  the three-UTF8-bytes case.

  Finally these two registers are interleaved forming an eight-element
  array of 32-bit values. The array spans two SSE registers.
  The bytes from the registers are compressed using two shuffles.

  We need a 256-entry lookup table to get a compression pattern
  and the number of output bytes in the compressed vector register.
  Each entry occupies 17 bytes.

  To summarize:
  - We need two 256-entry tables that have 8704 bytes in total.
*/

// Auxiliary procedure used by the UTF-16 and UTF-32 conversions into UTF-8.
// Note: the pointer is passed by reference; it is updated by the procedure.
template <typename T>
simdutf_really_inline void ppc64_convert_utf16_to_1_2_3_bytes_of_utf8(
    const vector_u16 in, uint16_t one_byte_bitmask,
    const T one_or_two_bytes_bytemask, uint16_t one_or_two_bytes_bitmask,
    char *&utf8_output) {
  // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
#if SIMDUTF_IS_BIG_ENDIAN
  const auto dup_lsb =
      vector_u8(1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15);
#else
  const auto dup_lsb =
      vector_u8(0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14);
#endif // SIMDUTF_IS_BIG_ENDIAN

  /* In this branch we handle three cases:
     1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
        single UTF-8 byte
     2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two
        UTF-8 bytes
     3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
        three UTF-8 bytes

     We expand the input word (16-bit) into two code units (32-bit), thus
     we have room for four bytes. However, we need five distinct bit
     layouts. Note that the last byte in cases #2 and #3 is the same.

     We precompute byte 1 for case #1 and the common byte for cases #2 & #3
     in register t2.

     We precompute byte 1 for case #3 and -- **conditionally** -- precompute
     either byte 1 for case #2 or byte 2 for case #3. Note that they
     differ by exactly one bit.

     Finally from these two code units we build a proper UTF-8 sequence,
     taking into account the case (i.e., the number of bytes to write).
  */
  /**
   * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
   * t2 => [0ccc|cccc] [10cc|cccc]
   * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
   */
  // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
  const auto t0 = as_vector_u16(dup_lsb.lookup_16(as_vector_u8(in)));

  // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
  const auto t1 = t0 & uint16_t(0b0011111101111111);
  // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
  const auto t2 = t1 | uint16_t(0b1000000000000000);

  // in = [aaaa|bbbb|bbcc|cccc]
  // a0 = [0000|0000|0000|aaaa]
  const auto a0 = in.shr<12>();
  // b0 = [aabb|bbbb|cccc|cc00]
  const auto b0 = in.shl<2>();
  // s0 = [00bb|bbbb|0000|aaaa]
  const auto s0 = select(uint16_t(0x3f00), b0, a0);

  // s3 = [11bb|bbbb|1110|aaaa]
  const auto s3 = s0 | uint16_t(0b1100000011100000);

  const auto m0 =
      ~as_vector_u16(one_or_two_bytes_bytemask) & uint16_t(0b0100000000000000);
  const auto s4 = s3 ^ m0;

  // 4. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
  const uint16_t mask =
      (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
  if (mask == 0) {
    // We only have three-byte code units. Use fast path.
#if SIMDUTF_IS_BIG_ENDIAN
    // Lookups produced by scripts/ppc64_convert_utf16_to_utf8.py
    const auto shuffle0 =
        vector_u8(1, 0, 16, 3, 2, 18, 5, 4, 20, 7, 6, 22, 9, 8, 24, 11);
    const auto shuffle1 = vector_u8(10, 26, 13, 12, 28, 15, 14, 30, -1, -1, -1,
                                    -1, -1, -1, -1, -1);
#else
    const auto shuffle0 =
        vector_u8(0, 1, 17, 2, 3, 19, 4, 5, 21, 6, 7, 23, 8, 9, 25, 10);
    const auto shuffle1 = vector_u8(11, 27, 12, 13, 29, 14, 15, 31, -1, -1, -1,
                                    -1, -1, -1, -1, -1);
#endif // SIMDUTF_IS_BIG_ENDIAN
    const auto utf8_0 = shuffle0.lookup_32(as_vector_u8(s4), as_vector_u8(t2));
    const auto utf8_1 = shuffle1.lookup_32(as_vector_u8(s4), as_vector_u8(t2));

    utf8_0.store(utf8_output);
    utf8_output += 16;
    utf8_1.store(utf8_output);
    utf8_output += 8;
    return;
  }

  const uint8_t mask0 = uint8_t(mask);

  const uint8_t *row0 =
      &simdutf::tables::ppc64_utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
  const auto shuffle0 = vector_u8::load(row0 + 1);

  const auto utf8_0 = shuffle0.lookup_32(as_vector_u8(s4), as_vector_u8(t2));
  const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);

  const uint8_t *row1 =
      &simdutf::tables::ppc64_utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
  const auto shuffle1 = vector_u8::load(row1 + 1) + uint8_t(8);
  const auto utf8_1 = shuffle1.lookup_32(as_vector_u8(s4), as_vector_u8(t2));

  utf8_0.store(utf8_output);
  utf8_output += row0[0];
  utf8_1.store(utf8_output);
  utf8_output += row1[0];
}
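
// The 16-bit `mask` interleaves the two bitmasks (two bits per input word),
// so each of its bytes describes four input words; each half then indexes the
// 256-entry pack_1_2_3_utf8_bytes table, whose rows start with the number of
// output bytes followed by a 16-byte shuffle pattern.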

struct utf16_to_utf8_t {
  error_code err;
  const char16_t *input;
  char *output;
};

/*
  Returns utf16_to_utf8_t value.
  A scalar routine should carry on the conversion of the tail,
  iff there was no error.
*/
template <endianness big_endian>
utf16_to_utf8_t ppc64_convert_utf16_to_utf8(const char16_t *buf, size_t len,
                                            char *utf8_output) {

  const char16_t *end = buf + len;

  const auto v_f800 = vector_u16(0xf800);
  const auto v_d800 = vector_u16(0xd800);
  const size_t safety_margin =
      12; // to avoid overruns, see issue
          // https://github.com/simdutf/simdutf/issues/92

  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
    auto in = vector_u16::load(buf);
    if (not match_system(big_endian)) {
      in = in.swap_bytes();
    }
    // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
    if (in.is_ascii()) {
      auto nextin = vector_u16::load(buf + vector_u16::ELEMENTS);
      if (not match_system(big_endian)) {
        nextin = nextin.swap_bytes();
      }

      if (nextin.is_ascii()) {
        // 1. pack the bytes
        const auto utf8_packed = vector_u16::pack(in, nextin);
        // 2. store (16 bytes)
        utf8_packed.store(utf8_output);
        // 3. adjust pointers
        buf += 16;
        utf8_output += 16;
        continue; // we are done for this round!
      }

      // next block is not ASCII
      const auto utf8_packed = vector_u16::pack(in, in);
      // 2. store (16 bytes)
      utf8_packed.store(utf8_output);
      // 3. adjust pointers
      buf += 8;
      utf8_output += 8;
      in = nextin;
      // fall through to the general case below
    }

    // no bits set above 7th bit
    const auto one_byte_bytemask = in < uint16_t(1 << 7);
    const uint16_t one_byte_bitmask = one_byte_bytemask.to_bitmask();

    // no bits set above 11th bit
    const auto one_or_two_bytes_bytemask = in < uint16_t(1 << 11);
    const uint16_t one_or_two_bytes_bitmask =
        one_or_two_bytes_bytemask.to_bitmask();

    if (one_or_two_bytes_bitmask == 0xffff) {
      write_v_u16_11bits_to_utf8(
          in, utf8_output, as_vector_u8(one_byte_bytemask), one_byte_bitmask);
      buf += 8;
      continue;
    }

    // 1. Check if there are any surrogate words in the input chunk.
    //    We also have to deal with the situation when there is a surrogate
    //    word at the end of a chunk.
    const auto surrogates_bytemask = (in & v_f800) == v_d800;

    // bitmask = 0x0000 if there are no surrogates
    //         = 0xc000 if the last word is a surrogate
    const uint16_t surrogates_bitmask = surrogates_bytemask.to_bitmask();
    // It might seem like checking for surrogates_bitmask == 0xc000 could help.
    // However, it is likely an uncommon occurrence.
    if (surrogates_bitmask == 0x0000) {
      ppc64_convert_utf16_to_1_2_3_bytes_of_utf8(
          in, one_byte_bitmask, one_or_two_bytes_bytemask,
          one_or_two_bytes_bitmask, utf8_output);

      buf += 8;
      // surrogate pair(s) in a register
    } else {
      // Let us do a scalar fallback.
      // It may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint16_t word = not match_system(big_endian)
                            ? scalar::u16_swap_bytes(buf[k])
                            : buf[k];
        if ((word & 0xFF80) == 0) {
          *utf8_output++ = uint8_t(word);
        } else if ((word & 0xF800) == 0) {
          *utf8_output++ = uint8_t((word >> 6) | 0b11000000);
          *utf8_output++ = uint8_t((word & 0b111111) | 0b10000000);
        } else if ((word & 0xF800) != 0xD800) {
          *utf8_output++ = uint8_t((word >> 12) | 0b11100000);
          *utf8_output++ = uint8_t(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = uint8_t((word & 0b111111) | 0b10000000);
        } else {
          // must be a surrogate pair
          uint16_t diff = uint16_t(word - 0xD800);
          uint16_t next_word = not match_system(big_endian)
                                   ? scalar::u16_swap_bytes(buf[k + 1])
                                   : buf[k + 1];
          k++;
          uint16_t diff2 = uint16_t(next_word - 0xDC00);
          if ((diff | diff2) > 0x3FF) {
            return utf16_to_utf8_t{error_code::SURROGATE, buf + k - 1,
                                   utf8_output};
          }
          uint32_t value = (diff << 10) + diff2 + 0x10000;
          *utf8_output++ = uint8_t((value >> 18) | 0b11110000);
          *utf8_output++ = uint8_t(((value >> 12) & 0b111111) | 0b10000000);
          *utf8_output++ = uint8_t(((value >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = uint8_t((value & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while

  return utf16_to_utf8_t{error_code::SUCCESS, buf, utf8_output};
}
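
// In the scalar fallback, (diff | diff2) > 0x3FF catches both an unpaired
// high surrogate and a malformed low one in a single branch, since valid
// offsets from 0xD800/0xDC00 both fit in ten bits. For example, D83D DE00
// gives diff = 0x3d, diff2 = 0x200, and
// value = (0x3d << 10) + 0x200 + 0x10000 = 0x1f600.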
/* end file src/ppc64/ppc64_convert_utf16_to_utf8.cpp */
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
/* begin file src/ppc64/ppc64_convert_utf16_to_utf32.cpp */
struct utf16_to_utf32_t {
  error_code err;        // error code
  const char16_t *input; // last position in input buffer
  char32_t *output;      // last position in output buffer
};

template <endianness big_endian>
utf16_to_utf32_t ppc64_convert_utf16_to_utf32(const char16_t *buf, size_t len,
                                              char32_t *utf32_output) {
  const char16_t *end = buf + len;

  const auto v_f800 = vector_u16::splat(0xf800);
  const auto v_d800 = vector_u16::splat(0xd800);
  const auto zero = vector_u8::zero();

  while (end - buf >= vector_u16::ELEMENTS) {
    auto in = vector_u16::load(buf);
    if (not match_system(big_endian)) {
      in = in.swap_bytes();
    }

    // 1. Check if there are any surrogate words in the input chunk.
    //    We also have to deal with the situation when there is a surrogate
    //    word at the end of a chunk.
    const auto surrogates_bytemask = (in & v_f800) == v_d800;

    // bitmask = 0x0000 if there are no surrogates
    const uint16_t surrogates_bitmask = surrogates_bytemask.to_bitmask();

    // It might seem like checking for surrogates_bitmask == 0xc000 could help.
    // However, it is likely an uncommon occurrence.
    if (surrogates_bitmask == 0x0000) {
      // case: no surrogate pairs, extend 16-bit code units to 32-bit code
      // units
#if SIMDUTF_IS_BIG_ENDIAN
      const auto lo =
          vector_u8(16, 16, 0, 1, 16, 16, 2, 3, 16, 16, 4, 5, 16, 16, 6, 7);
      const auto hi = vector_u8(16, 16, 8 + 0, 8 + 1, 16, 16, 8 + 2, 8 + 3, 16,
                                16, 8 + 4, 8 + 5, 16, 16, 8 + 6, 8 + 7);
#else
      const auto lo =
          vector_u8(0, 1, 16, 16, 2, 3, 16, 16, 4, 5, 16, 16, 6, 7, 16, 16);
      const auto hi = vector_u8(8 + 0, 8 + 1, 16, 16, 8 + 2, 8 + 3, 16, 16,
                                8 + 4, 8 + 5, 16, 16, 8 + 6, 8 + 7, 16, 16);
#endif // SIMDUTF_IS_BIG_ENDIAN

      const auto utf32_0 = lo.lookup_32(as_vector_u8(in), zero);
      const auto utf32_1 = hi.lookup_32(as_vector_u8(in), zero);

      utf32_0.store(utf32_output);
      utf32_1.store(utf32_output + 4);
      utf32_output += 8;
      buf += 8;
      // surrogate pair(s) in a register
    } else {
      // Let us do a scalar fallback.
      // It may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        const uint16_t word = not match_system(big_endian)
                                  ? scalar::u16_swap_bytes(buf[k])
                                  : buf[k];
        if ((word & 0xF800) != 0xD800) {
          *utf32_output++ = char32_t(word);
        } else {
          // must be a surrogate pair
          uint16_t diff = uint16_t(word - 0xD800);
          uint16_t next_word = not match_system(big_endian)
                                   ? scalar::u16_swap_bytes(buf[k + 1])
                                   : buf[k + 1];
          k++;
          uint16_t diff2 = uint16_t(next_word - 0xDC00);
          if ((diff | diff2) > 0x3FF) {
            return utf16_to_utf32_t{error_code::SURROGATE, buf + k - 1,
                                    utf32_output};
          }
          uint32_t value = (diff << 10) + diff2 + 0x10000;
          *utf32_output++ = char32_t(value);
        }
      }
      buf += k;
    }
  } // while

  return utf16_to_utf32_t{error_code::SUCCESS, buf, utf32_output};
}
/* end file src/ppc64/ppc64_convert_utf16_to_utf32.cpp */
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
/* begin file src/ppc64/ppc64_convert_utf32_to_latin1.cpp */
enum class ErrorChecking { disabled, enabled };

struct utf32_to_latin1_t {
  error_code err;
  const char32_t *input;
  char *output;
};

template <ErrorChecking ec>
utf32_to_latin1_t simdutf_really_inline ppc64_convert_utf32_to_latin1(
    const char32_t *buf, size_t len, char *latin1_output) {
  constexpr size_t N = vector_u32::ELEMENTS;
  const size_t rounded_len = align_down<4 * N>(len);

  const auto high_bytes_mask = vector_u32::splat(0xFFFFFF00);

  for (size_t i = 0; i < rounded_len; i += 4 * N) {
    const auto in1 = vector_u32::load(buf + 0 * N);
    const auto in2 = vector_u32::load(buf + 1 * N);
    const auto in3 = vector_u32::load(buf + 2 * N);
    const auto in4 = vector_u32::load(buf + 3 * N);

    if (ec == ErrorChecking::enabled) {
      const auto combined = in1 | in2 | in3 | in4;
      const auto too_big = (combined & high_bytes_mask) != uint32_t(0);

      if (simdutf_unlikely(too_big.any())) {
        // Scalar code will carry on from the beginning of the current block
        // and report the exact error position.
        return utf32_to_latin1_t{error_code::OTHER, buf, latin1_output};
      }
    }

    // Note: element #1 contains 0, and is used to mask-out elements
#if SIMDUTF_IS_BIG_ENDIAN
    const auto shlo = vector_u8(0 + 3, 4 + 3, 8 + 3, 12 + 3, 16 + 3, 20 + 3,
                                24 + 3, 28 + 3, 1, 1, 1, 1, 1, 1, 1, 1);
    const auto shhi = vector_u8(1, 1, 1, 1, 1, 1, 1, 1, 0 + 3, 4 + 3, 8 + 3,
                                12 + 3, 16 + 3, 20 + 3, 24 + 3, 28 + 3);
#else
    const auto shlo =
        vector_u8(0, 4, 8, 12, 16, 20, 24, 28, 1, 1, 1, 1, 1, 1, 1, 1);
    const auto shhi =
        vector_u8(1, 1, 1, 1, 1, 1, 1, 1, 0, 4, 8, 12, 16, 20, 24, 28);
#endif // SIMDUTF_IS_BIG_ENDIAN
    const auto lo = shlo.lookup_32(as_vector_u8(in1), as_vector_u8(in2));
    const auto hi = shhi.lookup_32(as_vector_u8(in3), as_vector_u8(in4));

    const auto merged = lo | hi;

    merged.store(latin1_output);
    latin1_output += 4 * N;
    buf += 4 * N;
  }

  return utf32_to_latin1_t{error_code::SUCCESS, buf, latin1_output};
}
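
// Each iteration thus consumes four registers = 16 code points (64 input
// bytes) and emits exactly 16 Latin1 bytes: the shuffles pick byte 0 of every
// code unit (byte 3 on big-endian), while index 1 selects a byte that is zero
// for valid input, so the final OR merges the two halves untouched.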
/* end file src/ppc64/ppc64_convert_utf32_to_latin1.cpp */
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_UTF16
/* begin file src/ppc64/ppc64_convert_utf32_to_utf16.cpp */
struct utf32_to_utf16_t {
  error_code err;
  const char32_t *input;
  char16_t *output;
};

template <endianness big_endian, ErrorReporting er>
utf32_to_utf16_t ppc64_convert_utf32_to_utf16(const char32_t *buf, size_t len,
                                              char16_t *utf16_output) {

  const char32_t *end = buf + len;

  const auto zero = vector_u32::zero();
  const auto v_ffff0000 = vector_u32::splat(0xffff0000);

  auto forbidden_global = simd16<bool>();

  while (end - buf >= 8) {
    const auto in0 = vector_u32::load(buf);
    const auto in1 = vector_u32::load(buf + vector_u32::ELEMENTS);

    const auto any_surrogate = ((in0 | in1) & v_ffff0000) != zero;

    // Check if no bits set above 15th
    if (any_surrogate.is_zero()) {
      // Pack UTF-32 to UTF-16
#if SIMDUTF_IS_BIG_ENDIAN
      const auto sh = big_endian ? vector_u8(2, 3, 6, 7, 10, 11, 14, 15, 18, 19,
                                             22, 23, 26, 27, 30, 31)
                                 : vector_u8(3, 2, 7, 6, 11, 10, 15, 14, 19, 18,
                                             23, 22, 27, 26, 31, 30);
#else
      const auto sh = big_endian ? vector_u8(1, 0, 5, 4, 9, 8, 13, 12, 17, 16,
                                             21, 20, 25, 24, 29, 28)
                                 : vector_u8(0, 1, 4, 5, 8, 9, 12, 13, 16, 17,
                                             20, 21, 24, 25, 28, 29);
#endif // SIMDUTF_IS_BIG_ENDIAN
      const auto packed0 = sh.lookup_32(as_vector_u8(in0), as_vector_u8(in1));
      const auto packed = as_vector_u16(packed0);

#if SIMDUTF_IS_BIG_ENDIAN
      const auto v_f800 =
          big_endian ? vector_u16::splat(0xf800) : vector_u16::splat(0x00f8);
      const auto v_d800 =
          big_endian ? vector_u16::splat(0xd800) : vector_u16::splat(0x00d8);
#else
      const auto v_f800 =
          big_endian ? vector_u16::splat(0x00f8) : vector_u16::splat(0xf800);
      const auto v_d800 =
          big_endian ? vector_u16::splat(0x00d8) : vector_u16::splat(0xd800);
#endif // SIMDUTF_IS_BIG_ENDIAN
      const auto forbidden = (packed & v_f800) == v_d800;

      switch (er) {
      case ErrorReporting::precise:
        if (not forbidden.is_zero()) {
          // scalar procedure will rescan the portion of buffer we've just
          // analysed
          return utf32_to_utf16_t{error_code::OTHER, buf, utf16_output};
        }
        break;
      case ErrorReporting::at_the_end:
        forbidden_global |= forbidden;
        break;
      case ErrorReporting::none:
        break;
      }

      packed.store(utf16_output);
      utf16_output += 8;
      buf += 8;
    } else {
      size_t forward = 7;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint32_t word = buf[k];
        if ((word & 0xFFFF0000) == 0) {
          // will not generate a surrogate pair
          if (word >= 0xD800 && word <= 0xDFFF) {
            return utf32_to_utf16_t{error_code::SURROGATE, buf + k,
                                    utf16_output};
          }
          *utf16_output++ = not match_system(big_endian)
                                ? scalar::u16_swap_bytes(uint16_t(word))
                                : uint16_t(word);
        } else {
          // will generate a surrogate pair
          if (word > 0x10FFFF) {
            return utf32_to_utf16_t{error_code::TOO_LARGE, buf + k,
                                    utf16_output};
          }
          word -= 0x10000;
          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
          if (not match_system(big_endian)) {
            high_surrogate = scalar::u16_swap_bytes(high_surrogate);
            low_surrogate = scalar::u16_swap_bytes(low_surrogate);
          }
          *utf16_output++ = char16_t(high_surrogate);
          *utf16_output++ = char16_t(low_surrogate);
        }
      }
      buf += k;
    }
  }

  if (er == ErrorReporting::at_the_end) {
    // check for invalid input
    if (not forbidden_global.is_zero()) {
      return utf32_to_utf16_t{error_code::SURROGATE, buf, utf16_output};
    }
  }

  return utf32_to_utf16_t{error_code::SUCCESS, buf, utf16_output};
}
/* end file src/ppc64/ppc64_convert_utf32_to_utf16.cpp */
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_UTF8
/* begin file src/ppc64/ppc64_convert_utf32_to_utf8.cpp */
struct utf32_to_utf8_t {
  error_code err;
  const char32_t *input;
  char *output;
};

template <ErrorReporting er>
utf32_to_utf8_t ppc64_convert_utf32_to_utf8(const char32_t *buf, size_t len,
                                            char *utf8_output) {
  const char32_t *end = buf + len;

  const auto v_f800 = vector_u16::splat(0xf800);
  const auto v_d800 = vector_u16::splat(0xd800);

  const auto v_ffff0000 = vector_u32::splat(0xffff0000);
  const auto v_00000000 = vector_u32::zero();
  auto forbidden_bytemask = simd16<bool>();
  const size_t safety_margin =
      12; // to avoid overruns, see issue
          // https://github.com/simdutf/simdutf/issues/92

  while (end - buf >=
         std::ptrdiff_t(
             16 + safety_margin)) { // buf is a char32_t pointer, each char32_t
                                    // has 4 bytes or 32 bits, thus buf + 16 *
                                    // char32_t = 512 bits = 64 bytes
    // We load two 16-byte registers for a total of 32 bytes or 8 characters.
    // These two values can hold only 8 UTF32 chars
    auto in0 = vector_u32::load(buf);
    auto in1 = vector_u32::load(buf + vector_u32::ELEMENTS);

    // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
    // saturation
    auto in = vector_u32::pack(in0, in1);

    // Try to apply the UTF-16 => UTF-8 routine from
    // ./ppc64_convert_utf16_to_utf8.cpp

    // Check for ASCII fast path

    // ASCII fast path!!!!
    // We eagerly load another 32 bytes, hoping that they will be ASCII too.
    // The intuition is that we try to collect 16 ASCII characters, which
    // requires a total of 64 bytes of input. If we fail, we just pass in2
    // and in3 as our new inputs.
    if (in.is_ascii()) { // if the first two blocks are ASCII
      const auto in2 = vector_u32::load(buf + 2 * vector_u32::ELEMENTS);
      const auto in3 = vector_u32::load(buf + 3 * vector_u32::ELEMENTS);

      const auto next = vector_u32::pack(in2, in3);
      if (next.is_ascii()) {
        // 1. pack the bytes
        const auto utf8_packed = vector_u16::pack(in, next);
        // 2. store (16 bytes)
        utf8_packed.store(utf8_output);
        // 3. adjust pointers
        buf += 16;
        utf8_output += 16;
        continue; // we are done for this round!
      }

      // `next` is not ASCII, write `in` and carry on with next

      // 1. pack the bytes
      const auto utf8_packed = vector_u16::pack(in, in);
      utf8_packed.store(utf8_output);
      // 3. adjust pointers
      buf += 8;
      utf8_output += 8;

      // Proceed with next input
      in = next;
      in0 = in2;
      in1 = in3;
    }

    // no bits set above 7th bit
    const auto one_byte_bytemask = in < uint16_t(1 << 7);
    const uint16_t one_byte_bitmask = one_byte_bytemask.to_bitmask();

    // no bits set above 11th bit
    const auto one_or_two_bytes_bytemask = in < uint16_t(1 << 11);
    const uint16_t one_or_two_bytes_bitmask =
        one_or_two_bytes_bytemask.to_bitmask();

    if (one_or_two_bytes_bitmask == 0xffff) {
      write_v_u16_11bits_to_utf8(
          in, utf8_output, as_vector_u8(one_byte_bytemask), one_byte_bitmask);
      buf += 8;
      continue;
    }

    // Check for overflow in packing
    const auto saturation_bytemask = ((in0 | in1) & v_ffff0000) == v_00000000;
    const uint16_t saturation_bitmask = saturation_bytemask.to_bitmask();
    if (saturation_bitmask == 0xffff) {
      switch (er) {
      case ErrorReporting::precise: {
        const auto forbidden = (in & v_f800) == v_d800;
        if (forbidden.any()) {
          // We return no error code, instead we force the scalar procedure
          // to rescan the portion of input where we've just found an error.
          return utf32_to_utf8_t{error_code::SUCCESS, buf, utf8_output};
        }
      } break;
      case ErrorReporting::at_the_end:
        forbidden_bytemask |= (in & v_f800) == v_d800;
        break;
      case ErrorReporting::none:
        break;
      }

      ppc64_convert_utf16_to_1_2_3_bytes_of_utf8(
          in, one_byte_bitmask, one_or_two_bytes_bytemask,
          one_or_two_bytes_bitmask, utf8_output);
      buf += 8;
    } else {
      // case: at least one 32-bit word produces a surrogate pair in UTF-16
      // <=> will produce four UTF-8 bytes. Let us do a scalar fallback. It
      // may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint32_t word = buf[k];
        if ((word & 0xFFFFFF80) == 0) {
          *utf8_output++ = char(word);
        } else if ((word & 0xFFFFF800) == 0) {
          *utf8_output++ = char((word >> 6) | 0b11000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else if ((word & 0xFFFF0000) == 0) {
          if (er != ErrorReporting::none and
              (word >= 0xD800 && word <= 0xDFFF)) {
            return utf32_to_utf8_t{error_code::SURROGATE, buf + k, utf8_output};
          }
          *utf8_output++ = char((word >> 12) | 0b11100000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else {
          if (er != ErrorReporting::none and (word > 0x10FFFF)) {
            return utf32_to_utf8_t{error_code::TOO_LARGE, buf + k, utf8_output};
          }
          *utf8_output++ = char((word >> 18) | 0b11110000);
          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while

  if (er == ErrorReporting::at_the_end) {
    if (forbidden_bytemask.any()) {
      return utf32_to_utf8_t{error_code::SURROGATE, buf, utf8_output};
    }
  }

  return utf32_to_utf8_t{
      error_code::SUCCESS,
      buf,
      utf8_output,
  };
}
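
// Note the asymmetry in precise mode above: on finding a forbidden surrogate
// the function returns error_code::SUCCESS with `buf` left at the offending
// block, deliberately forcing the scalar wrapper to rescan those code points
// and report the exact error position.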
/* end file src/ppc64/ppc64_convert_utf32_to_utf8.cpp */
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
/* begin file src/ppc64/ppc64_utf8_length_from_latin1.cpp */
template <typename T> T min(T a, T b) { return a <= b ? a : b; }

std::pair<const char *, size_t> ppc64_utf8_length_from_latin1(const char *input,
                                                              size_t length) {
  constexpr size_t N = vector_u8::ELEMENTS;
  length = (length / N);

  size_t count = length * N;
  while (length != 0) {
    vector_u32 partial = vector_u32::zero();

    // partial accumulator has 32 bits => this yields (2^31 / 16)
    size_t chunk = min(length, size_t(0xffffffff / N));
    length -= chunk;
    while (chunk != 0) {
      auto local = vector_u8::zero();
      // local accumulator has 8 bits => this yields 255 max (we increment by
      // 1 in each iteration)
      const size_t n = min(chunk, size_t(255));
      chunk -= n;
      for (size_t i = 0; i < n; i++) {
        const auto in = vector_i8::load(input);
        input += N;

        local -= as_vector_u8(in < vector_i8::splat(0));
      }

      partial = sum4bytes(local, partial);
    }

    for (int i = 0; i < vector_u32::ELEMENTS; i++) {
      count += size_t(partial.value[i]);
    }
  }

  return std::make_pair(input, count);
}
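
// Counting identity used above: a Latin1 byte below 0x80 encodes to one UTF-8
// byte and any byte at or above 0x80 to two, so the answer is the input
// length plus the number of high-bit bytes. The signed comparison produces
// 0xff (i.e., -1) in each matching lane, which is why `local -= mask` acts as
// a per-lane increment.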
/* end file src/ppc64/ppc64_utf8_length_from_latin1.cpp */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_BASE64
/* begin file src/ppc64/ppc64_base64.cpp */
/*
 * References and further reading:
 *
 * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
 * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
 * https://arxiv.org/abs/1910.05109
 *
 * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
 * Instructions, ACM Transactions on the Web 12 (3), 2018.
 * https://arxiv.org/abs/1704.00605
 *
 * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
 * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task
 * Force, Request for Comments: 4648.
 *
 * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
 * http://www.alfredklomp.com/programming/sse-base64/. (2014).
 *
 * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
 * acceleration. https://github.com/aklomp/base64. (2014).
 *
 * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
 * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
 *
 * Nick Kopp. 2013. Base64 Encoding on a GPU.
 * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
 *
 * AMD XOP specific: http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html
 * AltiVec has the capabilities of AMD XOP (or vice versa): shuffle using 2
 * vectors and variable shifts, thus this implementation shares some code
 * solutions (modulo intrinsic function names).
 */

constexpr bool with_base64_std = false;
constexpr bool with_base64_url = true;
constexpr bool with_ignore_errors = true;
constexpr bool with_ignore_garbage = true;
constexpr bool with_strict_checking = false;

// --- encoding -----------------------------------------------

/*
    Procedure translates vector of bytes having 6-bit values
    into ASCII counterparts.
*/
template <bool base64_url>
vector_u8 encoding_translate_6bit_values(const vector_u8 input) {
  // credit: Wojciech Muła
  // reduce  0..51 -> 0
  //        52..61 -> 1 .. 10
  //            62 -> 11
  //            63 -> 12
  auto result = input.saturating_sub(vector_u8::splat(51));

  // distinguish between ranges 0..25 and 26..51:
  //         0 .. 25 -> becomes 13
  //        26 .. 51 -> remains 0
  const auto lt = input < vector_u8::splat(26);
  result = select(as_vector_u8(lt), vector_u8::splat(13), result);

  const auto shift_LUT =
      base64_url ? vector_u8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
                             '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
                             '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0)
                 : vector_u8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
                             '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
                             '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
  // read shift
  result = result.lookup_16(shift_LUT);

  return input + result;
}
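
// Worked example (illustrative, not from the upstream sources): for the
// standard alphabet the function maps
//    0 -> 0 after the subtraction, 13 after the select; + shift_LUT[13] ('A') => 'A'
//   26 -> 0 after the subtraction,  0 after the select; + ('a' - 26)          => 'a'
//   52 -> 1 after the subtraction and select;           + ('0' - 52)          => '0'
//   62 -> 11;                                           + ('+' - 62)          => '+'
//   63 -> 12;                                           + ('/' - 63)          => '/'
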
/*
    Procedure expands 12 bytes (4*3 bytes) into 16 bytes,
    each byte stores 6 bits of data
*/
template <typename = void>
simdutf_really_inline vector_u8 encoding_expand_6bit_fields(vector_u8 input) {
#if SIMDUTF_IS_BIG_ENDIAN
  #define indices4(dx) (dx + 0), (dx + 1), (dx + 1), (dx + 2)
  const auto expand_3_to_4 = vector_u8(indices4(0 * 3), indices4(1 * 3),
                                       indices4(2 * 3), indices4(3 * 3));
  #undef indices4

  // input = [........|ccdddddd|bbbbcccc|aaaaaabb] as uint8_t
  //              3        2        1        0
  //
  // in'   = [aaaaaabb|bbbbcccc|bbbbcccc|ccdddddd] as uint32_t
  //              0        1        1        2
  const auto in = as_vector_u32(expand_3_to_4.lookup_16(input));

  // t0 = [00000000|00000000|00000000|00dddddd]
  const auto t0 = in & uint32_t(0x0000003f);

  // t1 = [00000000|00000000|00cccccc|00dddddd]
  const auto t1 = select(uint32_t(0x00003f00), in.shl<2>(), t0);

  // t2 = [00000000|00bbbbbb|00cccccc|00dddddd]
  const auto t2 = select(uint32_t(0x003f0000), in.shr<4>(), t1);

  // t3 = [00aaaaaa|00bbbbbb|00cccccc|00dddddd]
  const auto t3 = select(uint32_t(0x3f000000), in.shr<2>(), t2);

  return as_vector_u8(t3);
#else
  #define indices4(dx) (dx + 1), (dx + 0), (dx + 2), (dx + 1)
  const auto expand_3_to_4 = vector_u8(indices4(0 * 3), indices4(1 * 3),
                                       indices4(2 * 3), indices4(3 * 3));
  #undef indices4

  // input = [........|ccdddddd|bbbbcccc|aaaaaabb] as uint8_t
  //              3        2        1        0
  //
  // in'   = [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc] as uint32_t
  //              1        2        0        1
  const auto in = as_vector_u32(expand_3_to_4.lookup_16(input));

  // t0 = [00dddddd|00000000|00000000|00000000]
  const auto t0 = in.shl<8>() & uint32_t(0x3f000000);

  // t1 = [00dddddd|00cccccc|00000000|00000000]
  const auto t1 = select(uint32_t(0x003f0000), in.shr<6>(), t0);

  // t2 = [00dddddd|00cccccc|00bbbbbb|00000000]
  const auto t2 = select(uint32_t(0x00003f00), in.shl<4>(), t1);

  // t3 = [00dddddd|00cccccc|00bbbbbb|00aaaaaa]
  const auto t3 = select(uint32_t(0x0000003f), in.shr<10>(), t2);

  return as_vector_u8(t3);
#endif // SIMDUTF_IS_BIG_ENDIAN
}

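// Worked example (illustrative): the three input bytes "Man" (0x4D 0x61 0x6E)
// hold the 24 bits 01001101 01100001 01101110, i.e. the 6-bit fields
// a=010011 b=010110 c=000101 d=101110 (19, 22, 5, 46). After expansion each
// field sits in its own byte, and encoding_translate_6bit_values turns
// 19, 22, 5, 46 into 'T', 'W', 'F', 'u' -- the classic RFC 4648 mapping of
// "Man" to "TWFu".
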
template <bool isbase64url>
size_t encode_base64(char *dst, const char *src, size_t srclen,
                     base64_options options) {

  const uint8_t *input = (const uint8_t *)src;

  uint8_t *out = (uint8_t *)dst;

  size_t i = 0;
  // We consume 48 input bytes per iteration, but the last of the four loads
  // reads 16 bytes starting at offset i + 36, hence the `i + 52 <= srclen`
  // bound: it leaves 4 bytes of slack so the loads never overrun the input.
  for (; i + 52 <= srclen; i += 48) {
    const auto in0 = vector_u8::load(input + i + 12 * 0);
    const auto in1 = vector_u8::load(input + i + 12 * 1);
    const auto in2 = vector_u8::load(input + i + 12 * 2);
    const auto in3 = vector_u8::load(input + i + 12 * 3);

    const auto expanded0 = encoding_expand_6bit_fields(in0);
    const auto expanded1 = encoding_expand_6bit_fields(in1);
    const auto expanded2 = encoding_expand_6bit_fields(in2);
    const auto expanded3 = encoding_expand_6bit_fields(in3);

    const auto base64_0 =
        encoding_translate_6bit_values<isbase64url>(expanded0);
    const auto base64_1 =
        encoding_translate_6bit_values<isbase64url>(expanded1);
    const auto base64_2 =
        encoding_translate_6bit_values<isbase64url>(expanded2);
    const auto base64_3 =
        encoding_translate_6bit_values<isbase64url>(expanded3);

    base64_0.store(out);
    out += 16;

    base64_1.store(out);
    out += 16;

    base64_2.store(out);
    out += 16;

    base64_3.store(out);
    out += 16;
  }
  // Same idea with a single vector: consume 12 bytes while loading 16.
  for (; i + 16 <= srclen; i += 12) {
    const auto in = vector_u8::load(input + i);
    const auto expanded = encoding_expand_6bit_fields(in);
    const auto base64 = encoding_translate_6bit_values<isbase64url>(expanded);

    base64.store(out);
    out += 16;
  }

  return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i,
                                                        srclen - i, options);
}

// --- decoding -----------------------------------------------

static simdutf_really_inline void compress(const vector_u8 data, uint16_t mask,
                                           char *output) {
  if (mask == 0) {
    data.store(output);
    return;
  }

  // this particular implementation was inspired by work done by @animetosho
  // we do it in two steps, first 8 bytes and then second 8 bytes
  uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
  uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
  // next line just loads the 64-bit values thintable_epi8[mask1] and
  // thintable_epi8[mask2] into a 128-bit register, using only
  // two instructions on most compilers.

#if SIMDUTF_IS_BIG_ENDIAN
  vec_u64_t tmp = {
      tables::base64::thintable_epi8[mask2],
      tables::base64::thintable_epi8[mask1],
  };

  auto shufmask = vector_u8(vec_reve(vec_u8_t(tmp)));

  // we increment by 0x08 the second half of the mask
  shufmask =
      shufmask + vector_u8(0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8);
#else
  vec_u64_t tmp = {
      tables::base64::thintable_epi8[mask1],
      tables::base64::thintable_epi8[mask2],
  };

  auto shufmask = vector_u8(vec_u8_t(tmp));

  // we increment by 0x08 the second half of the mask
  shufmask =
      shufmask + vector_u8(0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8);
#endif // SIMDUTF_IS_BIG_ENDIAN

  // this is the version "nearly pruned"
  const auto pruned = shufmask.lookup_16(data);
  // we still need to put the two halves together.
  // we compute the popcount of the first half:
  const int pop1 = tables::base64::BitsSetTable256mul2[mask1];
  // then load the corresponding mask; it writes only the first pop1 bytes
  // from the first 8 bytes, and then fills in with the bytes from the
  // second 8 bytes, plus some filling at the end.
  const auto compactmask =
      vector_u8::load(tables::base64::pshufb_combine_table + pop1 * 8);

  const auto answer = compactmask.lookup_16(pruned);

  answer.store(output);
}

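// A note on the mask convention (illustrative): a set bit in `mask` marks a
// byte of `data` to be removed, and the surviving bytes are packed to the
// front of `output` (the tail of the 16-byte store is filler). For instance,
// with mask = 0b0000000000100100, bytes 2 and 5 are dropped and the remaining
// 14 bytes are stored contiguously.
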
static simdutf_really_inline vector_u8 decoding_pack(vector_u8 input) {
#if SIMDUTF_IS_BIG_ENDIAN
  // in   = [00aaaaaa|00bbbbbb|00cccccc|00dddddd]
  // want = [00000000|aaaaaabb|bbbbcccc|ccdddddd]

  auto in = as_vector_u16(input);
  // t1 = [00??aaaa|aabbbbbb|00??cccc|ccdddddd]
  const auto t0 = in.shr<2>();
  const auto t1 = select(uint16_t(0x0fc0), t0, in);

  // t4 = [00??????|aaaaaabb|bbbbcccc|ccdddddd]
  const auto t2 = as_vector_u32(t1);
  const auto t3 = t2.shr<4>();
  const auto t4 = select(uint32_t(0x00fff000), t3, t2);

  const auto tmp = as_vector_u8(t4);

  const auto shuffle =
      vector_u8(1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15, 0, 0, 0, 0);

  const auto t = shuffle.lookup_16(tmp);

  return t;
#else
  // in   = [00dddddd|00cccccc|00bbbbbb|00aaaaaa]
  // want = [00000000|aaaaaabb|bbbbcccc|ccdddddd]

  auto u = as_vector_u32(input).swap_bytes();

  auto in = vector_u16((vec_u16_t)u.value);
  // t1 = [00??aaaa|aabbbbbb|00??cccc|ccdddddd]
  const auto t0 = in.shr<2>();
  const auto t1 = select(uint16_t(0x0fc0), t0, in);

  // t4 = [00??????|aaaaaabb|bbbbcccc|ccdddddd]
  const auto t2 = as_vector_u32(t1);
  const auto t3 = t2.shr<4>();
  const auto t4 = select(uint32_t(0x00fff000), t3, t2);

  const auto tmp = as_vector_u8(t4);

  const auto shuffle =
      vector_u8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 0, 0, 0, 0);

  const auto t = shuffle.lookup_16(tmp);

  return t;
#endif // SIMDUTF_IS_BIG_ENDIAN
}
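
// Worked example (illustrative): decoding the quad 'T','W','F','u' first maps
// (via to_base64_mask below) to the 6-bit values 19, 22, 5, 46, one per byte.
// The packing above then reassembles 010011|010110|000101|101110 into the
// three bytes 0x4D 0x61 0x6E, i.e. "Man" -- the inverse of the encoder's
// expansion step.
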
static simdutf_really_inline void base64_decode(char *out, vector_u8 input) {
  const auto expanded = decoding_pack(input);
  expanded.store(out);
}

static simdutf_really_inline void base64_decode_block(char *out,
                                                      const char *src) {
  base64_decode(out + 12 * 0, vector_u8::load(src + 0 * 16));
  base64_decode(out + 12 * 1, vector_u8::load(src + 1 * 16));
  base64_decode(out + 12 * 2, vector_u8::load(src + 2 * 16));
  base64_decode(out + 12 * 3, vector_u8::load(src + 3 * 16));
}

static simdutf_really_inline void base64_decode_block_safe(char *out,
                                                           const char *src) {
  base64_decode(out + 12 * 0, vector_u8::load(src + 0 * 16));
  base64_decode(out + 12 * 1, vector_u8::load(src + 1 * 16));
  base64_decode(out + 12 * 2, vector_u8::load(src + 2 * 16));

  // Each base64_decode stores 16 bytes although only 12 are meaningful; the
  // "safe" variant decodes the last chunk into a scratch buffer and copies
  // just the 12 valid bytes, so the write never goes past out + 48.
  char buffer[16];
  base64_decode(buffer, vector_u8::load(src + 3 * 16));
  std::memcpy(out + 36, buffer, 12);
}

// ---base64 decoding::block64 class --------------------------

class block64 {
  simd8x64<uint8_t> b;

public:
  simdutf_really_inline block64(const char *src) : b(load_block(src)) {}
  simdutf_really_inline block64(const char16_t *src) : b(load_block(src)) {}

private:
  // The caller of this function is responsible for ensuring that there are 64
  // bytes available for reading at src. The data is read into a block64
  // structure.
  static simdutf_really_inline simd8x64<uint8_t> load_block(const char *src) {
    const auto v0 = vector_u8::load(src + 16 * 0);
    const auto v1 = vector_u8::load(src + 16 * 1);
    const auto v2 = vector_u8::load(src + 16 * 2);
    const auto v3 = vector_u8::load(src + 16 * 3);

    return simd8x64<uint8_t>(v0, v1, v2, v3);
  }

  // The caller of this function is responsible for ensuring that there are 128
  // bytes available for reading at src. The data is read into a block64
  // structure.
  static simdutf_really_inline simd8x64<uint8_t>
  load_block(const char16_t *src) {
    const auto m1 = vector_u16::load(src + 8 * 0);
    const auto m2 = vector_u16::load(src + 8 * 1);
    const auto m3 = vector_u16::load(src + 8 * 2);
    const auto m4 = vector_u16::load(src + 8 * 3);
    const auto m5 = vector_u16::load(src + 8 * 4);
    const auto m6 = vector_u16::load(src + 8 * 5);
    const auto m7 = vector_u16::load(src + 8 * 6);
    const auto m8 = vector_u16::load(src + 8 * 7);

    return simd8x64<uint8_t>(vector_u16::pack(m1, m2), vector_u16::pack(m3, m4),
                             vector_u16::pack(m5, m6),
                             vector_u16::pack(m7, m8));
  }

public:
  template <bool base64_url, bool ignore_garbage>
  static inline uint16_t to_base64_mask(vector_u8 &src, uint16_t &error) {
    const auto ascii_space_tbl =
        vector_u8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa, 0x0,
                  0xc, 0xd, 0x0, 0x0);

    // credit: aqrit
    const auto delta_asso =
        base64_url ? vector_u8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0,
                               0x0, 0x0, 0x0, 0xF, 0x0, 0xF)
                   : vector_u8(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                               0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);

    const auto delta_values =
        base64_url ? vector_u8(0x0, 0x0, 0x0, 0x13, 0x4, 0xBF, 0xBF, 0xB9, 0xB9,
                               0x0, 0x11, 0xC3, 0xBF, 0xE0, 0xB9, 0xB9)
                   : vector_u8(0x00, 0x00, 0x00, 0x13, 0x04, 0xBF, 0xBF, 0xB9,
                               0xB9, 0x00, 0x10, 0xC3, 0xBF, 0xBF, 0xB9, 0xB9);

    const auto check_asso =
        base64_url ? vector_u8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
                               0x3, 0x7, 0xB, 0xE, 0xB, 0x6)
                   : vector_u8(0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                               0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);

    const auto check_values =
        base64_url ? vector_u8(0x80, 0x80, 0x80, 0x80, 0xCF, 0xBF, 0xB6, 0xA6,
                               0xB5, 0xA1, 0x0, 0x80, 0x0, 0x80, 0x0, 0x80)
                   : vector_u8(0x80, 0x80, 0x80, 0x80, 0xCF, 0xBF, 0xD5, 0xA6,
                               0xB5, 0x86, 0xD1, 0x80, 0xB1, 0x80, 0x91, 0x80);

    const auto shifted = src.shr<3>();

    const auto delta_hash = avg(src.lookup_16(delta_asso), shifted);
    const auto check_hash = avg(src.lookup_16(check_asso), shifted);

    const auto out = as_vector_i8(delta_hash.lookup_16(delta_values))
                         .saturating_add(as_vector_i8(src));
    const auto chk = as_vector_i8(check_hash.lookup_16(check_values))
                         .saturating_add(as_vector_i8(src));

    const uint16_t mask = chk.to_bitmask();
    if (!ignore_garbage && mask) {
      const auto ascii = src.lookup_16(ascii_space_tbl);
      const auto ascii_space = (ascii == src);
      error = (mask ^ ascii_space.to_bitmask());
    }
    src = out;

    return mask;
  }

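  // A sketch of the convention above, as used by the callers below (not an
  // authoritative description): `src` is overwritten with the decoded 6-bit
  // values, the returned mask has a bit set for every byte that is not part
  // of the base64 alphabet (whitespace included), and `error` flags the
  // masked bytes that are not ASCII whitespace. The two hash/lookup passes
  // validate and translate in a single pass over the data.
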
  template <bool base64_url, bool ignore_garbage>
  simdutf_really_inline uint64_t to_base64_mask(uint64_t *error) {
    uint16_t err0 = 0;
    uint16_t err1 = 0;
    uint16_t err2 = 0;
    uint16_t err3 = 0;
    uint64_t m0 = to_base64_mask<base64_url, ignore_garbage>(b.chunks[0], err0);
    uint64_t m1 = to_base64_mask<base64_url, ignore_garbage>(b.chunks[1], err1);
    uint64_t m2 = to_base64_mask<base64_url, ignore_garbage>(b.chunks[2], err2);
    uint64_t m3 = to_base64_mask<base64_url, ignore_garbage>(b.chunks[3], err3);

    if (!ignore_garbage) {
      *error = (err0) | ((uint64_t)err1 << 16) | ((uint64_t)err2 << 32) |
               ((uint64_t)err3 << 48);
    }
    return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48);
  }

  simdutf_really_inline void copy_block(char *output) {
    b.store(reinterpret_cast<uint8_t *>(output));
  }

  simdutf_really_inline uint64_t compress_block(uint64_t mask, char *output) {
    uint64_t nmask = ~mask;
    compress(b.chunks[0], uint16_t(mask), output);
    compress(b.chunks[1], uint16_t(mask >> 16),
             output + count_ones(nmask & 0xFFFF));
    compress(b.chunks[2], uint16_t(mask >> 32),
             output + count_ones(nmask & 0xFFFFFFFF));
    compress(b.chunks[3], uint16_t(mask >> 48),
             output + count_ones(nmask & 0xFFFFFFFFFFFFULL));
    return count_ones(nmask);
  }

  simdutf_really_inline void base64_decode_block(char *out) {
    base64_decode(out + 12 * 0, b.chunks[0]);
    base64_decode(out + 12 * 1, b.chunks[1]);
    base64_decode(out + 12 * 2, b.chunks[2]);
    base64_decode(out + 12 * 3, b.chunks[3]);
  }

  simdutf_really_inline void base64_decode_block_safe(char *out) {
    base64_decode(out + 12 * 0, b.chunks[0]);
    base64_decode(out + 12 * 1, b.chunks[1]);
    base64_decode(out + 12 * 2, b.chunks[2]);
    char buffer[16];
    base64_decode(buffer, b.chunks[3]);
    std::memcpy(out + 12 * 3, buffer, 12);
  }
};
/* end file src/ppc64/ppc64_base64.cpp */
#endif // SIMDUTF_FEATURE_BASE64

} // unnamed namespace
} // namespace ppc64
} // namespace simdutf

#if SIMDUTF_FEATURE_UTF8
/* begin file src/generic/buf_block_reader.h */
namespace simdutf {
namespace ppc64 {
namespace {

// Walks through a buffer in block-sized increments, loading the last part with
// spaces
template <size_t STEP_SIZE> struct buf_block_reader {
public:
  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
  simdutf_really_inline size_t block_index();
  simdutf_really_inline bool has_full_block() const;
  simdutf_really_inline const uint8_t *full_block() const;
  /**
   * Get the last block, padded with spaces.
   *
   * There will always be a last block, with at least 1 byte, unless len == 0
   * (in which case this function fills the buffer with spaces and returns 0).
   * In particular, if len == STEP_SIZE there will be 0 full_blocks and 1
   * remainder block with STEP_SIZE bytes and no spaces for padding.
   *
   * @return the number of effective characters in the last block.
   */
  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
  simdutf_really_inline void advance();

private:
  const uint8_t *buf;
  const size_t len;
  const size_t lenminusstep;
  size_t idx;
};

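// Typical usage, mirroring generic_validate_utf8 further below (a sketch;
// process64 is a hypothetical callback standing in for real per-block work):
//   buf_block_reader<64> reader(data, len);
//   while (reader.has_full_block()) {
//     process64(reader.full_block()); // 64 input bytes
//     reader.advance();
//   }
//   uint8_t block[64]{};
//   reader.get_remainder(block); // last partial block, space-padded
//   process64(block);
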
// Routines to print masks and text for debugging bitmask operations
simdutf_unused static char *format_input_text_64(const uint8_t *text) {
  static char *buf =
      reinterpret_cast<char *>(malloc(sizeof(simd8x64<uint8_t>) + 1));
  for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
    buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
  }
  buf[sizeof(simd8x64<uint8_t>)] = '\0';
  return buf;
}

// Routines to print masks and text for debugging bitmask operations
simdutf_unused static char *format_input_text(const simd8x64<uint8_t> &in) {
  static char *buf =
      reinterpret_cast<char *>(malloc(sizeof(simd8x64<uint8_t>) + 1));
  in.store(reinterpret_cast<uint8_t *>(buf));
  for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
    if (buf[i] < ' ') {
      buf[i] = '_';
    }
  }
  buf[sizeof(simd8x64<uint8_t>)] = '\0';
  return buf;
}

simdutf_unused static char *format_mask(uint64_t mask) {
  static char *buf = reinterpret_cast<char *>(malloc(64 + 1));
  for (size_t i = 0; i < 64; i++) {
    buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
  }
  buf[64] = '\0';
  return buf;
}

template <size_t STEP_SIZE>
simdutf_really_inline
buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len)
    : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE},
      idx{0} {}

template <size_t STEP_SIZE>
simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() {
  return idx;
}

template <size_t STEP_SIZE>
simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
  return idx < lenminusstep;
}

template <size_t STEP_SIZE>
simdutf_really_inline const uint8_t *
buf_block_reader<STEP_SIZE>::full_block() const {
  return &buf[idx];
}

template <size_t STEP_SIZE>
simdutf_really_inline size_t
buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
  if (len == idx) {
    return 0;
  } // memcpy(dst, null, 0) will trigger an error with some sanitizers
  std::memset(dst, 0x20,
              STEP_SIZE); // std::memset STEP_SIZE because it is more efficient
                          // to write out 8 or 16 bytes at once.
  std::memcpy(dst, buf + idx, len - idx);
  return len - idx;
}

template <size_t STEP_SIZE>
simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
  idx += STEP_SIZE;
}

} // unnamed namespace
} // namespace ppc64
} // namespace simdutf
/* end file src/generic/buf_block_reader.h */
/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
namespace simdutf {
namespace ppc64 {
namespace {
namespace utf8_validation {

using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              CARRY | TOO_LARGE,
              // ____0101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____011_ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,

              // ____1___ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____1101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
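
// Worked example (illustrative): the lone surrogate U+D800 would be encoded
// as 0xED 0xA0 0x80. For the pair (prev1 = 0xED, input = 0xA0),
// byte_1_high[0xE] = TOO_SHORT | OVERLONG_3 | SURROGATE, byte_1_low[0xD]
// includes SURROGATE, and byte_2_high[0xA] includes SURROGATE, so the
// three-way AND leaves the SURROGATE bit set and the sequence is rejected.
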
simdutf_really_inline simd8<uint8_t>
check_multibyte_lengths(const simd8<uint8_t> input,
                        const simd8<uint8_t> prev_input,
                        const simd8<uint8_t> sc) {
  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
  simd8<uint8_t> must23 =
      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
  return must23_80 ^ sc;
}

//
// Return nonzero if there are incomplete multibyte characters at the end of the
// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end.
//
simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
  // If the previous input's last 3 bytes match this, they're too short (they
  // ended at EOF):
  // ... 1111____ 111_____ 11______
  static const uint8_t max_array[32] = {
      255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255,
      0b11110000u - 1, 0b11100000u - 1, 0b11000000u - 1};
  const simd8<uint8_t> max_value(
      &max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]);
  return input.gt_bits(max_value);
}

struct utf8_checker {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;
  // The last input we received
  simd8<uint8_t> prev_input_block;
  // Whether the last input we received was incomplete (used for ASCII fast
  // path)
  simd8<uint8_t> prev_incomplete;

  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    simd8<uint8_t> sc = check_special_cases(input, prev1);
    this->error |= check_multibyte_lengths(input, prev_input, sc);
  }

  // The only problem that can happen at EOF is that a multibyte character is
  // too short or a byte value too large in the last bytes: check_special_cases
  // only checks for bytes too large in the first of two bytes.
  simdutf_really_inline void check_eof() {
    // If the previous block had incomplete UTF-8 characters at the end, an
    // ASCII block can't possibly finish them.
    this->error |= this->prev_incomplete;
  }

  simdutf_really_inline void check_next_input(const simd8x64<uint8_t> &input) {
    if (simdutf_likely(is_ascii(input))) {
      this->error |= this->prev_incomplete;
    } else {
      // you might think that a for-loop would work, but under Visual Studio, it
      // is not good enough.
      static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                        (simd8x64<uint8_t>::NUM_CHUNKS == 4),
                    "We support either two or four chunks per 64-byte block.");
      if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
      } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
        this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
      }
      this->prev_incomplete =
          is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]);
      this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1];
    }
  }

  // do not forget to call check_eof!
  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct utf8_checker
} // namespace utf8_validation

using utf8_validation::utf8_checker;

} // unnamed namespace
} // namespace ppc64
} // namespace simdutf
/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
/* begin file src/generic/utf8_validation/utf8_validator.h */
namespace simdutf {
namespace ppc64 {
namespace {
namespace utf8_validation {

/**
 * Validates that the string is actual UTF-8.
 */
template <class checker>
bool generic_validate_utf8(const uint8_t *input, size_t length) {
  checker c{};
  buf_block_reader<64> reader(input, length);
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    c.check_next_input(in);
    reader.advance();
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  c.check_next_input(in);
  reader.advance();
  c.check_eof();
  return !c.errors();
}

bool generic_validate_utf8(const char *input, size_t length) {
  return generic_validate_utf8<utf8_checker>(
      reinterpret_cast<const uint8_t *>(input), length);
}

/**
 * Validates that the string is actual UTF-8 and stops on errors.
 */
template <class checker>
result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) {
  checker c{};
  buf_block_reader<64> reader(input, length);
  size_t count{0};
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    c.check_next_input(in);
    if (c.errors()) {
      if (count != 0) {
        count--;
      } // Sometimes the error is only detected in the next chunk
      result res = scalar::utf8::rewind_and_validate_with_errors(
          reinterpret_cast<const char *>(input),
          reinterpret_cast<const char *>(input + count), length - count);
      res.count += count;
      return res;
    }
    reader.advance();
    count += 64;
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  c.check_next_input(in);
  reader.advance();
  c.check_eof();
  if (c.errors()) {
    if (count != 0) {
      count--;
    } // Sometimes the error is only detected in the next chunk
    result res = scalar::utf8::rewind_and_validate_with_errors(
        reinterpret_cast<const char *>(input),
        reinterpret_cast<const char *>(input) + count, length - count);
    res.count += count;
    return res;
  } else {
    return result(error_code::SUCCESS, length);
  }
}

result generic_validate_utf8_with_errors(const char *input, size_t length) {
  return generic_validate_utf8_with_errors<utf8_checker>(
      reinterpret_cast<const uint8_t *>(input), length);
}

} // namespace utf8_validation
} // unnamed namespace
} // namespace ppc64
} // namespace simdutf
/* end file src/generic/utf8_validation/utf8_validator.h */
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
namespace simdutf {
namespace ppc64 {
namespace {
namespace utf8_to_utf16 {
using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              CARRY | TOO_LARGE,
              // ____0101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____011_ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,

              // ____1___ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____1101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
simdutf_really_inline simd8<uint8_t>
check_multibyte_lengths(const simd8<uint8_t> input,
                        const simd8<uint8_t> prev_input,
                        const simd8<uint8_t> sc) {
  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
  simd8<uint8_t> must23 =
      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
  return must23_80 ^ sc;
}

struct validating_transcoder {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;

  validating_transcoder() : error(uint8_t(0)) {}
  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    simd8<uint8_t> sc = check_special_cases(input, prev1);
    this->error |= check_multibyte_lengths(input, prev_input, sc);
  }

  template <endianness endian>
  simdutf_really_inline size_t convert(const char *in, size_t size,
                                       char16_t *utf16_output) {
    size_t pos = 0;
    char16_t *start{utf16_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then margin - 1 is the eighth-last leading
    // byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf16<endian>(utf16_output);
        utf16_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (utf8_continuation_mask & 1) {
          return 0; // error
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 to 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf16<endian>(
              in + pos, utf8_end_of_code_point_mask, utf16_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      return 0;
    }
    if (pos < size) {
      size_t howmany = scalar::utf8_to_utf16::convert<endian>(
          in + pos, size - pos, utf16_output);
      if (howmany == 0) {
        return 0;
      }
      utf16_output += howmany;
    }
    return utf16_output - start;
  }

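  // Worked example of the mask logic above (illustrative): for the four bytes
  // 'a' 0xC3 0xA9 'b' ("aéb"), only 0xA9 is a continuation byte (0b10xxxxxx),
  // so within those four bit positions the continuation mask is 0b0100 and
  // the leading mask is 0b1011. Shifting the leading mask right by one marks
  // the *last* byte of each code point: bit 0 ('a') and bit 2 (0xA9, closing
  // the two-byte sequence for é) are set.
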
  template <endianness endian>
  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
                                                   char16_t *utf16_output) {
    size_t pos = 0;
    char16_t *start{utf16_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then margin - 1 is the eighth-last leading
    // byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf16<endian>(utf16_output);
        utf16_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (errors() || (utf8_continuation_mask & 1)) {
          // rewind_and_convert_with_errors will seek a potential error from
          // in+pos onward, with the ability to go back up to pos bytes, and
          // read size-pos bytes forward.
          result res =
              scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
                  pos, in + pos, size - pos, utf16_output);
          res.count += pos;
          return res;
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 to 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf16<endian>(
              in + pos, utf8_end_of_code_point_mask, utf16_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      // rewind_and_convert_with_errors will seek a potential error from in+pos
      // onward, with the ability to go back up to pos bytes, and read size-pos
      // bytes forward.
      result res =
          scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
              pos, in + pos, size - pos, utf16_output);
      res.count += pos;
      return res;
    }
    if (pos < size) {
      // rewind_and_convert_with_errors will seek a potential error from in+pos
      // onward, with the ability to go back up to pos bytes, and read size-pos
      // bytes forward.
      result res =
          scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
              pos, in + pos, size - pos, utf16_output);
      if (res.error) { // In case of error, we want the error position
        res.count += pos;
        return res;
      } else { // In case of success, we want the number of words written
        utf16_output += res.count;
      }
    }
    return result(error_code::SUCCESS, utf16_output - start);
  }

  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct validating_transcoder
} // namespace utf8_to_utf16
} // unnamed namespace
} // namespace ppc64
} // namespace simdutf
/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
namespace simdutf {
namespace ppc64 {
namespace {
namespace utf8_to_utf16 {

using namespace simd;

template <endianness endian>
simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
                                         char16_t *utf16_output) noexcept {
  // The implementation is not specific to haswell and should be moved to the
  // generic directory.
  size_t pos = 0;
  char16_t *start{utf16_output};
  const size_t safety_margin = 16; // to avoid overruns!
  while (pos + 64 + safety_margin <= size) {
    // this loop could be unrolled further. For example, we could process the
    // mask far more than 64 bytes.
    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
    if (in.is_ascii()) {
      in.store_ascii_as_utf16<endian>(utf16_output);
      utf16_output += 64;
      pos += 64;
    } else {
      // Slow path. We hope that the compiler will recognize that this is a slow
      // path. Anything that is not a continuation mask is a 'leading byte',
      // that is, the start of a new code point.
      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
      // -65 is 0b10111111 in two's complement, so it is the largest possible
      // continuation byte
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      // The *start* of code points is not so useful, rather, we want the *end*
      // of code points.
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
      // We process in blocks of up to 12 bytes except possibly
      // for fast paths which may process up to 16 bytes. For the
      // slow path to work, we should have at least 12 input bytes left.
      size_t max_starting_point = (pos + 64) - 12;
      // Next loop is going to run at least five times when using solely
      // the slow/regular path, and at least four times if there are fast paths.
      while (pos < max_starting_point) {
        // Performance note: our ability to compute 'consumed' and
        // then shift and recompute is critical. If there is a
        // latency of, say, 4 cycles on getting 'consumed', then
        // the inner loop might have a total latency of about 6 cycles.
        // Yet we process between 6 to 12 input bytes, thus we get
        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
        // for this section of the code. Hence, there is a limit
        // to how much we can further increase this latency before
        // it seriously harms performance.
        //
        // Thus we may allow convert_masked_utf8_to_utf16 to process
        // more bytes at a time under a fast-path mode where 16 bytes
        // are consumed at once (e.g., when encountering ASCII).
        size_t consumed = convert_masked_utf8_to_utf16<endian>(
            input + pos, utf8_end_of_code_point_mask, utf16_output);
        pos += consumed;
        utf8_end_of_code_point_mask >>= consumed;
      }
      // At this point there may remain between 0 and 12 bytes in the
      // 64-byte block. These bytes will be processed again. So we have an
      // 80% efficiency (in the worst case). In practice we expect an
      // 85% to 90% efficiency.
    }
  }
  utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(
      input + pos, size - pos, utf16_output);
  return utf16_output - start;
}

} // namespace utf8_to_utf16
} // unnamed namespace
} // namespace ppc64
} // namespace simdutf
/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
namespace simdutf {
namespace ppc64 {
namespace {
namespace utf8_to_utf32 {
using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              CARRY | TOO_LARGE,
              // ____0101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____011_ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,

              // ____1___ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____1101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
simdutf_really_inline simd8<uint8_t>
check_multibyte_lengths(const simd8<uint8_t> input,
                        const simd8<uint8_t> prev_input,
                        const simd8<uint8_t> sc) {
  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
  simd8<uint8_t> must23 =
      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
  return must23_80 ^ sc;
}

struct validating_transcoder {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;

  validating_transcoder() : error(uint8_t(0)) {}
  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    simd8<uint8_t> sc = check_special_cases(input, prev1);
    this->error |= check_multibyte_lengths(input, prev_input, sc);
  }

  simdutf_really_inline size_t convert(const char *in, size_t size,
                                       char32_t *utf32_output) {
    size_t pos = 0;
    char32_t *start{utf32_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then margin - 1 is the eighth-last leading
    // byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf32(utf32_output);
        utf32_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (utf8_continuation_mask & 1) {
          return 0; // we have an error
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 to 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf32(
              in + pos, utf8_end_of_code_point_mask, utf32_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      return 0;
    }
    if (pos < size) {
      size_t howmany =
          scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
      if (howmany == 0) {
        return 0;
      }
      utf32_output += howmany;
    }
    return utf32_output - start;
  }

  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
                                                   char32_t *utf32_output) {
    size_t pos = 0;
    char32_t *start{utf32_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then margin - 1 is the eighth-last leading
    // byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf32(utf32_output);
        utf32_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (errors() || (utf8_continuation_mask & 1)) {
          result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
              pos, in + pos, size - pos, utf32_output);
          res.count += pos;
          return res;
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
|
|
// slow path to work, we should have at least 12 input bytes left.
|
|
size_t max_starting_point = (pos + 64) - 12;
|
|
// Next loop is going to run at least five times.
|
|
while (pos < max_starting_point) {
|
|
// Performance note: our ability to compute 'consumed' and
|
|
// then shift and recompute is critical. If there is a
|
|
// latency of, say, 4 cycles on getting 'consumed', then
|
|
// the inner loop might have a total latency of about 6 cycles.
|
|
// Yet we process between 6 to 12 inputs bytes, thus we get
|
|
// a speed limit between 1 cycle/byte and 0.5 cycle/byte
|
|
// for this section of the code. Hence, there is a limit
|
|
// to how much we can further increase this latency before
|
|
// it seriously harms performance.
|
|
size_t consumed = convert_masked_utf8_to_utf32(
|
|
in + pos, utf8_end_of_code_point_mask, utf32_output);
|
|
pos += consumed;
|
|
utf8_end_of_code_point_mask >>= consumed;
|
|
}
|
|
// At this point there may remain between 0 and 12 bytes in the
|
|
// 64-byte block. These bytes will be processed again. So we have an
|
|
// 80% efficiency (in the worst case). In practice we expect an
|
|
// 85% to 90% efficiency.
|
|
}
|
|
}
|
|
if (errors()) {
|
|
result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
|
|
pos, in + pos, size - pos, utf32_output);
|
|
res.count += pos;
|
|
return res;
|
|
}
|
|
if (pos < size) {
|
|
result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
|
|
pos, in + pos, size - pos, utf32_output);
|
|
if (res.error) { // In case of error, we want the error position
|
|
res.count += pos;
|
|
return res;
|
|
} else { // In case of success, we want the number of word written
|
|
utf32_output += res.count;
|
|
}
|
|
}
|
|
return result(error_code::SUCCESS, utf32_output - start);
|
|
}
|
|
|
|
simdutf_really_inline bool errors() const {
|
|
return this->error.any_bits_set_anywhere();
|
|
}
|
|
|
|
}; // struct utf8_checker
|
|
} // namespace utf8_to_utf32
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
|
|
/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
namespace simdutf {
namespace ppc64 {
namespace {
namespace utf8_to_utf32 {

using namespace simd;

simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
                                         char32_t *utf32_output) noexcept {
  size_t pos = 0;
  char32_t *start{utf32_output};
  const size_t safety_margin = 16; // to avoid overruns!
  while (pos + 64 + safety_margin <= size) {
    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
    if (in.is_ascii()) {
      in.store_ascii_as_utf32(utf32_output);
      utf32_output += 64;
      pos += 64;
    } else {
      // -65 is 0b10111111 in two's complement, so it is the largest possible
      // continuation byte.
      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
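      // Illustrative note (not in the original source): a byte ends a code
      // point exactly when the next byte is a leading byte, hence the shift.
      // For the bytes 0xC3 0xA9 0x41 ('é' then 'A'), the leading mask is
      // 0b101, and shifting it right by one marks 0xA9, the final byte of
      // 'é'; whether 0x41 gets marked depends on the byte that follows it.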
      size_t max_starting_point = (pos + 64) - 12;
      while (pos < max_starting_point) {
        size_t consumed = convert_masked_utf8_to_utf32(
            input + pos, utf8_end_of_code_point_mask, utf32_output);
        pos += consumed;
        utf8_end_of_code_point_mask >>= consumed;
      }
    }
  }
  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos,
                                                       utf32_output);
  return utf32_output - start;
}

} // namespace utf8_to_utf32
} // unnamed namespace
} // namespace ppc64
} // namespace simdutf
/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */

#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8
/* begin file src/generic/utf8.h */
namespace simdutf {
namespace ppc64 {
namespace {
namespace utf8 {

using namespace simd;

simdutf_really_inline size_t count_code_points(const char *in, size_t size) {
  size_t pos = 0;
  size_t count = 0;
  for (; pos + 64 <= size; pos += 64) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    uint64_t utf8_non_continuation_mask = input.gt(-65);
    count += count_ones(utf8_non_continuation_mask);
  }
  return count + scalar::utf8::count_code_points(in + pos, size - pos);
}
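
// Illustrative example (not in the original source): "é" is encoded as
// 0xC3 0xA9; read as signed bytes these are -61 and -87, so only 0xC3
// satisfies `> -65` and the block contributes exactly one code point, since
// every code point has exactly one non-continuation byte.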

#ifdef SIMDUTF_SIMD_HAS_BYTEMASK
simdutf_really_inline size_t count_code_points_bytemask(const char *in,
                                                        size_t size) {
  using vector_i8 = simd8<int8_t>;
  using vector_u8 = simd8<uint8_t>;
  using vector_u64 = simd64<uint64_t>;

  constexpr size_t N = vector_i8::SIZE;
  constexpr size_t max_iterations = 255 / 4;

  size_t pos = 0;
  size_t count = 0;

  auto counters = vector_u64::zero();
  auto local = vector_u8::zero();
  size_t iterations = 0;
  for (; pos + 4 * N <= size; pos += 4 * N) {
    const auto input0 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 0 * N));
    const auto input1 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 1 * N));
    const auto input2 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 2 * N));
    const auto input3 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 3 * N));
    const auto mask0 = input0 > int8_t(-65);
    const auto mask1 = input1 > int8_t(-65);
    const auto mask2 = input2 > int8_t(-65);
    const auto mask3 = input3 > int8_t(-65);

    local -= vector_u8(mask0);
    local -= vector_u8(mask1);
    local -= vector_u8(mask2);
    local -= vector_u8(mask3);

    iterations += 1;
    if (iterations == max_iterations) {
      counters += sum_8bytes(local);
      local = vector_u8::zero();
      iterations = 0;
    }
  }

  if (iterations > 0) {
    count += local.sum_bytes();
  }

  count += counters.sum();

  return count + scalar::utf8::count_code_points(in + pos, size - pos);
}
#endif // SIMDUTF_SIMD_HAS_BYTEMASK

simdutf_really_inline size_t utf16_length_from_utf8(const char *in,
                                                    size_t size) {
  size_t pos = 0;
  size_t count = 0;
  // This algorithm could no doubt be improved!
  for (; pos + 64 <= size; pos += 64) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    uint64_t utf8_continuation_mask = input.lt(-65 + 1);
    // We count one word for anything that is not a continuation (so
    // leading bytes).
    count += 64 - count_ones(utf8_continuation_mask);
    int64_t utf8_4byte = input.gteq_unsigned(240);
    count += count_ones(utf8_4byte);
  }
  return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
}
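
// Illustrative example (not in the original source): U+1F600 is the four
// UTF-8 bytes 0xF0 0x9F 0x98 0x80. Its single leading byte contributes one
// code unit and the `0xF0 >= 240` test contributes one more, matching the
// two UTF-16 code units of its surrogate pair.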

} // namespace utf8
} // unnamed namespace
} // namespace ppc64
} // namespace simdutf
/* end file src/generic/utf8.h */
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF16
/* begin file src/generic/utf16.h */
namespace simdutf {
namespace ppc64 {
namespace {
namespace utf16 {

template <endianness big_endian>
simdutf_really_inline size_t count_code_points(const char16_t *in,
                                               size_t size) {
  size_t pos = 0;
  size_t count = 0;
  for (; pos < size / 32 * 32; pos += 32) {
    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
    if (!match_system(big_endian)) {
      input.swap_bytes();
    }
    uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
    count += count_ones(not_pair) / 2;
  }
  return count +
         scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
}
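
// Illustrative example (not in the original source): U+1F600 is stored as
// the pair 0xD83D 0xDE00; only 0xD83D lies outside 0xDC00..0xDFFF, so the
// pair is counted once. The division by two compensates for the two mask
// bits that every 16-bit code unit contributes.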

template <endianness big_endian>
simdutf_really_inline size_t utf8_length_from_utf16(const char16_t *in,
                                                    size_t size) {
  size_t pos = 0;
  size_t count = 0;
  // This algorithm could no doubt be improved!
  for (; pos < size / 32 * 32; pos += 32) {
    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
    if (!match_system(big_endian)) {
      input.swap_bytes();
    }
    uint64_t ascii_mask = input.lteq(0x7F);
    uint64_t twobyte_mask = input.lteq(0x7FF);
    uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);

    size_t ascii_count = count_ones(ascii_mask) / 2;
    size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2;
    size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2;
    size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
    count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count +
             ascii_count;
  }
  return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos,
                                                                   size - pos);
}
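
// Illustrative example (not in the original source): each surrogate half is
// charged two bytes, so a pair totals the four UTF-8 bytes it produces; for
// the sequence 'A', U+00E9, U+1F600 the estimate is 1 + 2 + (2 + 2) = 7
// UTF-8 bytes.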

template <endianness big_endian>
simdutf_really_inline size_t utf32_length_from_utf16(const char16_t *in,
                                                     size_t size) {
  return count_code_points<big_endian>(in, size);
}

simdutf_really_inline void
change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) {
  size_t pos = 0;

  while (pos < size / 32 * 32) {
    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
    input.swap_bytes();
    input.store(reinterpret_cast<uint16_t *>(output));
    pos += 32;
    output += 32;
  }

  scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
}

} // namespace utf16
} // unnamed namespace
} // namespace ppc64
} // namespace simdutf
/* end file src/generic/utf16.h */
/* begin file src/generic/validate_utf16.h */
namespace simdutf {
namespace ppc64 {
namespace {
namespace utf16 {
/*
    UTF-16 validation
    --------------------------------------------------

    In UTF-16, code units in the range 0xD800 to 0xDFFF have a special
    meaning.

    In a vectorized algorithm we want to examine the most significant
    nibble in order to select a fast path. If none of the highest nibbles
    is 0xD (13), then we are sure that the UTF-16 chunk in a vector
    register is valid.

    Let us analyze what we need to check if the nibble is 0xD. The
    value of the following nibble determines what we have:

    0xd000 .. 0xd7ff - a valid word
    0xd800 .. 0xdbff - low surrogate
    0xdc00 .. 0xdfff - high surrogate

    (Note: "low" and "high" follow the numeric order of the subranges here,
    which is the reverse of the usual Unicode naming, where 0xD800..0xDBFF
    are called high surrogates.)

    Other constraints we have to consider:
    - there must not be two consecutive low surrogates (0xd800 .. 0xdbff)
    - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff)
    - there must not be a sole low surrogate nor a sole high surrogate

    We are going to build three bitmasks based on the 3rd nibble:
    - V = valid word,
    - L = low surrogate (0xd800 .. 0xdbff)
    - H = high surrogate (0xdc00 .. 0xdfff)

      0   1   2   3   4   5   6   7    <--- word index
    [ V | L | H | L | H | V | V | L ]
      1   0   0   0   0   1   1   0   - V = valid masks
      0   1   0   1   0   0   0   1   - L = low surrogate
      0   0   1   0   1   0   0   0   - H = high surrogate

      1   0   0   0   0   1   1   0   V = valid masks
      0   1   0   1   0   0   0   0   a = L & (H >> 1)
      0   0   1   0   1   0   0   0   b = a << 1
      1   1   1   1   1   1   1   0   c = V | a | b
                                  ^
                                  the last bit can be zero; we just consume
                                  7 code units and recheck this word in the
                                  next iteration
*/
template <endianness big_endian>
const result validate_utf16_with_errors(const char16_t *input, size_t size) {
  if (simdutf_unlikely(size == 0)) {
    return result(error_code::SUCCESS, 0);
  }

  const char16_t *start = input;
  const char16_t *end = input + size;

  const auto v_d8 = simd8<uint8_t>::splat(0xd8);
  const auto v_f8 = simd8<uint8_t>::splat(0xf8);
  const auto v_fc = simd8<uint8_t>::splat(0xfc);
  const auto v_dc = simd8<uint8_t>::splat(0xdc);

  while (input + simd16<uint16_t>::SIZE * 2 < end) {
    // 0. Load data: since the validation takes into account only the higher
    // byte of each word, we compress the two vectors into one consisting
    // only of the higher bytes.
    auto in0 = simd16<uint16_t>(input);
    auto in1 =
        simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));

    // The function `utf16_gather_high_bytes` consumes two vectors of UTF-16
    // and yields a single vector having only the higher bytes of the
    // characters.
    const auto in = utf16_gather_high_bytes<big_endian>(in0, in1);

    // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
    const auto surrogates_wordmask = (in & v_f8) == v_d8;
    const uint16_t surrogates_bitmask =
        static_cast<uint16_t>(surrogates_wordmask.to_bitmask());
    if (surrogates_bitmask == 0x0000) {
      input += 16;
    } else {
      // 2. We have some surrogates that have to be distinguished:
      //    - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
      //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
      //
      // Fact: a high surrogate has the 11th bit set (3rd bit in the higher
      // byte).

      // V - non-surrogate code units
      //     V = not surrogates_wordmask
      const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);

      // H - word-mask for high surrogates: the six highest bits are 0b1101'11
      const auto vH = (in & v_fc) == v_dc;
      const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());

      // L - word mask for low surrogates
      //     L = not H and surrogates_wordmask
      const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);

      const uint16_t a = static_cast<uint16_t>(
          L & (H >> 1)); // A low surrogate must be followed by a high one.
                         // (A low surrogate placed in the last word of the
                         // register is an exception we handle.)
      const uint16_t b = static_cast<uint16_t>(
          a << 1); // Just mark that the opposite fact holds; thanks to that
                   // we need only two masks in the valid case.
      const uint16_t c = static_cast<uint16_t>(
          V | a | b); // Combine all the masks into the final one.

      if (c == 0xffff) {
        // The whole input register contains valid UTF-16, i.e.,
        // either single code units or proper surrogate pairs.
        input += 16;
      } else if (c == 0x7fff) {
        // The 15 lower code units of the input register contain valid UTF-16.
        // The last word may be either a low or a high surrogate. In the next
        // iteration we 1) check whether a low surrogate is followed by a
        // high one, 2) reject a sole high surrogate.
        input += 15;
      } else {
        return result(error_code::SURROGATE, input - start);
      }
    }
  }

  return result(error_code::SUCCESS, input - start);
}

} // namespace utf16
} // unnamed namespace
} // namespace ppc64
} // namespace simdutf
/* end file src/generic/validate_utf16.h */
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF32
/* begin file src/generic/utf32.h */
#include <limits>

namespace simdutf {
namespace ppc64 {
namespace {
namespace utf32 {

template <typename T> T min(T a, T b) { return a <= b ? a : b; }

simdutf_really_inline size_t utf8_length_from_utf32(const char32_t *input,
                                                    size_t length) {
  using vector_u32 = simd32<uint32_t>;

  const char32_t *start = input;

  // we add up to three ones in a single iteration (see the vectorized loop
  // in section #2 below)
  const size_t max_increment = 3;
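
  // Illustrative note (not in the original source): every element gets a
  // base of one byte via the `counter += consumed` step at the end, plus
  // one byte per threshold exceeded: U+0041 exceeds none (1 byte), U+03B1
  // exceeds 0x7F (2 bytes), U+4E2D exceeds 0x7FF (3 bytes), and U+1F600
  // exceeds 0xFFFF (4 bytes).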

  const size_t N = vector_u32::ELEMENTS;

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
  const auto v_0000007f = vector_u32::splat(0x0000007f);
  const auto v_000007ff = vector_u32::splat(0x000007ff);
  const auto v_0000ffff = vector_u32::splat(0x0000ffff);
#else
  const auto v_ffffff80 = vector_u32::splat(0xffffff80);
  const auto v_fffff800 = vector_u32::splat(0xfffff800);
  const auto v_ffff0000 = vector_u32::splat(0xffff0000);
  const auto one = vector_u32::splat(1);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

  size_t counter = 0;

  // 1. vectorized loop unrolled 4 times
  {
    // we use a vector of uint32 counters, this is why this limit is used
    const size_t max_iterations =
        std::numeric_limits<uint32_t>::max() / (max_increment * 4);
    size_t blocks = length / (N * 4);
    length -= blocks * (N * 4);
    while (blocks != 0) {
      const size_t iterations = min(blocks, max_iterations);
      blocks -= iterations;

      simd32<uint32_t> acc = vector_u32::zero();
      for (size_t i = 0; i < iterations; i++) {
        const auto in0 = vector_u32(input + 0 * N);
        const auto in1 = vector_u32(input + 1 * N);
        const auto in2 = vector_u32(input + 2 * N);
        const auto in3 = vector_u32(input + 3 * N);

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
        acc -= as_vector_u32(in0 > v_0000007f);
        acc -= as_vector_u32(in1 > v_0000007f);
        acc -= as_vector_u32(in2 > v_0000007f);
        acc -= as_vector_u32(in3 > v_0000007f);

        acc -= as_vector_u32(in0 > v_000007ff);
        acc -= as_vector_u32(in1 > v_000007ff);
        acc -= as_vector_u32(in2 > v_000007ff);
        acc -= as_vector_u32(in3 > v_000007ff);

        acc -= as_vector_u32(in0 > v_0000ffff);
        acc -= as_vector_u32(in1 > v_0000ffff);
        acc -= as_vector_u32(in2 > v_0000ffff);
        acc -= as_vector_u32(in3 > v_0000ffff);
#else
        acc += min(one, in0 & v_ffffff80);
        acc += min(one, in1 & v_ffffff80);
        acc += min(one, in2 & v_ffffff80);
        acc += min(one, in3 & v_ffffff80);

        acc += min(one, in0 & v_fffff800);
        acc += min(one, in1 & v_fffff800);
        acc += min(one, in2 & v_fffff800);
        acc += min(one, in3 & v_fffff800);

        acc += min(one, in0 & v_ffff0000);
        acc += min(one, in1 & v_ffff0000);
        acc += min(one, in2 & v_ffff0000);
        acc += min(one, in3 & v_ffff0000);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

        input += 4 * N;
      }

      counter += acc.sum();
    }
  }

  // 2. vectorized loop for the tail
  {
    const size_t max_iterations =
        std::numeric_limits<uint32_t>::max() / max_increment;
    size_t blocks = length / N;
    length -= blocks * N;
    while (blocks != 0) {
      const size_t iterations = min(blocks, max_iterations);
      blocks -= iterations;

      auto acc = vector_u32::zero();
      for (size_t i = 0; i < iterations; i++) {
        const auto in = vector_u32(input);

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
        acc -= as_vector_u32(in > v_0000007f);
        acc -= as_vector_u32(in > v_000007ff);
        acc -= as_vector_u32(in > v_0000ffff);
#else
        acc += min(one, in & v_ffffff80);
        acc += min(one, in & v_fffff800);
        acc += min(one, in & v_ffff0000);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

        input += N;
      }

      counter += acc.sum();
    }
  }

  const size_t consumed = input - start;
  if (consumed != 0) {
    // The vectorized loops above do not count the first byte of each
    // element; this is why we add one per consumed element at the end.
    counter += consumed;
  }

  return counter + scalar::utf32::utf8_length_from_utf32(input, length);
}

} // namespace utf32
} // unnamed namespace
} // namespace ppc64
} // namespace simdutf
/* end file src/generic/utf32.h */
/* begin file src/generic/validate_utf32.h */
namespace simdutf {
namespace ppc64 {
namespace {
namespace utf32 {

simdutf_really_inline bool validate(const char32_t *input, size_t size) {
  if (simdutf_unlikely(size == 0)) {
    // Empty input is valid UTF-32; this also protects the implementation
    // from handling a nullptr.
    return true;
  }

  const char32_t *end = input + size;

  using vector_u32 = simd32<uint32_t>;

  const auto standardmax = vector_u32::splat(0x10ffff);
  const auto offset = vector_u32::splat(0xffff2000);
  const auto standardoffsetmax = vector_u32::splat(0xfffff7ff);
  auto currentmax = vector_u32::zero();
  auto currentoffsetmax = vector_u32::zero();

  constexpr size_t N = vector_u32::ELEMENTS;

  while (input + N < end) {
    auto in = vector_u32(input);
    if (!match_system(endianness::BIG)) {
      in.swap_bytes();
    }

    currentmax = max(currentmax, in);
    currentoffsetmax = max(currentoffsetmax, in + offset);
    input += N;
  }

  const auto too_large = currentmax > standardmax;
  if (too_large.any()) {
    return false;
  }

  const auto surrogate = currentoffsetmax > standardoffsetmax;
  if (surrogate.any()) {
    return false;
  }

  return scalar::utf32::validate(input, end - input);
}
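
// Illustrative note (not in the original source): the 0xffff2000 offset
// shifts the surrogate range just past 0xfffff7ff. For 0xD800 we get
// 0xffff2000 + 0xd800 = 0xfffff800 > 0xfffff7ff (rejected); 0xD7FF yields
// exactly 0xfffff7ff (accepted) and 0xE000 wraps around to zero (accepted).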

simdutf_really_inline result validate_with_errors(const char32_t *input,
                                                  size_t size) {
  if (simdutf_unlikely(size == 0)) {
    // Empty input is valid UTF-32; this also protects the implementation
    // from handling a nullptr.
    return result(error_code::SUCCESS, 0);
  }

  const char32_t *start = input;
  const char32_t *end = input + size;

  using vector_u32 = simd32<uint32_t>;

  const auto standardmax = vector_u32::splat(0x10ffff + 1);
  const auto surrogate_mask = vector_u32::splat(0xfffff800);
  const auto surrogate_byte = vector_u32::splat(0x0000d800);

  constexpr size_t N = vector_u32::ELEMENTS;

  while (input + N < end) {
    auto in = vector_u32(input);
    if (!match_system(endianness::BIG)) {
      in.swap_bytes();
    }

    const auto too_large = in >= standardmax;
    const auto surrogate = (in & surrogate_mask) == surrogate_byte;

    const auto combined = too_large | surrogate;
    if (simdutf_unlikely(combined.any())) {
      const size_t consumed = input - start;
      auto sr = scalar::utf32::validate_with_errors(input, end - input);
      sr.count += consumed;

      return sr;
    }

    input += N;
  }

  const size_t consumed = input - start;
  auto sr = scalar::utf32::validate_with_errors(input, end - input);
  sr.count += consumed;

  return sr;
}

} // namespace utf32
} // unnamed namespace
} // namespace ppc64
} // namespace simdutf
/* end file src/generic/validate_utf32.h */
#endif // SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_ASCII
/* begin file src/generic/ascii_validation.h */
namespace simdutf {
namespace ppc64 {
namespace {
namespace ascii_validation {

bool generic_validate_ascii(const char *input, size_t length) {
  buf_block_reader<64> reader(reinterpret_cast<const uint8_t *>(input), length);
  uint8_t blocks[64]{};
  simd::simd8x64<uint8_t> running_or(blocks);
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    running_or |= in;
    reader.advance();
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  running_or |= in;
  return running_or.is_ascii();
}
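
// Illustrative note (not in the original source): ORing all blocks together
// preserves the most significant bit of every byte seen, and a byte is
// non-ASCII exactly when that bit is set, so a single is_ascii() check of
// the accumulator validates the whole input.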

result generic_validate_ascii_with_errors(const char *input, size_t length) {
  buf_block_reader<64> reader(reinterpret_cast<const uint8_t *>(input), length);
  size_t count{0};
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    if (!in.is_ascii()) {
      result res = scalar::ascii::validate_with_errors(
          reinterpret_cast<const char *>(input + count), length - count);
      return result(res.error, count + res.count);
    }
    reader.advance();

    count += 64;
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  if (!in.is_ascii()) {
    result res = scalar::ascii::validate_with_errors(
        reinterpret_cast<const char *>(input + count), length - count);
    return result(res.error, count + res.count);
  } else {
    return result(error_code::SUCCESS, length);
  }
}

} // namespace ascii_validation
} // unnamed namespace
} // namespace ppc64
} // namespace simdutf
/* end file src/generic/ascii_validation.h */
#endif // SIMDUTF_FEATURE_ASCII

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */
namespace simdutf {
namespace ppc64 {
namespace {
namespace utf8_to_latin1 {
using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // For UTF-8 to Latin 1, we can allow any ASCII character, and any
  // continuation byte, but the non-ASCII leading bytes must be 0b11000010 or
  // 0b11000011 and nothing else.
  //
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 3 = Too Large
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 6 = Too Large (1000) / Overlong 4-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
  constexpr const uint8_t FORBIDDEN = 0xff;

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      FORBIDDEN,
      // 1110____ ________ <three byte lead in byte 1>
      FORBIDDEN,
      // 1111____ ________ <four+ byte lead in byte 1>
      FORBIDDEN);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1.
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              FORBIDDEN,
              // ____0101 ________
              FORBIDDEN,
              // ____011_ ________
              FORBIDDEN, FORBIDDEN,

              // ____1___ ________
              FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN,
              // ____1101 ________
              FORBIDDEN, FORBIDDEN, FORBIDDEN);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
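
// Illustrative example (not in the original source): for the valid pair
// 0xC3 0xA9 ('é'), byte_1_high = TOO_SHORT | OVERLONG_2, byte_1_low = CARRY
// and byte_2_high = TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE |
// TOO_LARGE share no common bit, so the AND is zero (no error). For 0xC4
// 0x80 (U+0100, outside Latin 1), byte_1_low is FORBIDDEN (0xff) and the
// OVERLONG_2 bit survives the AND, so the error mask is nonzero.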

struct validating_transcoder {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;

  validating_transcoder() : error(uint8_t(0)) {}
  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2-, 3- and 4-byte leads become large positive numbers
    // instead of small negative numbers).
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    this->error |= check_special_cases(input, prev1);
  }

  simdutf_really_inline size_t convert(const char *in, size_t size,
                                       char *latin1_output) {
    size_t pos = 0;
    char *start{latin1_output};
    // In the worst case, we have the haswell kernel which can cause an
    // overflow of 8 bytes when calling convert_masked_utf8_to_latin1. If you
    // skip the last 16 bytes, and if the data is valid, then it is entirely
    // safe because 16 UTF-8 bytes generate much more than 8 bytes. However,
    // you cannot generally assume that you have valid UTF-8 input, so we are
    // going to go back from the end counting 16 leading bytes, to give us a
    // good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 16; margin--) {
      leading_byte += (int8_t(in[margin - 1]) >
                       -65); // two's complement of -65 is 0b10111111
    }
    // If the input is long enough, then margin - 1 is the sixteenth-last
    // leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store((int8_t *)latin1_output);
        latin1_output += 64;
        pos += 64;
      } else {
        // You might think that a for-loop would work, but under Visual
        // Studio, it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask =
            input.lt(-65 + 1); // -64 is 0b11000000 in two's complement.
                               // Note: in this case, we also have ASCII to
                               // account for.
        if (utf8_continuation_mask & 1) {
          return 0; // error
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // The next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 and 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_latin1(
              in + pos, utf8_end_of_code_point_mask, latin1_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      return 0;
    }
    if (pos < size) {
      size_t howmany =
          scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output);
      if (howmany == 0) {
        return 0;
      }
      latin1_output += howmany;
    }
    return latin1_output - start;
  }

  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
                                                   char *latin1_output) {
    size_t pos = 0;
    char *start{latin1_output};
    // In the worst case, we have the haswell kernel which can cause an
    // overflow of 8 bytes when calling convert_masked_utf8_to_latin1. If you
    // skip the last 16 bytes, and if the data is valid, then it is entirely
    // safe because 16 UTF-8 bytes generate much more than 8 bytes. However,
    // you cannot generally assume that you have valid UTF-8 input, so we are
    // going to go back from the end counting 8 leading bytes, to give us a
    // good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then margin - 1 is the eighth-last
    // leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store((int8_t *)latin1_output);
        latin1_output += 64;
        pos += 64;
      } else {
        // You might think that a for-loop would work, but under Visual
        // Studio, it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        if (errors()) {
          // rewind_and_convert_with_errors will seek a potential error from
          // in + pos onward, with the ability to go back up to pos bytes,
          // and to read size - pos bytes forward.
          result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
              pos, in + pos, size - pos, latin1_output);
          res.count += pos;
          return res;
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // The next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 and 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_latin1(
              in + pos, utf8_end_of_code_point_mask, latin1_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      // rewind_and_convert_with_errors will seek a potential error from
      // in + pos onward, with the ability to go back up to pos bytes, and
      // to read size - pos bytes forward.
      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, latin1_output);
      res.count += pos;
      return res;
    }
    if (pos < size) {
      // rewind_and_convert_with_errors will seek a potential error from
      // in + pos onward, with the ability to go back up to pos bytes, and
      // to read size - pos bytes forward.
      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, latin1_output);
      if (res.error) { // In case of error, we want the error position.
        res.count += pos;
        return res;
      } else { // In case of success, we want the number of words written.
        latin1_output += res.count;
      }
    }
    return result(error_code::SUCCESS, latin1_output - start);
  }

  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct validating_transcoder
} // namespace utf8_to_latin1
} // unnamed namespace
} // namespace ppc64
} // namespace simdutf
/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */
/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
namespace simdutf {
namespace ppc64 {
namespace {
namespace utf8_to_latin1 {
using namespace simd;

simdutf_really_inline size_t convert_valid(const char *in, size_t size,
                                           char *latin1_output) {
  size_t pos = 0;
  char *start{latin1_output};
  // In the worst case, we have the haswell kernel which can cause an overflow
  // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
  // last 16 bytes, and if the data is valid, then it is entirely safe because
  // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
  // generally assume that you have valid UTF-8 input, so we are going to go
  // back from the end counting 8 leading bytes, to give us a good margin.
  size_t leading_byte = 0;
  size_t margin = size;
  for (; margin > 0 && leading_byte < 8; margin--) {
    leading_byte += (int8_t(in[margin - 1]) >
                     -65); // two's complement of -65 is 0b10111111
  }
  // If the input is long enough, then margin - 1 is the eighth-last leading
  // byte.
  const size_t safety_margin = size - margin + 1; // to avoid overruns!
  while (pos + 64 + safety_margin <= size) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    if (input.is_ascii()) {
      input.store((int8_t *)latin1_output);
      latin1_output += 64;
      pos += 64;
    } else {
      // You might think that a for-loop would work, but under Visual Studio,
      // it is not good enough.
      uint64_t utf8_continuation_mask =
          input.lt(-65 + 1); // -64 is 0b11000000 in two's complement. Note:
                             // in this case, we also have ASCII to account
                             // for.
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
      // We process in blocks of up to 12 bytes except possibly
      // for fast paths which may process up to 16 bytes. For the
      // slow path to work, we should have at least 12 input bytes left.
      size_t max_starting_point = (pos + 64) - 12;
      // The next loop is going to run at least five times.
      while (pos < max_starting_point) {
        // Performance note: our ability to compute 'consumed' and
        // then shift and recompute is critical. If there is a
        // latency of, say, 4 cycles on getting 'consumed', then
        // the inner loop might have a total latency of about 6 cycles.
        // Yet we process between 6 and 12 input bytes, thus we get
        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
        // for this section of the code. Hence, there is a limit
        // to how much we can further increase this latency before
        // it seriously harms performance.
        size_t consumed = convert_masked_utf8_to_latin1(
            in + pos, utf8_end_of_code_point_mask, latin1_output);
        pos += consumed;
        utf8_end_of_code_point_mask >>= consumed;
      }
      // At this point there may remain between 0 and 12 bytes in the
      // 64-byte block. These bytes will be processed again. So we have an
      // 80% efficiency (in the worst case). In practice we expect an
      // 85% to 90% efficiency.
    }
  }
  if (pos < size) {
    size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos,
                                                           latin1_output);
    latin1_output += howmany;
  }
  return latin1_output - start;
}

} // namespace utf8_to_latin1
} // unnamed namespace
} // namespace ppc64
} // namespace simdutf
/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_BASE64
/* begin file src/generic/base64.h */
/**
 * References and further reading:
 *
 * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
 * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
 * https://arxiv.org/abs/1910.05109
 *
 * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
 * Instructions, ACM Transactions on the Web 12 (3), 2018.
 * https://arxiv.org/abs/1704.00605
 *
 * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
 * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task
 * Force, Request for Comments: 4648.
 *
 * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
 * http://www.alfredklomp.com/programming/sse-base64/. (2014).
 *
 * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
 * acceleration. https://github.com/aklomp/base64. (2014).
 *
 * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
 * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
 *
 * Nick Kopp. 2013. Base64 Encoding on a GPU.
 * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
 */
namespace simdutf {
namespace ppc64 {
namespace {
namespace base64 {

/*
    The following template function implements the API for Base64 decoding.

    An implementation is responsible for providing the `block64` type and
    the associated methods that perform the actual conversion. Please refer
    to any vectorized implementation to learn the API of these procedures.
*/
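
/*
    For orientation only (this sketch is not part of the original source,
    and the exact signatures may vary between kernels): the decoder below
    relies on a `block64` interface shaped roughly like

      struct block64 {
        block64(const char *src); // load 64 input characters
        template <bool base64_url, bool ignore_garbage>
        uint64_t to_base64_mask(uint64_t *error); // map chars to 6-bit
                                                  // values, flag bad input
        size_t compress_block(uint64_t mask, char *out); // drop flagged bytes
        void copy_block(char *out);
        void base64_decode_block(char *dst);      // 64 chars -> 48 bytes
        void base64_decode_block_safe(char *dst); // bounds-careful variant
      };

    as can be inferred from the calls made on `b` in the loop below.
*/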
template <bool base64_url, bool ignore_garbage, typename chartype>
full_result
compress_decode_base64(char *dst, const chartype *src, size_t srclen,
                       base64_options options,
                       last_chunk_handling_options last_chunk_options) {
  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
                                        : tables::base64::to_base64_value;
  size_t equallocation =
      srclen; // location of the first padding character if any
  // skip trailing spaces
  while (!ignore_garbage && srclen > 0 &&
         scalar::base64::is_eight_byte(src[srclen - 1]) &&
         to_base64[uint8_t(src[srclen - 1])] == 64) {
    srclen--;
  }
  size_t equalsigns = 0;
  if (!ignore_garbage && srclen > 0 && src[srclen - 1] == '=') {
    equallocation = srclen - 1;
    srclen--;
    equalsigns = 1;
    // skip trailing spaces
    while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
           to_base64[uint8_t(src[srclen - 1])] == 64) {
      srclen--;
    }
    if (srclen > 0 && src[srclen - 1] == '=') {
      equallocation = srclen - 1;
      srclen--;
      equalsigns = 2;
    }
  }
  if (srclen == 0) {
    if (!ignore_garbage && equalsigns > 0) {
      if (last_chunk_options == last_chunk_handling_options::strict) {
        return {BASE64_INPUT_REMAINDER, 0, 0};
      } else if (last_chunk_options ==
                 last_chunk_handling_options::stop_before_partial) {
        return {SUCCESS, 0, 0};
      }
      return {INVALID_BASE64_CHARACTER, equallocation, 0};
    }
    return {SUCCESS, 0, 0};
  }
  char *end_of_safe_64byte_zone =
      (srclen + 3) / 4 * 3 >= 63 ? dst + (srclen + 3) / 4 * 3 - 63 : dst;

  const chartype *const srcinit = src;
  const char *const dstinit = dst;
  const chartype *const srcend = src + srclen;

  constexpr size_t block_size = 6;
  static_assert(block_size >= 2, "block_size must be at least two");
  char buffer[block_size * 64];
  char *bufferptr = buffer;
  if (srclen >= 64) {
    const chartype *const srcend64 = src + srclen - 64;
    while (src <= srcend64) {
      block64 b(src);
      src += 64;
      uint64_t error = 0;
      const uint64_t badcharmask =
          b.to_base64_mask<base64_url, ignore_garbage>(&error);
      if (!ignore_garbage && error) {
        src -= 64;
        const size_t error_offset = trailing_zeroes(error);
        return {error_code::INVALID_BASE64_CHARACTER,
                size_t(src - srcinit + error_offset), size_t(dst - dstinit)};
      }
      if (badcharmask != 0) {
        bufferptr += b.compress_block(badcharmask, bufferptr);
      } else if (bufferptr != buffer) {
        b.copy_block(bufferptr);
        bufferptr += 64;
      } else {
        if (dst >= end_of_safe_64byte_zone) {
          b.base64_decode_block_safe(dst);
        } else {
          b.base64_decode_block(dst);
        }
        dst += 48;
      }
      if (bufferptr >= (block_size - 1) * 64 + buffer) {
        for (size_t i = 0; i < (block_size - 2); i++) {
          base64_decode_block(dst, buffer + i * 64);
          dst += 48;
        }
        if (dst >= end_of_safe_64byte_zone) {
          base64_decode_block_safe(dst, buffer + (block_size - 2) * 64);
        } else {
          base64_decode_block(dst, buffer + (block_size - 2) * 64);
        }
        dst += 48;
        std::memcpy(buffer, buffer + (block_size - 1) * 64,
                    64); // 64 might be too much
        bufferptr -= (block_size - 1) * 64;
      }
    }
  }

  char *buffer_start = buffer;
  // Optimization note: if the buffer is almost full, then it is worth our
  // time to fill it completely; otherwise, we should just decode directly.
  int last_block = (int)((bufferptr - buffer_start) % 64);
  if (last_block != 0 && srcend - src + last_block >= 64) {

    while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
      uint8_t val = to_base64[uint8_t(*src)];
      *bufferptr = char(val);
      if (!ignore_garbage &&
          (!scalar::base64::is_eight_byte(*src) || val > 64)) {
        return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
                size_t(dst - dstinit)};
      }
      bufferptr += (val <= 63);
      src++;
    }
  }

  for (; buffer_start + 64 <= bufferptr; buffer_start += 64) {
    if (dst >= end_of_safe_64byte_zone) {
      base64_decode_block_safe(dst, buffer_start);
    } else {
      base64_decode_block(dst, buffer_start);
    }
    dst += 48;
  }
  if ((bufferptr - buffer_start) % 64 != 0) {
    while (buffer_start + 4 < bufferptr) {
      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
                        << 8;
#if !SIMDUTF_IS_BIG_ENDIAN
      triple = scalar::u32_swap_bytes(triple);
#endif
      std::memcpy(dst, &triple, 3);

      dst += 3;
      buffer_start += 4;
    }
    if (buffer_start + 4 <= bufferptr) {
      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
                        << 8;
#if !SIMDUTF_IS_BIG_ENDIAN
      triple = scalar::u32_swap_bytes(triple);
#endif
      std::memcpy(dst, &triple, 3);

      dst += 3;
      buffer_start += 4;
    }
    // We may have 1, 2 or 3 bytes left; we need to decode them, so let us
    // backtrack.
    int leftover = int(bufferptr - buffer_start);
    while (leftover > 0) {
      if (!ignore_garbage) {
        while (to_base64[uint8_t(*(src - 1))] == 64) {
          src--;
        }
      } else {
        while (to_base64[uint8_t(*(src - 1))] >= 64) {
          src--;
        }
      }
      src--;
      leftover--;
    }
  }
  if (src < srcend + equalsigns) {
    full_result r = scalar::base64::base64_tail_decode(
        dst, src, srcend - src, equalsigns, options, last_chunk_options);
    r.input_count += size_t(src - srcinit);
    if (r.error == error_code::INVALID_BASE64_CHARACTER ||
        r.error == error_code::BASE64_EXTRA_BITS) {
      return r;
    } else {
      r.output_count += size_t(dst - dstinit);
    }
    if (!ignore_garbage && last_chunk_options != stop_before_partial &&
        r.error == error_code::SUCCESS && equalsigns > 0) {
      // additional checks
      if ((r.output_count % 3 == 0) ||
          ((r.output_count % 3) + 1 + equalsigns != 4)) {
        r.error = error_code::INVALID_BASE64_CHARACTER;
        r.input_count = equallocation;
      }
    }
    return r;
  }
  if (!ignore_garbage && equalsigns > 0) {
    if ((size_t(dst - dstinit) % 3 == 0) ||
        ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
      return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)};
    }
  }
  return {SUCCESS, srclen, size_t(dst - dstinit)};
}

} // namespace base64
} // unnamed namespace
} // namespace ppc64
} // namespace simdutf
/* end file src/generic/base64.h */
#endif // SIMDUTF_FEATURE_BASE64

/* begin file src/ppc64/templates.cpp */
/*
    Template `convert_impl` implements a generic conversion routine between
    different encodings. The procedure returns the number of written
    elements, or zero in the case of error.

    Parameters:
    * VectorizedConvert - vectorized procedure that returns a structure
      having three fields: error_code (err), const Source* (input),
      Destination* (output)
    * ScalarConvert - scalar procedure that carries on the conversion of the
      tail
    * Source - type of input char (like char16_t, char)
    * Destination - type of output char
*/
template <typename VectorizedConvert, typename ScalarConvert, typename Source,
          typename Destination>
size_t convert_impl(VectorizedConvert vectorized_convert,
                    ScalarConvert scalar_convert, const Source *buf, size_t len,
                    Destination *output) {
  const auto vr = vectorized_convert(buf, len, output);
  const size_t consumed = vr.input - buf;
  const size_t written = vr.output - output;
  if (vr.err != simdutf::error_code::SUCCESS) {
    if (vr.err == simdutf::error_code::OTHER) {
      // The vectorized procedure detected an error, but does not know the
      // exact position. The scalar procedure rescans the portion of the
      // input and figures out where the error is located.
      return scalar_convert(vr.input, len - consumed, vr.output);
    }
    return 0;
  }

  if (consumed == len) {
    return written;
  }

  const auto ret = scalar_convert(vr.input, len - consumed, vr.output);
  if (ret == 0) {
    return 0;
  }

  return written + ret;
}
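
// For orientation only (not part of the original source; the kernel and
// scalar names below are placeholders): a caller pairs a vectorized kernel
// with its scalar fallback along these lines:
//
//   size_t written = convert_impl(vectorized_utf16_to_utf8,
//                                 scalar::utf16_to_utf8::convert,
//                                 buf, len, output);
//   if (written == 0) { /* conversion error */ }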

/*
    Template `convert_with_errors_impl` implements a generic conversion
    routine between different encodings. The procedure returns a `result`
    instance --- please refer to its documentation for details.

    Parameters:
    * VectorizedConvert - vectorized procedure that returns a structure
      having three fields: error_code (err), const Source* (input),
      Destination* (output)
    * ScalarConvert - scalar procedure that carries on the conversion of the
      tail
    * Source - type of input char (like char16_t, char)
    * Destination - type of output char
*/
template <typename VectorizedConvert, typename ScalarConvert, typename Source,
          typename Destination>
simdutf::result convert_with_errors_impl(VectorizedConvert vectorized_convert,
                                         ScalarConvert scalar_convert,
                                         const Source *buf, size_t len,
                                         Destination *output) {

  const auto vr = vectorized_convert(buf, len, output);
  const size_t consumed = vr.input - buf;
  const size_t written = vr.output - output;
  if (vr.err != simdutf::error_code::SUCCESS) {
    if (vr.err == simdutf::error_code::OTHER) {
      // The vectorized procedure detected an error, but does not know the
      // exact position. The scalar procedure rescans the portion of the
      // input and figures out where the error is located.
      auto sr = scalar_convert(vr.input, len - consumed, vr.output);
      sr.count += consumed;
      return sr;
    }
    return simdutf::result(vr.err, consumed);
  }

  if (consumed == len) {
    return simdutf::result(simdutf::error_code::SUCCESS, written);
  }

  simdutf::result sr = scalar_convert(vr.input, len - consumed, vr.output);
  if (sr.is_ok()) {
    sr.count += written;
  } else {
    sr.count += consumed;
  }

  return sr;
}
/* end file src/ppc64/templates.cpp */
|
|
|
|
#ifdef SIMDUTF_INTERNAL_TESTS
|
|
#if SIMDUTF_FEATURE_BASE64
|
|
#include "ppc64_base64_internal_tests.cpp"
|
|
#endif // SIMDUTF_FEATURE_BASE64
|
|
#endif // SIMDUTF_INTERNAL_TESTS
|
|
//
|
|
// Implementation-specific overrides
|
|
//
|
|
namespace simdutf {
|
|
namespace ppc64 {
|
|
|
|
#if SIMDUTF_FEATURE_DETECT_ENCODING
|
|
simdutf_warn_unused int
|
|
implementation::detect_encodings(const char *input,
|
|
size_t length) const noexcept {
|
|
// If there is a BOM, then we trust it.
|
|
auto bom_encoding = simdutf::BOM::check_bom(input, length);
|
|
if (bom_encoding != encoding_type::unspecified) {
|
|
return bom_encoding;
|
|
}
|
|
int out = 0;
|
|
// todo: reimplement as a one-pass algorithm.
|
|
if (validate_utf8(input, length)) {
|
|
out |= encoding_type::UTF8;
|
|
}
|
|
if ((length % 2) == 0) {
|
|
if (validate_utf16le(reinterpret_cast<const char16_t *>(input),
|
|
length / 2)) {
|
|
out |= encoding_type::UTF16_LE;
|
|
}
|
|
}
|
|
if ((length % 4) == 0) {
|
|
if (validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4)) {
|
|
out |= encoding_type::UTF32_LE;
|
|
}
|
|
}
|
|
return out;
|
|
}
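
// The return value is a bitmask of encoding_type flags. For example, the
// 4-byte ASCII input "aaaa" validates as UTF-8 and as UTF-16LE (two units
// 0x6161), but not as UTF-32LE, since 0x61616161 exceeds U+10FFFF.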
#endif // SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
simdutf_warn_unused bool
implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  return ppc64::utf8_validation::generic_validate_utf8(buf, len);
}
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF8
simdutf_warn_unused result implementation::validate_utf8_with_errors(
    const char *buf, size_t len) const noexcept {
  return ppc64::utf8_validation::generic_validate_utf8_with_errors(buf, len);
}
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_ASCII
simdutf_warn_unused bool
implementation::validate_ascii(const char *buf, size_t len) const noexcept {
  return ppc64::ascii_validation::generic_validate_ascii(buf, len);
}

simdutf_warn_unused result implementation::validate_ascii_with_errors(
    const char *buf, size_t len) const noexcept {
  return ppc64::ascii_validation::generic_validate_ascii_with_errors(buf, len);
}
#endif // SIMDUTF_FEATURE_ASCII

#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
simdutf_warn_unused bool
implementation::validate_utf16le(const char16_t *buf,
                                 size_t len) const noexcept {
  const auto res =
      ppc64::utf16::validate_utf16_with_errors<endianness::LITTLE>(buf, len);
  if (res.is_err()) {
    return false;
  }

  if (res.count != len) {
    return scalar::utf16::validate<endianness::LITTLE>(buf + res.count,
                                                       len - res.count);
  }

  return true;
}
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF16
simdutf_warn_unused bool
implementation::validate_utf16be(const char16_t *buf,
                                 size_t len) const noexcept {
  return validate_utf16be_with_errors(buf, len).is_ok();
}

void implementation::to_well_formed_utf16le(const char16_t *input, size_t len,
                                            char16_t *output) const noexcept {
  return scalar::utf16::to_well_formed_utf16<endianness::LITTLE>(input, len,
                                                                 output);
}

void implementation::to_well_formed_utf16be(const char16_t *input, size_t len,
                                            char16_t *output) const noexcept {
  return scalar::utf16::to_well_formed_utf16<endianness::BIG>(input, len,
                                                              output);
}

simdutf_warn_unused result implementation::validate_utf16le_with_errors(
    const char16_t *buf, size_t len) const noexcept {
  const auto res =
      ppc64::utf16::validate_utf16_with_errors<endianness::LITTLE>(buf, len);
  if (res.count != len) {
    auto scalar = scalar::utf16::validate_with_errors<endianness::LITTLE>(
        buf + res.count, len - res.count);
    scalar.count += res.count;
    return scalar;
  }

  return res;
}

simdutf_warn_unused result implementation::validate_utf16be_with_errors(
    const char16_t *buf, size_t len) const noexcept {
  const auto res =
      ppc64::utf16::validate_utf16_with_errors<endianness::BIG>(buf, len);
  if (res.count != len) {
    auto scalar = scalar::utf16::validate_with_errors<endianness::BIG>(
        buf + res.count, len - res.count);
    scalar.count += res.count;
    return scalar;
  }

  return res;
}
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
simdutf_warn_unused bool
implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
  return utf32::validate(buf, len);
}
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF32
simdutf_warn_unused result implementation::validate_utf32_with_errors(
    const char32_t *buf, size_t len) const noexcept {
  return utf32::validate_with_errors(buf, len);
}
#endif // SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
    const char *buf, size_t len, char *utf8_output) const noexcept {
  const auto ret = ppc64_convert_latin1_to_utf8(buf, len, utf8_output);
  size_t converted_chars = ret.second - utf8_output;

  if (ret.first != buf + len) {
    const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert(
        ret.first, len - (ret.first - buf), ret.second);
    converted_chars += scalar_converted_chars;
  }

  return converted_chars;
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
  size_t n =
      ppc64_convert_latin1_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
  if (n < len) {
    n += scalar::latin1_to_utf16::convert<endianness::LITTLE>(buf + n, len - n,
                                                              utf16_output + n);
  }

  return n;
}

simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
  size_t n =
      ppc64_convert_latin1_to_utf16<endianness::BIG>(buf, len, utf16_output);
  if (n < len) {
    n += scalar::latin1_to_utf16::convert<endianness::BIG>(buf + n, len - n,
                                                           utf16_output + n);
  }

  return n;
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  const auto ret = ppc64_convert_latin1_to_utf32(buf, len, utf32_output);
  if (ret.first != buf + len) {
    const size_t processed = ret.first - buf;
    scalar::latin1_to_utf32::convert(ret.first, len - processed, ret.second);
  }
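
  // Latin-1 to UTF-32 cannot fail and yields exactly one code point per
  // input byte, hence the unconditional return of len below.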
  return len;
}
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  utf8_to_latin1::validating_transcoder converter;
  return converter.convert(buf, len, latin1_output);
}

simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  utf8_to_latin1::validating_transcoder converter;
  return converter.convert_with_errors(buf, len, latin1_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  return ppc64::utf8_to_latin1::convert_valid(buf, len, latin1_output);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
  utf8_to_utf16::validating_transcoder converter;
  return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
}

simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
  utf8_to_utf16::validating_transcoder converter;
  return converter.convert<endianness::BIG>(buf, len, utf16_output);
}

simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
  utf8_to_utf16::validating_transcoder converter;
  return converter.convert_with_errors<endianness::LITTLE>(buf, len,
                                                           utf16_output);
}

simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
  utf8_to_utf16::validating_transcoder converter;
  return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
  return utf8_to_utf16::convert_valid<endianness::LITTLE>(buf, len,
                                                          utf16_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
  return utf8_to_utf16::convert_valid<endianness::BIG>(buf, len, utf16_output);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  utf8_to_utf32::validating_transcoder converter;
  return converter.convert(buf, len, utf32_output);
}

simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  utf8_to_utf32::validating_transcoder converter;
  return converter.convert_with_errors(buf, len, utf32_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
    const char *input, size_t size, char32_t *utf32_output) const noexcept {
  return utf8_to_utf32::convert_valid(input, size, utf32_output);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {

  return convert_impl(ppc64_convert_utf16_to_latin1<endianness::LITTLE>,
                      scalar::utf16_to_latin1::convert<endianness::LITTLE>, buf,
                      len, latin1_output);
}

simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {

  return convert_impl(ppc64_convert_utf16_to_latin1<endianness::BIG>,
                      scalar::utf16_to_latin1::convert<endianness::BIG>, buf,
                      len, latin1_output);
}

simdutf_warn_unused result
implementation::convert_utf16le_to_latin1_with_errors(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {

  return convert_with_errors_impl(
      ppc64_convert_utf16_to_latin1<endianness::LITTLE>,
      scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>, buf,
      len, latin1_output);
}

simdutf_warn_unused result
implementation::convert_utf16be_to_latin1_with_errors(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {

  return convert_with_errors_impl(
      ppc64_convert_utf16_to_latin1<endianness::BIG>,
      scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>, buf, len,
      latin1_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
  // optimization opportunity: we could provide an optimized function.
  return convert_utf16be_to_latin1(buf, len, latin1_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
  // optimization opportunity: we could provide an optimized function.
  return convert_utf16le_to_latin1(buf, len, latin1_output);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {

  return convert_impl(ppc64_convert_utf16_to_utf8<endianness::LITTLE>,
                      scalar::utf16_to_utf8::convert<endianness::LITTLE>, buf,
                      len, utf8_output);
}

simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {

  return convert_impl(ppc64_convert_utf16_to_utf8<endianness::BIG>,
                      scalar::utf16_to_utf8::convert<endianness::BIG>, buf, len,
                      utf8_output);
}

simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {

  return convert_with_errors_impl(
      ppc64_convert_utf16_to_utf8<endianness::LITTLE>,
      scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>, buf, len,
      utf8_output);
}

simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {

  return convert_with_errors_impl(
      ppc64_convert_utf16_to_utf8<endianness::BIG>,
      scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>, buf, len,
      utf8_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  return convert_utf16le_to_utf8(buf, len, utf8_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  return convert_utf16be_to_utf8(buf, len, utf8_output);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  return convert_impl(ppc64_convert_utf32_to_latin1<ErrorChecking::enabled>,
                      scalar::utf32_to_latin1::convert, buf, len,
                      latin1_output);
}

simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  return convert_with_errors_impl(
      ppc64_convert_utf32_to_latin1<ErrorChecking::enabled>,
      scalar::utf32_to_latin1::convert_with_errors, buf, len, latin1_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  return convert_impl(ppc64_convert_utf32_to_latin1<ErrorChecking::disabled>,
                      scalar::utf32_to_latin1::convert, buf, len,
                      latin1_output);
}
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  return convert_impl(ppc64_convert_utf32_to_utf8<ErrorReporting::at_the_end>,
                      scalar::utf32_to_utf8::convert, buf, len, utf8_output);
}

simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  return convert_with_errors_impl(
      ppc64_convert_utf32_to_utf8<ErrorReporting::precise>,
      scalar::utf32_to_utf8::convert_with_errors, buf, len, utf8_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  return convert_impl(ppc64_convert_utf32_to_utf8<ErrorReporting::none>,
                      scalar::utf32_to_utf8::convert, buf, len, utf8_output);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {

  return convert_impl(ppc64_convert_utf32_to_utf16<endianness::LITTLE,
                                                   ErrorReporting::at_the_end>,
                      scalar::utf32_to_utf16::convert<endianness::LITTLE>, buf,
                      len, utf16_output);
}

simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {

  return convert_impl(
      ppc64_convert_utf32_to_utf16<endianness::BIG, ErrorReporting::at_the_end>,
      scalar::utf32_to_utf16::convert<endianness::BIG>, buf, len, utf16_output);
}

simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {

  return convert_with_errors_impl(
      ppc64_convert_utf32_to_utf16<endianness::LITTLE, ErrorReporting::precise>,
      scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>, buf, len,
      utf16_output);
}

simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {

  return convert_with_errors_impl(
      ppc64_convert_utf32_to_utf16<endianness::BIG, ErrorReporting::precise>,
      scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>, buf, len,
      utf16_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {

  return convert_impl(
      ppc64_convert_utf32_to_utf16<endianness::LITTLE, ErrorReporting::none>,
      scalar::utf32_to_utf16::convert<endianness::LITTLE>, buf, len,
      utf16_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {

  return convert_impl(
      ppc64_convert_utf32_to_utf16<endianness::BIG, ErrorReporting::none>,
      scalar::utf32_to_utf16::convert<endianness::BIG>, buf, len, utf16_output);
}

simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  return convert_impl(ppc64_convert_utf16_to_utf32<endianness::LITTLE>,
                      scalar::utf16_to_utf32::convert<endianness::LITTLE>, buf,
                      len, utf32_output);
}

simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  return convert_impl(ppc64_convert_utf16_to_utf32<endianness::BIG>,
                      scalar::utf16_to_utf32::convert<endianness::BIG>, buf,
                      len, utf32_output);
}

simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  return convert_with_errors_impl(
      ppc64_convert_utf16_to_utf32<endianness::LITTLE>,
      scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>, buf, len,
      utf32_output);
}

simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  return convert_with_errors_impl(
      ppc64_convert_utf16_to_utf32<endianness::BIG>,
      scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>, buf, len,
      utf32_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  return convert_utf16le_to_utf32(buf, len, utf32_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  return convert_utf16be_to_utf32(buf, len, utf32_output);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16
void implementation::change_endianness_utf16(const char16_t *input,
                                             size_t length,
                                             char16_t *output) const noexcept {
  utf16::change_endianness_utf16(input, length, output);
}

simdutf_warn_unused size_t implementation::count_utf16le(
    const char16_t *input, size_t length) const noexcept {
  return utf16::count_code_points<endianness::LITTLE>(input, length);
}

simdutf_warn_unused size_t implementation::count_utf16be(
    const char16_t *input, size_t length) const noexcept {
  return utf16::count_code_points<endianness::BIG>(input, length);
}
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8
simdutf_warn_unused size_t
implementation::count_utf8(const char *input, size_t length) const noexcept {
  return utf8::count_code_points(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
    const char *buf, size_t len) const noexcept {
  return count_utf8(buf, len);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
    const char *input, size_t length) const noexcept {
  const auto ret = ppc64_utf8_length_from_latin1(input, length);
  const size_t consumed = ret.first - input;

  if (consumed == length) {
    return ret.second;
  }

  const auto scalar =
      scalar::latin1::utf8_length_from_latin1(ret.first, length - consumed);
  return scalar + ret.second;
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
    const char16_t *input, size_t length) const noexcept {
  return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
}

simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
    const char16_t *input, size_t length) const noexcept {
  return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
    const char16_t *input, size_t length) const noexcept {
  return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
}

simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
    const char16_t *input, size_t length) const noexcept {
  return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
    const char *input, size_t length) const noexcept {
  return utf8::utf16_length_from_utf8(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
    const char32_t *input, size_t length) const noexcept {
  return utf32::utf8_length_from_utf32(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
    const char32_t *input, size_t length) const noexcept {
  return scalar::utf32::utf16_length_from_utf32(input, length);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
    const char *input, size_t length) const noexcept {
  return utf8::count_code_points(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_BASE64
simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
    const char *input, size_t length) const noexcept {
  return scalar::base64::maximal_binary_length_from_base64(input, length);
}

simdutf_warn_unused result implementation::base64_to_binary(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return base64::compress_decode_base64<true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<true, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return base64::compress_decode_base64<false, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}
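
// The two booleans of compress_decode_base64<B, G> are compile-time
// switches: B selects the base64url alphabet and G accepts garbage
// (non-alphabet) bytes instead of reporting them, mirroring the option
// checks above.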

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return base64::compress_decode_base64<true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<true, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return base64::compress_decode_base64<false, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

simdutf_warn_unused result implementation::base64_to_binary(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return base64::compress_decode_base64<true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<true, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return base64::compress_decode_base64<false, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return base64::compress_decode_base64<true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<true, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return base64::compress_decode_base64<false, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

size_t implementation::binary_to_base64(const char *input, size_t length,
                                        char *output,
                                        base64_options options) const noexcept {
  if (options & base64_url) {
    return encode_base64<true>(output, input, length, options);
  } else {
    return encode_base64<false>(output, input, length, options);
  }
}
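
// Worked example (standard alphabet): the three input bytes "Man"
// (0x4D 0x61 0x6E) form 24 bits, split into the 6-bit groups 19, 22, 5, 46,
// which encode to "TWFu".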
#endif // SIMDUTF_FEATURE_BASE64

#ifdef SIMDUTF_INTERNAL_TESTS
std::vector<implementation::TestProcedure>
implementation::internal_tests() const {
  #define entry(proc)                                                         \
    TestProcedure { #proc, proc }
  return {entry(base64_encoding_translate_6bit_values),
          entry(base64_encoding_expand_6bit_fields),
          entry(base64_decoding_valid),
          entry(base64_decoding_invalid_ignore_errors),
          entry(base64url_decoding_invalid_ignore_errors),
          entry(base64_decoding_invalid_strict_errors),
          entry(base64url_decoding_invalid_strict_errors),
          entry(base64_decoding_pack),
          entry(base64_compress)};
  #undef entry
}
#endif

} // namespace ppc64
} // namespace simdutf

/* begin file src/simdutf/ppc64/end.h */
/* end file src/simdutf/ppc64/end.h */
/* end file src/ppc64/implementation.cpp */
#endif
#if SIMDUTF_IMPLEMENTATION_RVV
/* begin file src/rvv/implementation.cpp */
/* begin file src/simdutf/rvv/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "rvv"
// #define SIMDUTF_IMPLEMENTATION rvv

#if SIMDUTF_CAN_ALWAYS_RUN_RVV
// nothing needed.
#else
SIMDUTF_TARGET_RVV
#endif
/* end file src/simdutf/rvv/begin.h */
namespace simdutf {
namespace rvv {
namespace {
#ifndef SIMDUTF_RVV_H
  #error "rvv.h must be included"
#endif

} // unnamed namespace
} // namespace rvv
} // namespace simdutf

//
// Implementation-specific overrides
//
namespace simdutf {
namespace rvv {
/* begin file src/rvv/rvv_helpers.inl.cpp */
template <simdutf_ByteFlip bflip>
simdutf_really_inline static size_t
rvv_utf32_store_utf16_m4(uint16_t *dst, vuint32m4_t utf32, size_t vl,
                         vbool4_t m4even) {
  /* convert [000000000000aaaa|aaaaaabbbbbbbbbb]
   * to      [110111bbbbbbbbbb|110110aaaaaaaaaa] */
  vuint32m4_t sur = __riscv_vsub_vx_u32m4(utf32, 0x10000, vl);
  sur = __riscv_vor_vv_u32m4(__riscv_vsll_vx_u32m4(sur, 16, vl),
                             __riscv_vsrl_vx_u32m4(sur, 10, vl), vl);
  sur = __riscv_vand_vx_u32m4(sur, 0x3FF03FF, vl);
  sur = __riscv_vor_vx_u32m4(sur, 0xDC00D800, vl);
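  /* e.g. U+1F600: 0x1F600 - 0x10000 = 0xF600; the shift/or/mask steps above
   * leave 0x0200003D, and or-ing in 0xDC00D800 yields 0xDE00D83D, i.e. the
   * surrogate pair 0xD83D, 0xDE00 in the two 16-bit halves. */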
  /* merge 1 byte utf32 and 2 byte sur */
  vbool8_t m4 = __riscv_vmsgtu_vx_u32m4_b8(utf32, 0xFFFF, vl);
  vuint16m4_t utf32_16 = __riscv_vreinterpret_v_u32m4_u16m4(
      __riscv_vmerge_vvm_u32m4(utf32, sur, m4, vl));
  /* compress and store */
  vbool4_t mOut = __riscv_vmor_mm_b4(
      __riscv_vmsne_vx_u16m4_b4(utf32_16, 0, vl * 2), m4even, vl * 2);
  vuint16m4_t vout = __riscv_vcompress_vm_u16m4(utf32_16, mOut, vl * 2);
  vl = __riscv_vcpop_m_b4(mOut, vl * 2);
  __riscv_vse16_v_u16m4(dst, simdutf_byteflip<bflip>(vout, vl), vl);
  return vl;
}
/* end file src/rvv/rvv_helpers.inl.cpp */

/* begin file src/rvv/rvv_length_from.inl.cpp */
#if SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t
implementation::count_utf16le(const char16_t *src, size_t len) const noexcept {
  return utf32_length_from_utf16le(src, len);
}

simdutf_warn_unused size_t
implementation::count_utf16be(const char16_t *src, size_t len) const noexcept {
  return utf32_length_from_utf16be(src, len);
}
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8
simdutf_warn_unused size_t
implementation::count_utf8(const char *src, size_t len) const noexcept {
  return utf32_length_from_utf8(src, len);
}
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
    const char *src, size_t len) const noexcept {
  return utf32_length_from_utf8(src, len);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
    const char *src, size_t len) const noexcept {
  size_t count = 0;
  for (size_t vl; len > 0; len -= vl, src += vl) {
    vl = __riscv_vsetvl_e8m8(len);
    vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl);
    vbool1_t mask = __riscv_vmsgt_vx_i8m8_b1(v, -65, vl);
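    // -65 is 0b10111111 as a signed byte: lanes greater than that are
    // everything except UTF-8 continuation bytes, i.e. one lane per code
    // point.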
    count += __riscv_vcpop_m_b1(mask, vl);
  }
  return count;
}
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32
template <simdutf_ByteFlip bflip>
simdutf_really_inline static size_t
rvv_utf32_length_from_utf16(const char16_t *src, size_t len) {
  size_t count = 0;
  for (size_t vl; len > 0; len -= vl, src += vl) {
    vl = __riscv_vsetvl_e16m8(len);
    vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl);
    v = simdutf_byteflip<bflip>(v, vl);
    vbool2_t notHigh =
        __riscv_vmor_mm_b2(__riscv_vmsgtu_vx_u16m8_b2(v, 0xDFFF, vl),
                           __riscv_vmsltu_vx_u16m8_b2(v, 0xDC00, vl), vl);
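    // Count every unit outside [0xDC00, 0xDFFF]: low (trailing) surrogates
    // are skipped, so each surrogate pair contributes one code point.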
    count += __riscv_vcpop_m_b2(notHigh, vl);
  }
  return count;
}

simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
    const char16_t *src, size_t len) const noexcept {
  return rvv_utf32_length_from_utf16<simdutf_ByteFlip::NONE>(src, len);
}

simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
    const char16_t *src, size_t len) const noexcept {
  if (supports_zvbb())
    return rvv_utf32_length_from_utf16<simdutf_ByteFlip::ZVBB>(src, len);
  else
    return rvv_utf32_length_from_utf16<simdutf_ByteFlip::V>(src, len);
}
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
    const char *src, size_t len) const noexcept {
  size_t count = len;
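  // Start from one UTF-8 byte per Latin-1 byte; every byte at or above 0x80
  // (negative as a signed lane) needs a second byte.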
  for (size_t vl; len > 0; len -= vl, src += vl) {
    vl = __riscv_vsetvl_e8m8(len);
    vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl);
    count += __riscv_vcpop_m_b1(__riscv_vmslt_vx_i8m8_b1(v, 0, vl), vl);
  }
  return count;
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
template <simdutf_ByteFlip bflip>
simdutf_really_inline static size_t
rvv_utf8_length_from_utf16(const char16_t *src, size_t len) {
  size_t count = 0;
  for (size_t vl; len > 0; len -= vl, src += vl) {
    vl = __riscv_vsetvl_e16m8(len);
    vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl);
    v = simdutf_byteflip<bflip>(v, vl);
    vbool2_t m234 = __riscv_vmsgtu_vx_u16m8_b2(v, 0x7F, vl);
    vbool2_t m34 = __riscv_vmsgtu_vx_u16m8_b2(v, 0x7FF, vl);
    vbool2_t notSur =
        __riscv_vmor_mm_b2(__riscv_vmsltu_vx_u16m8_b2(v, 0xD800, vl),
                           __riscv_vmsgtu_vx_u16m8_b2(v, 0xDFFF, vl), vl);
    vbool2_t m3 = __riscv_vmand_mm_b2(m34, notSur, vl);
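    // One byte per unit, plus one if above 0x7F, plus one more if above
    // 0x7FF and not a surrogate; the two units of a surrogate pair thus
    // total the four bytes their code point needs.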
    count += vl + __riscv_vcpop_m_b2(m234, vl) + __riscv_vcpop_m_b2(m3, vl);
  }
  return count;
}

simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
    const char16_t *src, size_t len) const noexcept {
  return rvv_utf8_length_from_utf16<simdutf_ByteFlip::NONE>(src, len);
}

simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
    const char16_t *src, size_t len) const noexcept {
  if (supports_zvbb())
    return rvv_utf8_length_from_utf16<simdutf_ByteFlip::ZVBB>(src, len);
  else
    return rvv_utf8_length_from_utf16<simdutf_ByteFlip::V>(src, len);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
    const char32_t *src, size_t len) const noexcept {
  size_t count = 0;
  for (size_t vl; len > 0; len -= vl, src += vl) {
    vl = __riscv_vsetvl_e32m8(len);
    vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl);
    vbool4_t m234 = __riscv_vmsgtu_vx_u32m8_b4(v, 0x7F, vl);
    vbool4_t m34 = __riscv_vmsgtu_vx_u32m8_b4(v, 0x7FF, vl);
    vbool4_t m4 = __riscv_vmsgtu_vx_u32m8_b4(v, 0xFFFF, vl);
    count += vl + __riscv_vcpop_m_b4(m234, vl) + __riscv_vcpop_m_b4(m34, vl) +
             __riscv_vcpop_m_b4(m4, vl);
  }
  return count;
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
    const char *src, size_t len) const noexcept {
  size_t count = 0;
  for (size_t vl; len > 0; len -= vl, src += vl) {
    vl = __riscv_vsetvl_e8m8(len);
    vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl);
    vbool1_t m1234 = __riscv_vmsgt_vx_i8m8_b1(v, -65, vl);
    vbool1_t m4 = __riscv_vmsgtu_vx_u8m8_b1(__riscv_vreinterpret_u8m8(v),
                                            (uint8_t)0b11101111, vl);
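    // m1234 counts lead bytes (one UTF-16 unit per code point); lead bytes
    // above 0xEF open 4-byte sequences, which need a second unit for the
    // surrogate pair.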
    count += __riscv_vcpop_m_b1(m1234, vl) + __riscv_vcpop_m_b1(m4, vl);
  }
  return count;
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
    const char32_t *src, size_t len) const noexcept {
  size_t count = 0;
  for (size_t vl; len > 0; len -= vl, src += vl) {
    vl = __riscv_vsetvl_e32m8(len);
    vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl);
    vbool4_t m4 = __riscv_vmsgtu_vx_u32m8_b4(v, 0xFFFF, vl);
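    // One unit per code point, plus one more for each code point above
    // 0xFFFF, which UTF-16 encodes as a surrogate pair.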
    count += vl + __riscv_vcpop_m_b4(m4, vl);
  }
  return count;
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
/* end file src/rvv/rvv_length_from.inl.cpp */
/* begin file src/rvv/rvv_validate.inl.cpp */
#if SIMDUTF_FEATURE_ASCII
simdutf_warn_unused bool
implementation::validate_ascii(const char *src, size_t len) const noexcept {
  size_t vlmax = __riscv_vsetvlmax_e8m8();
  vint8m8_t mask = __riscv_vmv_v_x_i8m8(0, vlmax);
  for (size_t vl; len > 0; len -= vl, src += vl) {
    vl = __riscv_vsetvl_e8m8(len);
    vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl);
    mask = __riscv_vor_vv_i8m8_tu(mask, mask, v, vl);
  }
  return __riscv_vfirst_m_b1(__riscv_vmslt_vx_i8m8_b1(mask, 0, vlmax), vlmax) <
         0;
}

simdutf_warn_unused result implementation::validate_ascii_with_errors(
    const char *src, size_t len) const noexcept {
  const char *beg = src;
  for (size_t vl; len > 0; len -= vl, src += vl) {
    vl = __riscv_vsetvl_e8m8(len);
    vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl);
    long idx = __riscv_vfirst_m_b1(__riscv_vmslt_vx_i8m8_b1(v, 0, vl), vl);
    if (idx >= 0)
      return result(error_code::TOO_LARGE, src - beg + idx);
  }
  return result(error_code::SUCCESS, src - beg);
}
#endif // SIMDUTF_FEATURE_ASCII

#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
/* Returns a close estimate of the number of valid UTF-8 bytes up to the
 * first invalid one, never overestimating. */
simdutf_really_inline static size_t rvv_count_valid_utf8(const char *src,
                                                         size_t len) {
  const char *beg = src;
  if (len < 32)
    return 0;

  /* validate first three bytes */
  {
    size_t idx = 3;
    while (idx < len && (uint8_t(src[idx]) >> 6) == 0b10)
      ++idx;
    if (idx > 3 + 3 || !scalar::utf8::validate(src, idx))
      return 0;
  }

  static const uint64_t err1m[] = {0x0202020202020202, 0x4915012180808080};
  static const uint64_t err2m[] = {0xCBCBCB8B8383A3E7, 0xCBCBDBCBCBCBCBCB};
  static const uint64_t err3m[] = {0x0101010101010101, 0x01010101BABAAEE6};

  const vuint8m1_t err1tbl =
      __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err1m, 2));
  const vuint8m1_t err2tbl =
      __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err2m, 2));
  const vuint8m1_t err3tbl =
      __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err3m, 2));
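
  /* The three 16-entry tables classify, per nibble: the previous byte's high
   * nibble (err1), the previous byte's low nibble (err2), and the current
   * byte's high nibble (err3); and-ing the three lookups flags invalid
   * two-byte windows, following the paper cited in the loop below. */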

  size_t tail = 3;
  size_t n = len - tail;

  for (size_t vl; n > 0; n -= vl, src += vl) {
    vl = __riscv_vsetvl_e8m4(n);
    vuint8m4_t v0 = __riscv_vle8_v_u8m4((uint8_t const *)src, vl);

    uint8_t next0 = src[vl + 0];
    uint8_t next1 = src[vl + 1];
    uint8_t next2 = src[vl + 2];

    /* fast path: ASCII */
    if (__riscv_vfirst_m_b2(__riscv_vmsgtu_vx_u8m4_b2(v0, 0b01111111, vl), vl) <
            0 &&
        (next0 | next1 | next2) < 0b10000000)
      continue;

    /* see "Validating UTF-8 In Less Than One Instruction Per Byte"
     * https://arxiv.org/abs/2010.03090 */
    vuint8m4_t v1 = __riscv_vslide1down_vx_u8m4(v0, next0, vl);
    vuint8m4_t v2 = __riscv_vslide1down_vx_u8m4(v1, next1, vl);
    vuint8m4_t v3 = __riscv_vslide1down_vx_u8m4(v2, next2, vl);

    vuint8m4_t idx2 = __riscv_vand_vx_u8m4(v2, 0xF, vl);
    vuint8m4_t idx1 = __riscv_vsrl_vx_u8m4(v2, 4, vl);
    vuint8m4_t idx3 = __riscv_vsrl_vx_u8m4(v3, 4, vl);

    vuint8m4_t err1 = simdutf_vrgather_u8m1x4(err1tbl, idx1);
    vuint8m4_t err2 = simdutf_vrgather_u8m1x4(err2tbl, idx2);
    vuint8m4_t err3 = simdutf_vrgather_u8m1x4(err3tbl, idx3);
    vint8m4_t errs = __riscv_vreinterpret_v_u8m4_i8m4(
        __riscv_vand_vv_u8m4(__riscv_vand_vv_u8m4(err1, err2, vl), err3, vl));

    vbool2_t is_3 = __riscv_vmsgtu_vx_u8m4_b2(v1, 0b11100000 - 1, vl);
    vbool2_t is_4 = __riscv_vmsgtu_vx_u8m4_b2(v0, 0b11110000 - 1, vl);
    vbool2_t is_34 = __riscv_vmor_mm_b2(is_3, is_4, vl);
    vbool2_t err34 =
        __riscv_vmxor_mm_b2(is_34, __riscv_vmslt_vx_i8m4_b2(errs, 0, vl), vl);
    vbool2_t errm =
        __riscv_vmor_mm_b2(__riscv_vmsgt_vx_i8m4_b2(errs, 0, vl), err34, vl);
    if (__riscv_vfirst_m_b2(errm, vl) >= 0)
      break;
  }

  /* back up over trailing continuation bytes so the caller revalidates the
   * final, possibly incomplete, character */
  while (tail < len && (uint8_t(src[0]) >> 6) == 0b10)
    --src, ++tail;
  return src - beg;
}

simdutf_warn_unused bool
implementation::validate_utf8(const char *src, size_t len) const noexcept {
  size_t count = rvv_count_valid_utf8(src, len);
  return scalar::utf8::validate(src + count, len - count);
}
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF8
simdutf_warn_unused result implementation::validate_utf8_with_errors(
    const char *src, size_t len) const noexcept {
  size_t count = rvv_count_valid_utf8(src, len);
  result res = scalar::utf8::validate_with_errors(src + count, len - count);
  return result(res.error, count + res.count);
}
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
template <simdutf_ByteFlip bflip>
simdutf_really_inline static result
rvv_validate_utf16_with_errors(const char16_t *src, size_t len) {
  const char16_t *beg = src;
  uint16_t last = 0;
  for (size_t vl; len > 0;
       len -= vl, src += vl, last = simdutf_byteflip<bflip>(src[-1])) {
    vl = __riscv_vsetvl_e16m8(len);
    vuint16m8_t v1 = __riscv_vle16_v_u16m8((const uint16_t *)src, vl);
    v1 = simdutf_byteflip<bflip>(v1, vl);
    vuint16m8_t v0 = __riscv_vslide1up_vx_u16m8(v1, last, vl);

    vbool2_t surhi = __riscv_vmseq_vx_u16m8_b2(
        __riscv_vand_vx_u16m8(v0, 0xFC00, vl), 0xD800, vl);
    vbool2_t surlo = __riscv_vmseq_vx_u16m8_b2(
        __riscv_vand_vx_u16m8(v1, 0xFC00, vl), 0xDC00, vl);
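
    // In valid UTF-16 a high surrogate must be followed by exactly one low
    // surrogate and vice versa, so the two masks must agree lane for lane;
    // any mismatch (vmxor) pinpoints an error.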

    long idx = __riscv_vfirst_m_b2(__riscv_vmxor_mm_b2(surhi, surlo, vl), vl);
    if (idx >= 0) {
      last = idx > 0 ? simdutf_byteflip<bflip>(src[idx - 1]) : last;
      return result(error_code::SURROGATE,
                    src - beg + idx - (last - 0xD800u < 0x400u));
    }
  }
  if (last - 0xD800u < 0x400u) {
    return result(error_code::SURROGATE,
                  src - beg - 1); /* end on high surrogate */
  } else {
    return result(error_code::SUCCESS, src - beg);
  }
}
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
simdutf_warn_unused bool
implementation::validate_utf16le(const char16_t *src,
                                 size_t len) const noexcept {
  return rvv_validate_utf16_with_errors<simdutf_ByteFlip::NONE>(src, len)
             .error == error_code::SUCCESS;
}
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF16
simdutf_warn_unused bool
implementation::validate_utf16be(const char16_t *src,
                                 size_t len) const noexcept {
  return validate_utf16be_with_errors(src, len).error == error_code::SUCCESS;
}
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF16
simdutf_warn_unused result implementation::validate_utf16le_with_errors(
    const char16_t *src, size_t len) const noexcept {
  return rvv_validate_utf16_with_errors<simdutf_ByteFlip::NONE>(src, len);
}

simdutf_warn_unused result implementation::validate_utf16be_with_errors(
    const char16_t *src, size_t len) const noexcept {
  if (supports_zvbb())
    return rvv_validate_utf16_with_errors<simdutf_ByteFlip::ZVBB>(src, len);
  else
    return rvv_validate_utf16_with_errors<simdutf_ByteFlip::V>(src, len);
}
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
simdutf_warn_unused bool
implementation::validate_utf32(const char32_t *src, size_t len) const noexcept {
  size_t vlmax = __riscv_vsetvlmax_e32m8();
  vuint32m8_t max = __riscv_vmv_v_x_u32m8(0x10FFFF, vlmax);
  vuint32m8_t maxOff = __riscv_vmv_v_x_u32m8(0xFFFFF7FF, vlmax);
  for (size_t vl; len > 0; len -= vl, src += vl) {
    vl = __riscv_vsetvl_e32m8(len);
    vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl);
    vuint32m8_t off = __riscv_vadd_vx_u32m8(v, 0xFFFF2000, vl);
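    // Adding 0xFFFF2000 wraps the surrogate range [0xD800, 0xDFFF] onto
    // [0xFFFFF800, 0xFFFFFFFF]; every other code point lands at or below
    // 0xFFFFF7FF, so maxOff exceeding that threshold flags a surrogate.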
    max = __riscv_vmaxu_vv_u32m8_tu(max, max, v, vl);
    maxOff = __riscv_vmaxu_vv_u32m8_tu(maxOff, maxOff, off, vl);
  }
  return __riscv_vfirst_m_b4(
             __riscv_vmor_mm_b4(
                 __riscv_vmsne_vx_u32m8_b4(max, 0x10FFFF, vlmax),
                 __riscv_vmsne_vx_u32m8_b4(maxOff, 0xFFFFF7FF, vlmax), vlmax),
             vlmax) < 0;
}
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF32
simdutf_warn_unused result implementation::validate_utf32_with_errors(
    const char32_t *src, size_t len) const noexcept {
  const char32_t *beg = src;
  for (size_t vl; len > 0; len -= vl, src += vl) {
    vl = __riscv_vsetvl_e32m8(len);
    vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl);
    vuint32m8_t off = __riscv_vadd_vx_u32m8(v, 0xFFFF2000, vl);
    long idx1 =
        __riscv_vfirst_m_b4(__riscv_vmsgtu_vx_u32m8_b4(v, 0x10FFFF, vl), vl);
    long idx2 = __riscv_vfirst_m_b4(
        __riscv_vmsgtu_vx_u32m8_b4(off, 0xFFFFF7FF, vl), vl);
    if (idx1 >= 0 && idx2 >= 0) {
      if (idx1 <= idx2) {
        return result(error_code::TOO_LARGE, src - beg + idx1);
      } else {
        return result(error_code::SURROGATE, src - beg + idx2);
      }
    }
    if (idx1 >= 0) {
      return result(error_code::TOO_LARGE, src - beg + idx1);
    }
    if (idx2 >= 0) {
      return result(error_code::SURROGATE, src - beg + idx2);
    }
  }
  return result(error_code::SUCCESS, src - beg);
}
#endif // SIMDUTF_FEATURE_UTF32
/* end file src/rvv/rvv_validate.inl.cpp */

/* begin file src/rvv/rvv_latin1_to.inl.cpp */
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
    const char *src, size_t len, char *dst) const noexcept {
  char *beg = dst;
  for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut) {
    vl = __riscv_vsetvl_e8m2(len);
    vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t *)src, vl);
    vbool4_t nascii =
        __riscv_vmslt_vx_i8m2_b4(__riscv_vreinterpret_v_u8m2_i8m2(v1), 0, vl);
    size_t cnt = __riscv_vcpop_m_b4(nascii, vl);
    vlOut = vl + cnt;
    if (cnt == 0) {
      __riscv_vse8_v_u8m2((uint8_t *)dst, v1, vlOut);
      continue;
    }

    vuint8m2_t v0 =
        __riscv_vor_vx_u8m2(__riscv_vsrl_vx_u8m2(v1, 6, vl), 0b11000000, vl);
    v1 = __riscv_vand_vx_u8m2_mu(nascii, v1, v1, 0b10111111, vl);

    vuint8m4_t wide =
        __riscv_vreinterpret_v_u16m4_u8m4(__riscv_vwmaccu_vx_u16m4(
            __riscv_vwaddu_vv_u16m4(v0, v1, vl), 0xFF, v1, vl));
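    // The widening add/multiply-accumulate interleaves lead (v0) and payload
    // (v1) bytes into pairs; ASCII lanes end up with a 0xC0/0xC1 lead byte,
    // which the (wide - 0xC0) > 1 mask below compresses away.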
    vbool2_t mask = __riscv_vmsgtu_vx_u8m4_b2(
        __riscv_vsub_vx_u8m4(wide, 0b11000000, vl * 2), 1, vl * 2);
    vuint8m4_t comp = __riscv_vcompress_vm_u8m4(wide, mask, vl * 2);

    __riscv_vse8_v_u8m4((uint8_t *)dst, comp, vlOut);
  }
  return dst - beg;
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
    const char *src, size_t len, char16_t *dst) const noexcept {
  char16_t *beg = dst;
  for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) {
    vl = __riscv_vsetvl_e8m4(len);
    vuint8m4_t v = __riscv_vle8_v_u8m4((uint8_t *)src, vl);
    __riscv_vse16_v_u16m8((uint16_t *)dst, __riscv_vzext_vf2_u16m8(v, vl), vl);
  }
  return dst - beg;
}

simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
    const char *src, size_t len, char16_t *dst) const noexcept {
  char16_t *beg = dst;
  for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) {
    vl = __riscv_vsetvl_e8m4(len);
    vuint8m4_t v = __riscv_vle8_v_u8m4((uint8_t *)src, vl);
    __riscv_vse16_v_u16m8(
        (uint16_t *)dst,
        __riscv_vsll_vx_u16m8(__riscv_vzext_vf2_u16m8(v, vl), 8, vl), vl);
  }
  return dst - beg;
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
    const char *src, size_t len, char32_t *dst) const noexcept {
  char32_t *beg = dst;
  for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) {
    vl = __riscv_vsetvl_e8m2(len);
    vuint8m2_t v = __riscv_vle8_v_u8m2((uint8_t *)src, vl);
    __riscv_vse32_v_u32m8((uint32_t *)dst, __riscv_vzext_vf4_u32m8(v, vl), vl);
  }
  return dst - beg;
}
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
/* end file src/rvv/rvv_latin1_to.inl.cpp */
/* begin file src/rvv/rvv_utf16_to.inl.cpp */
#if SIMDUTF_FEATURE_UTF16
template <simdutf_ByteFlip bflip>
simdutf_really_inline static result
rvv_utf16_to_latin1_with_errors(const char16_t *src, size_t len, char *dst) {
  const char16_t *const beg = src;
  for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) {
    vl = __riscv_vsetvl_e16m8(len);
    vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl);
    v = simdutf_byteflip<bflip>(v, vl);
    long idx = __riscv_vfirst_m_b2(__riscv_vmsgtu_vx_u16m8_b2(v, 255, vl), vl);
    if (idx >= 0)
      return result(error_code::TOO_LARGE, src - beg + idx);
    __riscv_vse8_v_u8m4((uint8_t *)dst, __riscv_vncvt_x_x_w_u8m4(v, vl), vl);
  }
  return result(error_code::SUCCESS, src - beg);
}
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
    const char16_t *src, size_t len, char *dst) const noexcept {
  result res = convert_utf16le_to_latin1_with_errors(src, len, dst);
  return res.error == error_code::SUCCESS ? res.count : 0;
}

simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(
    const char16_t *src, size_t len, char *dst) const noexcept {
  result res = convert_utf16be_to_latin1_with_errors(src, len, dst);
  return res.error == error_code::SUCCESS ? res.count : 0;
}

simdutf_warn_unused result
implementation::convert_utf16le_to_latin1_with_errors(
    const char16_t *src, size_t len, char *dst) const noexcept {
  return rvv_utf16_to_latin1_with_errors<simdutf_ByteFlip::NONE>(src, len, dst);
}

simdutf_warn_unused result
implementation::convert_utf16be_to_latin1_with_errors(
    const char16_t *src, size_t len, char *dst) const noexcept {
  if (supports_zvbb())
    return rvv_utf16_to_latin1_with_errors<simdutf_ByteFlip::ZVBB>(src, len,
                                                                   dst);
  else
    return rvv_utf16_to_latin1_with_errors<simdutf_ByteFlip::V>(src, len, dst);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(
    const char16_t *src, size_t len, char *dst) const noexcept {
  const char16_t *const beg = src;
  for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) {
    vl = __riscv_vsetvl_e16m8(len);
    vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl);
    __riscv_vse8_v_u8m4((uint8_t *)dst, __riscv_vncvt_x_x_w_u8m4(v, vl), vl);
  }
  return src - beg;
}

simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(
    const char16_t *src, size_t len, char *dst) const noexcept {
  const char16_t *const beg = src;
  for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) {
    vl = __riscv_vsetvl_e16m8(len);
    vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl);
    __riscv_vse8_v_u8m4((uint8_t *)dst, __riscv_vnsrl_wx_u8m4(v, 8, vl), vl);
  }
  return src - beg;
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
template <simdutf_ByteFlip bflip>
simdutf_really_inline static result
rvv_utf16_to_utf8_with_errors(const char16_t *src, size_t len, char *dst) {
  size_t n = len;
  const char16_t *srcBeg = src;
  const char *dstBeg = dst;
  size_t vl8m4 = __riscv_vsetvlmax_e8m4();
  vbool2_t m4mulp2 = __riscv_vmseq_vx_u8m4_b2(
      __riscv_vand_vx_u8m4(__riscv_vid_v_u8m4(vl8m4), 3, vl8m4), 2, vl8m4);

  for (size_t vl, vlOut; n > 0;) {
    vl = __riscv_vsetvl_e16m2(n);

    vuint16m2_t v = __riscv_vle16_v_u16m2((uint16_t const *)src, vl);
    v = simdutf_byteflip<bflip>(v, vl);
    vbool8_t m234 = __riscv_vmsgtu_vx_u16m2_b8(v, 0x80 - 1, vl);

    if (__riscv_vfirst_m_b8(m234, vl) < 0) { /* 1 byte utf8 */
      vlOut = vl;
      __riscv_vse8_v_u8m1((uint8_t *)dst, __riscv_vncvt_x_x_w_u8m1(v, vlOut),
                          vlOut);
      n -= vl, src += vl, dst += vlOut;
      continue;
    }

    vbool8_t m34 = __riscv_vmsgtu_vx_u16m2_b8(v, 0x800 - 1, vl);

    if (__riscv_vfirst_m_b8(m34, vl) < 0) { /* 1/2 byte utf8 */
      /* 0: [     aaa|aabbbbbb]
       * 1: [aabbbbbb|        ] vsll 8
       * 2: [        |   aaaaa] vsrl 6
       * 3: [00111111|00011111]
       * 4: [  bbbbbb|000aaaaa] (1|2)&3
       * 5: [11000000|11000000]
       * 6: [10bbbbbb|110aaaaa] 4|5 */
      vuint16m2_t twoByte = __riscv_vand_vx_u16m2(
          __riscv_vor_vv_u16m2(__riscv_vsll_vx_u16m2(v, 8, vl),
                               __riscv_vsrl_vx_u16m2(v, 6, vl), vl),
          0b0011111100011111, vl);
      vuint16m2_t vout16 =
          __riscv_vor_vx_u16m2_mu(m234, v, twoByte, 0b1000000011000000, vl);
      vuint8m2_t vout = __riscv_vreinterpret_v_u16m2_u8m2(vout16);

      /* Every high byte that is zero should be compressed; the low bytes
       * must never be compressed, so we set them to all ones and then build
       * a mask of the non-zero bytes. */
      vbool4_t mcomp =
          __riscv_vmsne_vx_u8m2_b4(__riscv_vreinterpret_v_u16m2_u8m2(
                                       __riscv_vor_vx_u16m2(vout16, 0xFF, vl)),
                                   0, vl * 2);
      vlOut = __riscv_vcpop_m_b4(mcomp, vl * 2);

      vout = __riscv_vcompress_vm_u8m2(vout, mcomp, vl * 2);
      __riscv_vse8_v_u8m2((uint8_t *)dst, vout, vlOut);

      n -= vl, src += vl, dst += vlOut;
      continue;
    }

    vbool8_t sur = __riscv_vmseq_vx_u16m2_b8(
        __riscv_vand_vx_u16m2(v, 0xF800, vl), 0xD800, vl);
    long first = __riscv_vfirst_m_b8(sur, vl);
    size_t tail = vl - first;
    vl = first < 0 ? vl : first;

    if (vl > 0) { /* 1/2/3 byte utf8 */
      /* in: [aaaabbbb|bbcccccc]
       * v1: [0bcccccc|        ] vsll 8
       * v1: [10cccccc|        ] vsll 8 & 0b00111111 | 0b10000000
       * v2: [        |110bbbbb] vsrl 6 & 0b00111111 | 0b11000000
       * v2: [        |10bbbbbb] vsrl 6 & 0b00111111 | 0b10000000
       * v3: [        |1110aaaa] vsrl 12 | 0b11100000
       * 1: [00000000|0bcccccc|00000000|00000000] => [0bcccccc]
       * 2: [00000000|10cccccc|110bbbbb|00000000] => [110bbbbb] [10cccccc]
       * 3: [00000000|10cccccc|10bbbbbb|1110aaaa] => [1110aaaa] [10bbbbbb]
       *    [10cccccc]
       */
      vuint16m2_t v1, v2, v3, v12;
      v1 = __riscv_vor_vx_u16m2_mu(
          m234, v, __riscv_vand_vx_u16m2(v, 0b00111111, vl), 0b10000000, vl);
      v1 = __riscv_vsll_vx_u16m2(v1, 8, vl);

      v2 = __riscv_vor_vx_u16m2(
          __riscv_vand_vx_u16m2(__riscv_vsrl_vx_u16m2(v, 6, vl), 0b00111111,
                                vl),
          0b10000000, vl);
      v2 = __riscv_vor_vx_u16m2_mu(__riscv_vmnot_m_b8(m34, vl), v2, v2,
                                   0b01000000, vl);
      v3 = __riscv_vor_vx_u16m2(__riscv_vsrl_vx_u16m2(v, 12, vl), 0b11100000,
                                vl);
      v12 = __riscv_vor_vv_u16m2_mu(m234, v1, v1, v2, vl);

      vuint32m4_t w12 = __riscv_vwmulu_vx_u32m4(v12, 1 << 8, vl);
      vuint32m4_t w123 = __riscv_vwaddu_wv_u32m4_mu(m34, w12, w12, v3, vl);
      vuint8m4_t vout = __riscv_vreinterpret_v_u32m4_u8m4(w123);

      vbool2_t mcomp = __riscv_vmor_mm_b2(
          m4mulp2, __riscv_vmsne_vx_u8m4_b2(vout, 0, vl * 4), vl * 4);
      vlOut = __riscv_vcpop_m_b2(mcomp, vl * 4);

      vout = __riscv_vcompress_vm_u8m4(vout, mcomp, vl * 4);
      __riscv_vse8_v_u8m4((uint8_t *)dst, vout, vlOut);

      n -= vl, src += vl, dst += vlOut;
    }

    if (tail)
      while (n) {
        uint16_t word = simdutf_byteflip<bflip>(src[0]);
        if ((word & 0xFF80) == 0) {
          break;
        } else if ((word & 0xF800) == 0) {
          break;
        } else if ((word & 0xF800) != 0xD800) {
          break;
        } else {
          // must be a surrogate pair
          if (n <= 1)
            return result(error_code::SURROGATE, src - srcBeg);
          uint16_t diff = word - 0xD800;
          if (diff > 0x3FF)
            return result(error_code::SURROGATE, src - srcBeg);
          uint16_t diff2 = simdutf_byteflip<bflip>(src[1]) - 0xDC00;
          if (diff2 > 0x3FF)
            return result(error_code::SURROGATE, src - srcBeg);

          uint32_t value = ((diff + 0x40) << 10) + diff2;
|
|
|
|
// will generate four UTF-8 bytes
|
|
// we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
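          // illustrative trace: for U+1F600 (0xD83D 0xDE00), diff = 0x3D,
          // diff2 = 0x200, value = ((0x3D + 0x40) << 10) + 0x200 = 0x1F600,
          // which the writes below emit as 0xF0 0x9F 0x98 0x80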
          *dst++ = (char)((value >> 18) | 0b11110000);
          *dst++ = (char)(((value >> 12) & 0b111111) | 0b10000000);
          *dst++ = (char)(((value >> 6) & 0b111111) | 0b10000000);
          *dst++ = (char)((value & 0b111111) | 0b10000000);
          src += 2;
          n -= 2;
        }
      }
  }

  return result(error_code::SUCCESS, dst - dstBeg);
}

simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
    const char16_t *src, size_t len, char *dst) const noexcept {
  result res = convert_utf16le_to_utf8_with_errors(src, len, dst);
  return res.error == error_code::SUCCESS ? res.count : 0;
}

simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
    const char16_t *src, size_t len, char *dst) const noexcept {
  result res = convert_utf16be_to_utf8_with_errors(src, len, dst);
  return res.error == error_code::SUCCESS ? res.count : 0;
}

simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
    const char16_t *src, size_t len, char *dst) const noexcept {
  return rvv_utf16_to_utf8_with_errors<simdutf_ByteFlip::NONE>(src, len, dst);
}

simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
    const char16_t *src, size_t len, char *dst) const noexcept {
  if (supports_zvbb())
    return rvv_utf16_to_utf8_with_errors<simdutf_ByteFlip::ZVBB>(src, len, dst);
  else
    return rvv_utf16_to_utf8_with_errors<simdutf_ByteFlip::V>(src, len, dst);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
    const char16_t *src, size_t len, char *dst) const noexcept {
  return convert_utf16le_to_utf8(src, len, dst);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
    const char16_t *src, size_t len, char *dst) const noexcept {
  return convert_utf16be_to_utf8(src, len, dst);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
template <simdutf_ByteFlip bflip>
simdutf_really_inline static result
rvv_utf16_to_utf32_with_errors(const char16_t *src, size_t len, char32_t *dst) {
  const char16_t *const srcBeg = src;
  char32_t *const dstBeg = dst;

  constexpr const uint16_t ANY_SURROGATE_MASK = 0xf800;
  constexpr const uint16_t ANY_SURROGATE_VALUE = 0xd800;
  constexpr const uint16_t LO_SURROGATE_MASK = 0xfc00;
  constexpr const uint16_t LO_SURROGATE_VALUE = 0xdc00;
  constexpr const uint16_t HI_SURROGATE_MASK = 0xfc00;
  constexpr const uint16_t HI_SURROGATE_VALUE = 0xd800;
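  // e.g. 0xD83D & 0xFC00 == 0xD800 (a high surrogate) while
  // 0xDE00 & 0xFC00 == 0xDC00 (a low surrogate); the 0xF800/0xD800
  // pair above matches either kind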

  uint16_t last = 0;
  while (len > 0) {
    size_t vl = __riscv_vsetvl_e16m2(len);
    vuint16m2_t v0 = __riscv_vle16_v_u16m2((uint16_t const *)src, vl);
    v0 = simdutf_byteflip<bflip>(v0, vl);

    { // check fast-path
      const vuint16m2_t v = __riscv_vand_vx_u16m2(v0, ANY_SURROGATE_MASK, vl);
      const vbool8_t any_surrogate =
          __riscv_vmseq_vx_u16m2_b8(v, ANY_SURROGATE_VALUE, vl);
      if (__riscv_vfirst_m_b8(any_surrogate, vl) < 0) {
        /* no surrogates */
        __riscv_vse32_v_u32m4((uint32_t *)dst, __riscv_vzext_vf2_u32m4(v0, vl),
                              vl);
        len -= vl;
        src += vl;
        dst += vl;
        continue;
      }
    }

    if ((simdutf_byteflip<bflip>(src[0]) & LO_SURROGATE_MASK) ==
        LO_SURROGATE_VALUE) {
      return result(error_code::SURROGATE, src - srcBeg);
    }

    // decode surrogates
    vuint16m2_t v1 = __riscv_vslide1down_vx_u16m2(v0, 0, vl);
    vl = __riscv_vsetvl_e16m2(vl - 1);
    if (vl == 0) {
      return result(error_code::SURROGATE, src - srcBeg);
    }

    const vbool8_t surhi = __riscv_vmseq_vx_u16m2_b8(
        __riscv_vand_vx_u16m2(v0, HI_SURROGATE_MASK, vl), HI_SURROGATE_VALUE,
        vl);
    const vbool8_t surlo = __riscv_vmseq_vx_u16m2_b8(
        __riscv_vand_vx_u16m2(v1, LO_SURROGATE_MASK, vl), LO_SURROGATE_VALUE,
        vl);

    // compress everything but lo surrogates
    const vbool8_t compress = __riscv_vmsne_vx_u16m2_b8(
        __riscv_vand_vx_u16m2(v0, LO_SURROGATE_MASK, vl), LO_SURROGATE_VALUE,
        vl);

    {
      const vbool8_t diff = __riscv_vmxor_mm_b8(surhi, surlo, vl);
      const long idx = __riscv_vfirst_m_b8(diff, vl);
      if (idx >= 0) {
        uint16_t word = simdutf_byteflip<bflip>(src[idx]);
        if (word < 0xD800 || word > 0xDBFF) {
          return result(error_code::SURROGATE, src - srcBeg + idx + 1);
        }
        return result(error_code::SURROGATE, src - srcBeg + idx);
      }
    }

    last = simdutf_byteflip<bflip>(src[vl]);
    vuint32m4_t utf32 = __riscv_vzext_vf2_u32m4(v0, vl);

    // v0 = 110110yyyyyyyyyy (0xd800 + yyyyyyyyyy) --- hi surrogate
    // v1 = 110111xxxxxxxxxx (0xdc00 + xxxxxxxxxx) --- lo surrogate

    // t0 = u16( 0000_00yy_yyyy_yyyy)
    const vuint32m4_t t0 =
        __riscv_vzext_vf2_u32m4(__riscv_vand_vx_u16m2(v0, 0x03ff, vl), vl);
    // t1 = u32(0000_0000_0000_yyyy_yyyy_yy00_0000_0000)
    const vuint32m4_t t1 = __riscv_vsll_vx_u32m4(t0, 10, vl);

    // t2 = u32(0000_0000_0000_0000_0000_00xx_xxxx_xxxx)
    const vuint32m4_t t2 =
        __riscv_vzext_vf2_u32m4(__riscv_vand_vx_u16m2(v1, 0x03ff, vl), vl);

    // t3 = u32(0000_0000_0000_yyyy_yyyy_yyxx_xxxx_xxxx)
    const vuint32m4_t t3 = __riscv_vor_vv_u32m4(t1, t2, vl);

    // t4 = utf32 from surrogate pairs
    const vuint32m4_t t4 = __riscv_vadd_vx_u32m4(t3, 0x10000, vl);
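    // e.g. for the pair 0xD83D 0xDE00: t3 = (0x3D << 10) | 0x200 = 0xF600,
    // and t4 = 0xF600 + 0x10000 = 0x1F600 == U+1F600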

    const vuint32m4_t result = __riscv_vmerge_vvm_u32m4(utf32, t4, surhi, vl);

    const vuint32m4_t comp = __riscv_vcompress_vm_u32m4(result, compress, vl);
    const size_t vlOut = __riscv_vcpop_m_b8(compress, vl);
    __riscv_vse32_v_u32m4((uint32_t *)dst, comp, vlOut);

    len -= vl;
    src += vl;
    dst += vlOut;

    if ((last & LO_SURROGATE_MASK) == LO_SURROGATE_VALUE) {
      // the last item is a lo surrogate and was already consumed
      len -= 1;
      src += 1;
    }
  }

  return result(error_code::SUCCESS, dst - dstBeg);
}

simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
    const char16_t *src, size_t len, char32_t *dst) const noexcept {
  result res = convert_utf16le_to_utf32_with_errors(src, len, dst);
  return res.error == error_code::SUCCESS ? res.count : 0;
}

simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
    const char16_t *src, size_t len, char32_t *dst) const noexcept {
  result res = convert_utf16be_to_utf32_with_errors(src, len, dst);
  return res.error == error_code::SUCCESS ? res.count : 0;
}

simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
    const char16_t *src, size_t len, char32_t *dst) const noexcept {
  return rvv_utf16_to_utf32_with_errors<simdutf_ByteFlip::NONE>(src, len, dst);
}

simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
    const char16_t *src, size_t len, char32_t *dst) const noexcept {
  if (supports_zvbb())
    return rvv_utf16_to_utf32_with_errors<simdutf_ByteFlip::ZVBB>(src, len,
                                                                  dst);
  else
    return rvv_utf16_to_utf32_with_errors<simdutf_ByteFlip::V>(src, len, dst);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
    const char16_t *src, size_t len, char32_t *dst) const noexcept {
  return convert_utf16le_to_utf32(src, len, dst);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
    const char16_t *src, size_t len, char32_t *dst) const noexcept {
  return convert_utf16be_to_utf32(src, len, dst);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
/* end file src/rvv/rvv_utf16_to.inl.cpp */

/* begin file src/rvv/rvv_utf32_to.inl.cpp */
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
    const char32_t *src, size_t len, char *dst) const noexcept {
  result res = convert_utf32_to_latin1_with_errors(src, len, dst);
  return res.error == error_code::SUCCESS ? res.count : 0;
}

simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
    const char32_t *src, size_t len, char *dst) const noexcept {
  const char32_t *const beg = src;
  for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) {
    vl = __riscv_vsetvl_e32m8(len);
    vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl);
    long idx = __riscv_vfirst_m_b4(__riscv_vmsgtu_vx_u32m8_b4(v, 255, vl), vl);
    if (idx >= 0)
      return result(error_code::TOO_LARGE, src - beg + idx);
    /* We don't use vcompress here, because its performance varies widely on
     * current platforms. This might be worth reconsidering once there is more
     * hardware available. */
    __riscv_vse8_v_u8m2(
        (uint8_t *)dst,
        __riscv_vncvt_x_x_w_u8m2(__riscv_vncvt_x_x_w_u16m4(v, vl), vl), vl);
  }
  return result(error_code::SUCCESS, src - beg);
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
    const char32_t *src, size_t len, char *dst) const noexcept {
  return convert_utf32_to_latin1(src, len, dst);
}
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
    const char32_t *src, size_t len, char *dst) const noexcept {
  size_t n = len;
  const char32_t *srcBeg = src;
  const char *dstBeg = dst;
  size_t vl8m4 = __riscv_vsetvlmax_e8m4();
  vbool2_t m4mulp2 = __riscv_vmseq_vx_u8m4_b2(
      __riscv_vand_vx_u8m4(__riscv_vid_v_u8m4(vl8m4), 3, vl8m4), 2, vl8m4);

  for (size_t vl, vlOut; n > 0;) {
    vl = __riscv_vsetvl_e32m4(n);

    vuint32m4_t v = __riscv_vle32_v_u32m4((uint32_t const *)src, vl);
    vbool8_t m234 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x80 - 1, vl);
    vuint16m2_t vn = __riscv_vncvt_x_x_w_u16m2(v, vl);

    if (__riscv_vfirst_m_b8(m234, vl) < 0) { /* 1 byte utf8 */
      vlOut = vl;
      __riscv_vse8_v_u8m1((uint8_t *)dst, __riscv_vncvt_x_x_w_u8m1(vn, vlOut),
                          vlOut);
      n -= vl, src += vl, dst += vlOut;
      continue;
    }

    vbool8_t m34 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x800 - 1, vl);

    if (__riscv_vfirst_m_b8(m34, vl) < 0) { /* 1/2 byte utf8 */
      /* 0: [     aaa|aabbbbbb]
       * 1: [aabbbbbb|        ] vsll 8
       * 2: [        |   aaaaa] vsrl 6
       * 3: [00111111|00111111]
       * 4: [  bbbbbb|000aaaaa] (1|2)&3
       * 5: [10000000|11000000]
       * 6: [10bbbbbb|110aaaaa] 4|5 */
      vuint16m2_t twoByte = __riscv_vand_vx_u16m2(
          __riscv_vor_vv_u16m2(__riscv_vsll_vx_u16m2(vn, 8, vl),
                               __riscv_vsrl_vx_u16m2(vn, 6, vl), vl),
          0b0011111100111111, vl);
      vuint16m2_t vout16 =
          __riscv_vor_vx_u16m2_mu(m234, vn, twoByte, 0b1000000011000000, vl);
      vuint8m2_t vout = __riscv_vreinterpret_v_u16m2_u8m2(vout16);

      /* Every high byte that is zero should be compressed;
       * low bytes should never be compressed, so we set them
       * to all ones, and then create a non-zero bytes mask. */
      vbool4_t mcomp =
          __riscv_vmsne_vx_u8m2_b4(__riscv_vreinterpret_v_u16m2_u8m2(
                                       __riscv_vor_vx_u16m2(vout16, 0xFF, vl)),
                                   0, vl * 2);
      vlOut = __riscv_vcpop_m_b4(mcomp, vl * 2);

      vout = __riscv_vcompress_vm_u8m2(vout, mcomp, vl * 2);
      __riscv_vse8_v_u8m2((uint8_t *)dst, vout, vlOut);

      n -= vl, src += vl, dst += vlOut;
      continue;
    }
    const long idx1 =
        __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0x10FFFF, vl), vl);
    vbool8_t sur = __riscv_vmseq_vx_u32m4_b8(
        __riscv_vand_vx_u32m4(v, 0xFFFFF800, vl), 0xD800, vl);
    const long idx2 = __riscv_vfirst_m_b8(sur, vl);
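    // vfirst returns -1 when no element matches; cast to unsigned it compares
    // greater than any real index, so the earlier of the two errors wins below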
    if (idx1 >= 0 || idx2 >= 0) {
      if (static_cast<unsigned long>(idx1) <=
          static_cast<unsigned long>(idx2)) {
        return result(error_code::TOO_LARGE, src - srcBeg + idx1);
      } else {
        return result(error_code::SURROGATE, src - srcBeg + idx2);
      }
    }

    vbool8_t m4 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x10000 - 1, vl);
    long first = __riscv_vfirst_m_b8(m4, vl);
    size_t tail = vl - first;
    vl = first < 0 ? vl : first;

    if (vl > 0) { /* 1/2/3 byte utf8 */
      /* vn: [aaaabbbb|bbcccccc]
       * v1: [0bcccccc|        ] vsll 8
       * v1: [10cccccc|        ] vsll 8 & 0b00111111 | 0b10000000
       * v2: [        |110bbbbb] vsrl 6 & 0b00111111 | 0b11000000
       * v2: [        |10bbbbbb] vsrl 6 & 0b00111111 | 0b10000000
       * v3: [        |1110aaaa] vsrl 12 | 0b11100000
       * 1: [00000000|0bcccccc|00000000|00000000] => [0bcccccc]
       * 2: [00000000|10cccccc|110bbbbb|00000000] => [110bbbbb] [10cccccc]
       * 3: [00000000|10cccccc|10bbbbbb|1110aaaa] => [1110aaaa] [10bbbbbb]
       *                                             [10cccccc]
       */
      vuint16m2_t v1, v2, v3, v12;
      v1 = __riscv_vor_vx_u16m2_mu(
          m234, vn, __riscv_vand_vx_u16m2(vn, 0b00111111, vl), 0b10000000, vl);
      v1 = __riscv_vsll_vx_u16m2(v1, 8, vl);

      v2 = __riscv_vor_vx_u16m2(
          __riscv_vand_vx_u16m2(__riscv_vsrl_vx_u16m2(vn, 6, vl), 0b00111111,
                                vl),
          0b10000000, vl);
      v2 = __riscv_vor_vx_u16m2_mu(__riscv_vmnot_m_b8(m34, vl), v2, v2,
                                   0b01000000, vl);
      v3 = __riscv_vor_vx_u16m2(__riscv_vsrl_vx_u16m2(vn, 12, vl), 0b11100000,
                                vl);
      v12 = __riscv_vor_vv_u16m2_mu(m234, v1, v1, v2, vl);

      vuint32m4_t w12 = __riscv_vwmulu_vx_u32m4(v12, 1 << 8, vl);
      vuint32m4_t w123 = __riscv_vwaddu_wv_u32m4_mu(m34, w12, w12, v3, vl);
      vuint8m4_t vout = __riscv_vreinterpret_v_u32m4_u8m4(w123);

      vbool2_t mcomp = __riscv_vmor_mm_b2(
          m4mulp2, __riscv_vmsne_vx_u8m4_b2(vout, 0, vl * 4), vl * 4);
      vlOut = __riscv_vcpop_m_b2(mcomp, vl * 4);

      vout = __riscv_vcompress_vm_u8m4(vout, mcomp, vl * 4);
      __riscv_vse8_v_u8m4((uint8_t *)dst, vout, vlOut);

      n -= vl, src += vl, dst += vlOut;
    }

    if (tail)
      while (n) {
        uint32_t word = src[0];
        if (word < 0x10000)
          break;
        if (word > 0x10FFFF)
          return result(error_code::TOO_LARGE, src - srcBeg);
        *dst++ = (uint8_t)((word >> 18) | 0b11110000);
        *dst++ = (uint8_t)(((word >> 12) & 0b111111) | 0b10000000);
        *dst++ = (uint8_t)(((word >> 6) & 0b111111) | 0b10000000);
        *dst++ = (uint8_t)((word & 0b111111) | 0b10000000);
        ++src;
        --n;
      }
  }

  return result(error_code::SUCCESS, dst - dstBeg);
}

simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
    const char32_t *src, size_t len, char *dst) const noexcept {
  result res = convert_utf32_to_utf8_with_errors(src, len, dst);
  return res.error == error_code::SUCCESS ? res.count : 0;
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
    const char32_t *src, size_t len, char *dst) const noexcept {
  return convert_utf32_to_utf8(src, len, dst);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
template <simdutf_ByteFlip bflip>
simdutf_really_inline static result
rvv_convert_utf32_to_utf16_with_errors(const char32_t *src, size_t len,
                                       char16_t *dst) {
  size_t vl8m2 = __riscv_vsetvlmax_e8m2();
  vbool4_t m4even = __riscv_vmseq_vx_u8m2_b4(
      __riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(vl8m2), 1, vl8m2), 0, vl8m2);
  const char16_t *dstBeg = dst;
  const char32_t *srcBeg = src;
  for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut) {
    vl = __riscv_vsetvl_e32m4(len);
    vuint32m4_t v = __riscv_vle32_v_u32m4((uint32_t *)src, vl);
    vuint32m4_t off = __riscv_vadd_vx_u32m4(v, 0xFFFF2000, vl);
    const long idx1 =
        __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0x10FFFF, vl), vl);
    const long idx2 = __riscv_vfirst_m_b8(
        __riscv_vmsgtu_vx_u32m4_b8(off, 0xFFFFF7FF, vl), vl);
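    // the 0xFFFF2000 bias maps the surrogate range onto the top of the 32-bit
    // space: v in [0xD800, 0xDFFF] wraps to [0xFFFFF800, 0xFFFFFFFF], so one
    // unsigned compare against 0xFFFFF7FF flags every surrogate code point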
    if (idx1 >= 0 || idx2 >= 0) {
      if (static_cast<unsigned long>(idx1) <=
          static_cast<unsigned long>(idx2)) {
        return result(error_code::TOO_LARGE, src - srcBeg + idx1);
      } else {
        return result(error_code::SURROGATE, src - srcBeg + idx2);
      }
    }
    const long idx =
        __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0xFFFF, vl), vl);
    if (idx < 0) {
      vlOut = vl;
      vuint16m2_t n =
          simdutf_byteflip<bflip>(__riscv_vncvt_x_x_w_u16m2(v, vlOut), vlOut);
      __riscv_vse16_v_u16m2((uint16_t *)dst, n, vlOut);
      continue;
    }
    vlOut = rvv_utf32_store_utf16_m4<bflip>((uint16_t *)dst, v, vl, m4even);
  }
  return result(error_code::SUCCESS, dst - dstBeg);
}

simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
    const char32_t *src, size_t len, char16_t *dst) const noexcept {
  result res = convert_utf32_to_utf16le_with_errors(src, len, dst);
  return res.error == error_code::SUCCESS ? res.count : 0;
}

simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
    const char32_t *src, size_t len, char16_t *dst) const noexcept {
  result res = convert_utf32_to_utf16be_with_errors(src, len, dst);
  return res.error == error_code::SUCCESS ? res.count : 0;
}

simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
    const char32_t *src, size_t len, char16_t *dst) const noexcept {
  return rvv_convert_utf32_to_utf16_with_errors<simdutf_ByteFlip::NONE>(
      src, len, dst);
}

simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
    const char32_t *src, size_t len, char16_t *dst) const noexcept {
  if (supports_zvbb())
    return rvv_convert_utf32_to_utf16_with_errors<simdutf_ByteFlip::ZVBB>(
        src, len, dst);
  else
    return rvv_convert_utf32_to_utf16_with_errors<simdutf_ByteFlip::V>(src, len,
                                                                       dst);
}

template <simdutf_ByteFlip bflip>
simdutf_really_inline static size_t
rvv_convert_valid_utf32_to_utf16(const char32_t *src, size_t len,
                                 char16_t *dst) {
  size_t vl8m2 = __riscv_vsetvlmax_e8m2();
  vbool4_t m4even = __riscv_vmseq_vx_u8m2_b4(
      __riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(vl8m2), 1, vl8m2), 0, vl8m2);
  char16_t *dstBeg = dst;
  for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut) {
    vl = __riscv_vsetvl_e32m4(len);
    vuint32m4_t v = __riscv_vle32_v_u32m4((uint32_t *)src, vl);
    if (__riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0xFFFF, vl), vl) <
        0) {
      vlOut = vl;
      vuint16m2_t n =
          simdutf_byteflip<bflip>(__riscv_vncvt_x_x_w_u16m2(v, vlOut), vlOut);
      __riscv_vse16_v_u16m2((uint16_t *)dst, n, vlOut);
      continue;
    }
    vlOut = rvv_utf32_store_utf16_m4<bflip>((uint16_t *)dst, v, vl, m4even);
  }
  return dst - dstBeg;
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
    const char32_t *src, size_t len, char16_t *dst) const noexcept {
  return rvv_convert_valid_utf32_to_utf16<simdutf_ByteFlip::NONE>(src, len,
                                                                  dst);
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
    const char32_t *src, size_t len, char16_t *dst) const noexcept {
  if (supports_zvbb())
    return rvv_convert_valid_utf32_to_utf16<simdutf_ByteFlip::ZVBB>(src, len,
                                                                    dst);
  else
    return rvv_convert_valid_utf32_to_utf16<simdutf_ByteFlip::V>(src, len, dst);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
/* end file src/rvv/rvv_utf32_to.inl.cpp */
/* begin file src/rvv/rvv_utf8_to.inl.cpp */
#if SIMDUTF_FEATURE_UTF8 && (SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32)
template <typename Tdst, simdutf_ByteFlip bflip, bool validate = true>
simdutf_really_inline static size_t rvv_utf8_to_common(char const *src,
                                                       size_t len, Tdst *dst) {
  static_assert(std::is_same<Tdst, uint16_t>() ||
                    std::is_same<Tdst, uint32_t>(),
                "invalid type");
  constexpr bool is16 = std::is_same<Tdst, uint16_t>();
  constexpr endianness endian =
      bflip == simdutf_ByteFlip::NONE ? endianness::LITTLE : endianness::BIG;
  const auto scalar = [](char const *in, size_t count, Tdst *out) {
    return is16 ? scalar::utf8_to_utf16::convert<endian>(in, count,
                                                         (char16_t *)out)
                : scalar::utf8_to_utf32::convert(in, count, (char32_t *)out);
  };

  if (len < 32)
    return scalar(src, len, dst);

  /* validate first three bytes */
  if (validate) {
    size_t idx = 3;
    while (idx < len && (uint8_t(src[idx]) >> 6) == 0b10)
      ++idx;
    if (idx > 3 + 3 || !scalar::utf8::validate(src, idx))
      return 0;
  }

  size_t tail = 3;
  size_t n = len - tail;
  Tdst *beg = dst;

  static const uint64_t err1m[] = {0x0202020202020202, 0x4915012180808080};
  static const uint64_t err2m[] = {0xCBCBCB8B8383A3E7, 0xCBCBDBCBCBCBCBCB};
  static const uint64_t err3m[] = {0x0101010101010101, 0x01010101BABAAEE6};

  const vuint8m1_t err1tbl =
      __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err1m, 2));
  const vuint8m1_t err2tbl =
      __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err2m, 2));
  const vuint8m1_t err3tbl =
      __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err3m, 2));

  size_t vl8m1 = __riscv_vsetvlmax_e8m1();
  size_t vl8m2 = __riscv_vsetvlmax_e8m2();
  vbool4_t m4even = __riscv_vmseq_vx_u8m2_b4(
      __riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(vl8m2), 1, vl8m2), 0, vl8m2);

  for (size_t vl, vlOut; n > 0; n -= vl, src += vl, dst += vlOut) {
    vl = __riscv_vsetvl_e8m2(n);

    vuint8m2_t v0 = __riscv_vle8_v_u8m2((uint8_t const *)src, vl);
    uint64_t max = __riscv_vmv_x_s_u8m1_u8(
        __riscv_vredmaxu_vs_u8m2_u8m1(v0, __riscv_vmv_s_x_u8m1(0, vl), vl));

    uint8_t next0 = src[vl + 0];
    uint8_t next1 = src[vl + 1];
    uint8_t next2 = src[vl + 2];

    /* fast path: ASCII */
    if ((max | next0 | next1 | next2) < 0b10000000) {
      vlOut = vl;
      if (is16)
        __riscv_vse16_v_u16m4(
            (uint16_t *)dst,
            simdutf_byteflip<bflip>(__riscv_vzext_vf2_u16m4(v0, vlOut), vlOut),
            vlOut);
      else
        __riscv_vse32_v_u32m8((uint32_t *)dst,
                              __riscv_vzext_vf4_u32m8(v0, vlOut), vlOut);
      continue;
    }

    /* see "Validating UTF-8 In Less Than One Instruction Per Byte"
     * https://arxiv.org/abs/2010.03090 */
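    /* Roughly, following that paper (explanatory note): the high and low
     * nibbles of the previous byte and the high nibble of the current byte
     * index three 16-entry tables, and the AND of the three lookups is
     * nonzero exactly for invalid byte pairs, with 3-/4-byte sequences
     * special-cased through the sign bit further below. */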
    vuint8m2_t v1 = __riscv_vslide1down_vx_u8m2(v0, next0, vl);
    vuint8m2_t v2 = __riscv_vslide1down_vx_u8m2(v1, next1, vl);
    vuint8m2_t v3 = __riscv_vslide1down_vx_u8m2(v2, next2, vl);

    if (validate) {
      vuint8m2_t idx2 = __riscv_vand_vx_u8m2(v2, 0xF, vl);
      vuint8m2_t idx1 = __riscv_vsrl_vx_u8m2(v2, 4, vl);
      vuint8m2_t idx3 = __riscv_vsrl_vx_u8m2(v3, 4, vl);

      vuint8m2_t err1 = simdutf_vrgather_u8m1x2(err1tbl, idx1);
      vuint8m2_t err2 = simdutf_vrgather_u8m1x2(err2tbl, idx2);
      vuint8m2_t err3 = simdutf_vrgather_u8m1x2(err3tbl, idx3);
      vint8m2_t errs = __riscv_vreinterpret_v_u8m2_i8m2(
          __riscv_vand_vv_u8m2(__riscv_vand_vv_u8m2(err1, err2, vl), err3, vl));

      vbool4_t is_3 = __riscv_vmsgtu_vx_u8m2_b4(v1, 0b11100000 - 1, vl);
      vbool4_t is_4 = __riscv_vmsgtu_vx_u8m2_b4(v0, 0b11110000 - 1, vl);
      vbool4_t is_34 = __riscv_vmor_mm_b4(is_3, is_4, vl);
      vbool4_t err34 =
          __riscv_vmxor_mm_b4(is_34, __riscv_vmslt_vx_i8m2_b4(errs, 0, vl), vl);
      vbool4_t errm =
          __riscv_vmor_mm_b4(__riscv_vmsgt_vx_i8m2_b4(errs, 0, vl), err34, vl);
      if (__riscv_vfirst_m_b4(errm, vl) >= 0)
        return 0;
    }

    /* decoding */

    /* mask of non continuation bytes */
    vbool4_t m =
        __riscv_vmsgt_vx_i8m2_b4(__riscv_vreinterpret_v_u8m2_i8m2(v0), -65, vl);
    vlOut = __riscv_vcpop_m_b4(m, vl);

    /* extract first and second bytes */
    vuint8m2_t b1 = __riscv_vcompress_vm_u8m2(v0, m, vl);
    vuint8m2_t b2 = __riscv_vcompress_vm_u8m2(v1, m, vl);

    /* fast path: one and two byte */
    if (max < 0b11100000) {
      b2 = __riscv_vand_vx_u8m2(b2, 0b00111111, vlOut);

      vbool4_t m1 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b10111111, vlOut);
      b1 = __riscv_vand_vx_u8m2_mu(m1, b1, b1, 63, vlOut);

      vuint16m4_t b12 = __riscv_vwmulu_vv_u16m4(
          b1,
          __riscv_vmerge_vxm_u8m2(__riscv_vmv_v_x_u8m2(1, vlOut), 1 << 6, m1,
                                  vlOut),
          vlOut);
      b12 = __riscv_vwaddu_wv_u16m4_mu(m1, b12, b12, b2, vlOut);
      if (is16)
        __riscv_vse16_v_u16m4((uint16_t *)dst,
                              simdutf_byteflip<bflip>(b12, vlOut), vlOut);
      else
        __riscv_vse32_v_u32m8((uint32_t *)dst,
                              __riscv_vzext_vf2_u32m8(b12, vlOut), vlOut);
      continue;
    }

    /* fast path: one, two and three byte */
    if (max < 0b11110000) {
      vuint8m2_t b3 = __riscv_vcompress_vm_u8m2(v2, m, vl);

      b2 = __riscv_vand_vx_u8m2(b2, 0b00111111, vlOut);
      b3 = __riscv_vand_vx_u8m2(b3, 0b00111111, vlOut);

      vbool4_t m1 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b10111111, vlOut);
      vbool4_t m3 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b11011111, vlOut);

      vuint8m2_t t1 = __riscv_vand_vx_u8m2_mu(m1, b1, b1, 63, vlOut);
      b1 = __riscv_vand_vx_u8m2_mu(m3, t1, b1, 15, vlOut);

      vuint16m4_t b12 = __riscv_vwmulu_vv_u16m4(
          b1,
          __riscv_vmerge_vxm_u8m2(__riscv_vmv_v_x_u8m2(1, vlOut), 1 << 6, m1,
                                  vlOut),
          vlOut);
      b12 = __riscv_vwaddu_wv_u16m4_mu(m1, b12, b12, b2, vlOut);
      vuint16m4_t b123 = __riscv_vwaddu_wv_u16m4_mu(
          m3, b12, __riscv_vsll_vx_u16m4_mu(m3, b12, b12, 6, vlOut), b3, vlOut);
      if (is16)
        __riscv_vse16_v_u16m4((uint16_t *)dst,
                              simdutf_byteflip<bflip>(b123, vlOut), vlOut);
      else
        __riscv_vse32_v_u32m8((uint32_t *)dst,
                              __riscv_vzext_vf2_u32m8(b123, vlOut), vlOut);
      continue;
    }

    /* extract third and fourth bytes */
    vuint8m2_t b3 = __riscv_vcompress_vm_u8m2(v2, m, vl);
    vuint8m2_t b4 = __riscv_vcompress_vm_u8m2(v3, m, vl);

    /* remove prefix from leading bytes
     *
     * We could also use vrgather here, but it increases register pressure,
     * and its performance varies widely on current platforms. It might be
     * worth reconsidering, though, once there is more hardware available.
     * Same goes for the __riscv_vsrl_vv_u32m4 correction step.
     *
     * We shift left and then right by the number of bits in the prefix,
     * which can be calculated as follows:
     *           x                               max(x-10, 0)
     *   0xxx -> 0000-0111 -> shift by 0 or 1 -> 0
     *   10xx -> 1000-1011 -> don't care
     *   110x -> 1100,1101 -> shift by 3      -> 2,3
     *   1110 -> 1110      -> shift by 4      -> 4
     *   1111 -> 1111      -> shift by 5      -> 5
     *
     * vssubu.vx v, 10, (max(x-10, 0)) almost gives us what we want; we
     * just need to manually detect and handle the one special case:
     */
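    /* For example: a lead byte 0xE2 has high nibble x = 14, so vssubu yields
     * 14 - 10 = 4 and the 4-bit "1110" prefix is shifted away; 0xC3 has
     * x = 12, where vssubu yields 2 even though the "110" prefix is 3 bits
     * wide -- that is the special case patched by the vmerge below. */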
#define SIMDUTF_RVV_UTF8_TO_COMMON_M1(idx)                                     \
  vuint8m1_t c1 = __riscv_vget_v_u8m2_u8m1(b1, idx);                           \
  vuint8m1_t c2 = __riscv_vget_v_u8m2_u8m1(b2, idx);                           \
  vuint8m1_t c3 = __riscv_vget_v_u8m2_u8m1(b3, idx);                           \
  vuint8m1_t c4 = __riscv_vget_v_u8m2_u8m1(b4, idx);                           \
  /* remove prefix from trailing bytes */                                      \
  c2 = __riscv_vand_vx_u8m1(c2, 0b00111111, vlOut);                            \
  c3 = __riscv_vand_vx_u8m1(c3, 0b00111111, vlOut);                            \
  c4 = __riscv_vand_vx_u8m1(c4, 0b00111111, vlOut);                            \
  vuint8m1_t shift = __riscv_vsrl_vx_u8m1(c1, 4, vlOut);                       \
  shift = __riscv_vmerge_vxm_u8m1(                                             \
      __riscv_vssubu_vx_u8m1(shift, 10, vlOut), 3,                             \
      __riscv_vmseq_vx_u8m1_b8(shift, 12, vlOut), vlOut);                      \
  c1 = __riscv_vsll_vv_u8m1(c1, shift, vlOut);                                 \
  c1 = __riscv_vsrl_vv_u8m1(c1, shift, vlOut);                                 \
  /* unconditionally widen and combine to c1234 */                             \
  vuint16m2_t c34 = __riscv_vwaddu_wv_u16m2(                                   \
      __riscv_vwmulu_vx_u16m2(c3, 1 << 6, vlOut), c4, vlOut);                  \
  vuint16m2_t c12 = __riscv_vwaddu_wv_u16m2(                                   \
      __riscv_vwmulu_vx_u16m2(c1, 1 << 6, vlOut), c2, vlOut);                  \
  vuint32m4_t c1234 = __riscv_vwaddu_wv_u32m4(                                 \
      __riscv_vwmulu_vx_u32m4(c12, 1 << 12, vlOut), c34, vlOut);               \
  /* derive required right-shift amount from `shift` to reduce                 \
   * c1234 to the required number of bytes */                                  \
  c1234 = __riscv_vsrl_vv_u32m4(                                               \
      c1234,                                                                   \
      __riscv_vzext_vf4_u32m4(                                                 \
          __riscv_vmul_vx_u8m1(                                                \
              __riscv_vrsub_vx_u8m1(__riscv_vssubu_vx_u8m1(shift, 2, vlOut),   \
                                    3, vlOut),                                 \
              6, vlOut),                                                       \
          vlOut),                                                              \
      vlOut);                                                                  \
  /* store result in desired format */                                         \
  if (is16)                                                                    \
    vlDst = rvv_utf32_store_utf16_m4<bflip>((uint16_t *)dst, c1234, vlOut,     \
                                            m4even);                           \
  else                                                                         \
    vlDst = vlOut, __riscv_vse32_v_u32m4((uint32_t *)dst, c1234, vlOut);

    /* Unrolling this manually reduces register pressure and allows
     * us to terminate early. */
    {
      size_t vlOutm2 = vlOut, vlDst;
      vlOut = __riscv_vsetvl_e8m1(vlOut < vl8m1 ? vlOut : vl8m1);
      SIMDUTF_RVV_UTF8_TO_COMMON_M1(0)
      if (vlOutm2 == vlOut) {
        vlOut = vlDst;
        continue;
      }

      dst += vlDst;
      vlOut = vlOutm2 - vlOut;
    }
    {
      size_t vlDst;
      SIMDUTF_RVV_UTF8_TO_COMMON_M1(1)
      vlOut = vlDst;
    }

#undef SIMDUTF_RVV_UTF8_TO_COMMON_M1
  }

  /* validate the last character and reparse it + tail */
  if (len > tail) {
    if ((uint8_t(src[0]) >> 6) == 0b10)
      --dst;
    while ((uint8_t(src[0]) >> 6) == 0b10 && tail < len)
      --src, ++tail;
    if (is16) {
      /* go back one more, when on high surrogate */
      if (simdutf_byteflip<bflip>((uint16_t)dst[-1]) >= 0xD800 &&
          simdutf_byteflip<bflip>((uint16_t)dst[-1]) <= 0xDBFF)
        --dst;
    }
  }
  size_t ret = scalar(src, tail, dst);
  if (ret == 0)
    return 0;
  return (size_t)(dst - beg) + ret;
}
#endif // SIMDUTF_FEATURE_UTF8 && (SIMDUTF_FEATURE_UTF16 ||
       // SIMDUTF_FEATURE_UTF32)

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
    const char *src, size_t len, char *dst) const noexcept {
  const char *beg = dst;
  uint8_t last = 0;
  for (size_t vl, vlOut; len > 0;
       len -= vl, src += vl, dst += vlOut, last = src[-1]) {
    vl = __riscv_vsetvl_e8m2(len);
    vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t *)src, vl);
    // check which bytes are ASCII
    vbool4_t ascii = __riscv_vmsltu_vx_u8m2_b4(v1, 0b10000000, vl);
    // count ASCII bytes
    vlOut = __riscv_vcpop_m_b4(ascii, vl);
    // The original code would only enter the next block after this check:
    //   vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl);
    //   vlOut = __riscv_vcpop_m_b4(m, vl);
    //   if (vlOut != vl || last > 0b01111111) {...}
    // So when everything was ASCII or continuation bytes, we proceeded
    // without any processing, going straight to __riscv_vse8_v_u8m2.
    // But the __riscv_vslide1up_vx_u8m2 is needed whenever there is a
    // non-ASCII byte.
    if (vlOut != vl) { // If not pure ASCII
      // Non-ASCII characters
      // We now want to mark the ASCII and continuation bytes
      vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl);
      // We count them; that is our new vlOut (output vector length)
      vlOut = __riscv_vcpop_m_b4(m, vl);

      vuint8m2_t v0 = __riscv_vslide1up_vx_u8m2(v1, last, vl);

      vbool4_t leading0 = __riscv_vmsgtu_vx_u8m2_b4(v0, 0b10111111, vl);
      vbool4_t trailing1 = __riscv_vmslt_vx_i8m2_b4(
          __riscv_vreinterpret_v_u8m2_i8m2(v1), (uint8_t)0b11000000, vl);
      // -62 is 0b11000010 (0xC2); the XOR trick flags every leading byte
      // other than 0xC2 or 0xC3, the only leads a Latin-1 string can use
      vbool4_t tobig = __riscv_vmand_mm_b4(
          leading0,
          __riscv_vmsgtu_vx_u8m2_b4(__riscv_vxor_vx_u8m2(v0, (uint8_t)-62, vl),
                                    1, vl),
          vl);
      if (__riscv_vfirst_m_b4(
              __riscv_vmor_mm_b4(
                  tobig, __riscv_vmxor_mm_b4(leading0, trailing1, vl), vl),
              vl) >= 0)
        return 0;

      v1 = __riscv_vor_vx_u8m2_mu(__riscv_vmseq_vx_u8m2_b4(v0, 0b11000011, vl),
                                  v1, v1, 0b01000000, vl);
      v1 = __riscv_vcompress_vm_u8m2(v1, m, vl);
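      // after a 0xC3 lead the continuation byte 0b10zzzzzz must decode to
      // 0b11zzzzzz, hence the OR with 0b01000000 above; after 0xC2 the
      // continuation byte is already the Latin-1 value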
    } else if (last >= 0b11000000) { // If last byte is a leading byte and we
                                     // got only ASCII, error!
      return 0;
    }
    __riscv_vse8_v_u8m2((uint8_t *)dst, v1, vlOut);
  }
  if (last > 0b10111111)
    return 0;
  return dst - beg;
}

simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
    const char *src, size_t len, char *dst) const noexcept {
  size_t res = convert_utf8_to_latin1(src, len, dst);
  if (res)
    return result(error_code::SUCCESS, res);
  return scalar::utf8_to_latin1::convert_with_errors(src, len, dst);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
    const char *src, size_t len, char *dst) const noexcept {
  const char *beg = dst;
  uint8_t last = 0;
  for (size_t vl, vlOut; len > 0;
       len -= vl, src += vl, dst += vlOut, last = src[-1]) {
    vl = __riscv_vsetvl_e8m2(len);
    vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t *)src, vl);
    vbool4_t ascii = __riscv_vmsltu_vx_u8m2_b4(v1, 0b10000000, vl);
    vlOut = __riscv_vcpop_m_b4(ascii, vl);
    if (vlOut != vl) { // If not pure ASCII
      vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl);
      vlOut = __riscv_vcpop_m_b4(m, vl);
      vuint8m2_t v0 = __riscv_vslide1up_vx_u8m2(v1, last, vl);
      v1 = __riscv_vor_vx_u8m2_mu(__riscv_vmseq_vx_u8m2_b4(v0, 0b11000011, vl),
                                  v1, v1, 0b01000000, vl);
      v1 = __riscv_vcompress_vm_u8m2(v1, m, vl);
    }
    __riscv_vse8_v_u8m2((uint8_t *)dst, v1, vlOut);
  }
  return dst - beg;
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
    const char *src, size_t len, char16_t *dst) const noexcept {
  return rvv_utf8_to_common<uint16_t, simdutf_ByteFlip::NONE>(src, len,
                                                              (uint16_t *)dst);
}

simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
    const char *src, size_t len, char16_t *dst) const noexcept {
  if (supports_zvbb())
    return rvv_utf8_to_common<uint16_t, simdutf_ByteFlip::ZVBB>(
        src, len, (uint16_t *)dst);
  else
    return rvv_utf8_to_common<uint16_t, simdutf_ByteFlip::V>(src, len,
                                                             (uint16_t *)dst);
}

simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
    const char *src, size_t len, char16_t *dst) const noexcept {
  size_t res = convert_utf8_to_utf16le(src, len, dst);
  if (res)
    return result(error_code::SUCCESS, res);
  return scalar::utf8_to_utf16::convert_with_errors<endianness::LITTLE>(
      src, len, dst);
}

simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
    const char *src, size_t len, char16_t *dst) const noexcept {
  size_t res = convert_utf8_to_utf16be(src, len, dst);
  if (res)
    return result(error_code::SUCCESS, res);
  return scalar::utf8_to_utf16::convert_with_errors<endianness::BIG>(src, len,
                                                                     dst);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
    const char *src, size_t len, char16_t *dst) const noexcept {
  return rvv_utf8_to_common<uint16_t, simdutf_ByteFlip::NONE, false>(
      src, len, (uint16_t *)dst);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
    const char *src, size_t len, char16_t *dst) const noexcept {
  if (supports_zvbb())
    return rvv_utf8_to_common<uint16_t, simdutf_ByteFlip::ZVBB, false>(
        src, len, (uint16_t *)dst);
  else
    return rvv_utf8_to_common<uint16_t, simdutf_ByteFlip::V, false>(
        src, len, (uint16_t *)dst);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
    const char *src, size_t len, char32_t *dst) const noexcept {
  return rvv_utf8_to_common<uint32_t, simdutf_ByteFlip::NONE>(src, len,
                                                              (uint32_t *)dst);
}

simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
    const char *src, size_t len, char32_t *dst) const noexcept {
  size_t res = convert_utf8_to_utf32(src, len, dst);
  if (res)
    return result(error_code::SUCCESS, res);
  return scalar::utf8_to_utf32::convert_with_errors(src, len, dst);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
    const char *src, size_t len, char32_t *dst) const noexcept {
  return rvv_utf8_to_common<uint32_t, simdutf_ByteFlip::NONE, false>(
      src, len, (uint32_t *)dst);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
/* end file src/rvv/rvv_utf8_to.inl.cpp */

#if SIMDUTF_FEATURE_DETECT_ENCODING
simdutf_warn_unused int
implementation::detect_encodings(const char *input,
                                 size_t length) const noexcept {
  // If there is a BOM, then we trust it.
  auto bom_encoding = simdutf::BOM::check_bom(input, length);
  if (bom_encoding != encoding_type::unspecified)
    return bom_encoding;
  // todo: reimplement as a one-pass algorithm.
  int out = 0;
  if (validate_utf8(input, length))
    out |= encoding_type::UTF8;
  if (length % 2 == 0) {
    if (validate_utf16le(reinterpret_cast<const char16_t *>(input), length / 2))
      out |= encoding_type::UTF16_LE;
  }
  if (length % 4 == 0) {
    if (validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4))
      out |= encoding_type::UTF32_LE;
  }

  return out;
}
#endif // SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF16

void implementation::to_well_formed_utf16le(const char16_t *input, size_t len,
                                            char16_t *output) const noexcept {
  return scalar::utf16::to_well_formed_utf16<endianness::LITTLE>(input, len,
                                                                 output);
}

void implementation::to_well_formed_utf16be(const char16_t *input, size_t len,
                                            char16_t *output) const noexcept {
  return scalar::utf16::to_well_formed_utf16<endianness::BIG>(input, len,
                                                              output);
}

template <simdutf_ByteFlip bflip>
simdutf_really_inline static void
rvv_change_endianness_utf16(const char16_t *src, size_t len, char16_t *dst) {
  for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) {
    vl = __riscv_vsetvl_e16m8(len);
    vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl);
    __riscv_vse16_v_u16m8((uint16_t *)dst, simdutf_byteflip<bflip>(v, vl), vl);
  }
}

void implementation::change_endianness_utf16(const char16_t *src, size_t len,
                                             char16_t *dst) const noexcept {
  if (supports_zvbb())
    return rvv_change_endianness_utf16<simdutf_ByteFlip::ZVBB>(src, len, dst);
  else
    return rvv_change_endianness_utf16<simdutf_ByteFlip::V>(src, len, dst);
}
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_BASE64
simdutf_warn_unused result implementation::base64_to_binary(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  const bool ignore_garbage =
      (options == base64_options::base64_url_accept_garbage) ||
      (options == base64_options::base64_default_accept_garbage);
  while (length > 0 &&
         scalar::base64::is_ascii_white_space(input[length - 1])) {
    length--;
  }
  size_t equallocation =
      length; // location of the first padding character if any
  size_t equalsigns = 0;
  if (length > 0 && input[length - 1] == '=') {
    equallocation = length - 1;
    length -= 1;
    equalsigns++;
    while (length > 0 &&
           scalar::base64::is_ascii_white_space(input[length - 1])) {
      length--;
    }
    if (length > 0 && input[length - 1] == '=') {
      equallocation = length - 1;
      equalsigns++;
      length -= 1;
    }
  }
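  // e.g. for "QQ==" this scan leaves length == 2, equalsigns == 2, and
  // equallocation == 2 (the index of the leading '=')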
  if (length == 0) {
    if (!ignore_garbage && equalsigns > 0) {
      if (last_chunk_options == last_chunk_handling_options::strict) {
        return {BASE64_INPUT_REMAINDER, 0};
      } else if (last_chunk_options ==
                 last_chunk_handling_options::stop_before_partial) {
        return {SUCCESS, 0};
      }
      return {INVALID_BASE64_CHARACTER, equallocation};
    }
    return {SUCCESS, 0};
  }
  result r = scalar::base64::base64_tail_decode(
      output, input, length, equalsigns, options, last_chunk_options);
  if (last_chunk_options != stop_before_partial &&
      r.error == error_code::SUCCESS && equalsigns > 0 && !ignore_garbage) {
    // additional checks
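    // a well-padded final quantum satisfies (count % 3) + 1 + equalsigns == 4:
    // one decoded byte pairs with "==" (1 + 1 + 2) and two decoded bytes
    // pair with "=" (2 + 1 + 1)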
    if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) {
      return {INVALID_BASE64_CHARACTER, equallocation};
    }
  }
  return r;
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  const bool ignore_garbage =
      (options == base64_options::base64_url_accept_garbage) ||
      (options == base64_options::base64_default_accept_garbage);
  while (length > 0 &&
         scalar::base64::is_ascii_white_space(input[length - 1])) {
    length--;
  }
  size_t equallocation =
      length; // location of the first padding character if any
  size_t equalsigns = 0;
  if (length > 0 && input[length - 1] == '=') {
    equallocation = length - 1;
    length -= 1;
    equalsigns++;
    while (length > 0 &&
           scalar::base64::is_ascii_white_space(input[length - 1])) {
      length--;
    }
    if (length > 0 && input[length - 1] == '=') {
      equallocation = length - 1;
      equalsigns++;
      length -= 1;
    }
  }
  if (length == 0) {
    if (!ignore_garbage && equalsigns > 0) {
      if (last_chunk_options == last_chunk_handling_options::strict) {
        return {BASE64_INPUT_REMAINDER, 0, 0};
      } else if (last_chunk_options ==
                 last_chunk_handling_options::stop_before_partial) {
        return {SUCCESS, 0, 0};
      }
      return {INVALID_BASE64_CHARACTER, equallocation, 0};
    }
    return {SUCCESS, 0, 0};
  }
  full_result r = scalar::base64::base64_tail_decode(
      output, input, length, equalsigns, options, last_chunk_options);
  if (last_chunk_options != stop_before_partial &&
      r.error == error_code::SUCCESS && equalsigns > 0 && !ignore_garbage) {
    // additional checks
    if ((r.output_count % 3 == 0) ||
        ((r.output_count % 3) + 1 + equalsigns != 4)) {
      return {INVALID_BASE64_CHARACTER, equallocation, r.output_count};
    }
  }
  return r;
}

simdutf_warn_unused result implementation::base64_to_binary(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  const bool ignore_garbage =
      (options == base64_options::base64_url_accept_garbage) ||
      (options == base64_options::base64_default_accept_garbage);
  while (length > 0 &&
         scalar::base64::is_ascii_white_space(input[length - 1])) {
    length--;
  }
  size_t equallocation =
      length; // location of the first padding character if any
  auto equalsigns = 0;
  if (length > 0 && input[length - 1] == '=') {
    equallocation = length - 1;
    length -= 1;
    equalsigns++;
    while (length > 0 &&
           scalar::base64::is_ascii_white_space(input[length - 1])) {
      length--;
    }
    if (length > 0 && input[length - 1] == '=') {
      equallocation = length - 1;
      equalsigns++;
      length -= 1;
    }
  }
  if (length == 0) {
    if (!ignore_garbage && equalsigns > 0) {
      if (last_chunk_options == last_chunk_handling_options::strict) {
        return {BASE64_INPUT_REMAINDER, 0};
      } else if (last_chunk_options ==
                 last_chunk_handling_options::stop_before_partial) {
        return {SUCCESS, 0};
      }
      return {INVALID_BASE64_CHARACTER, equallocation};
    }
    return {SUCCESS, 0};
  }
  result r = scalar::base64::base64_tail_decode(
      output, input, length, equalsigns, options, last_chunk_options);
  if (last_chunk_options != stop_before_partial &&
      r.error == error_code::SUCCESS && equalsigns > 0 && !ignore_garbage) {
    // additional checks
    if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) {
      return {INVALID_BASE64_CHARACTER, equallocation};
    }
  }
  return r;
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  const bool ignore_garbage =
      (options == base64_options::base64_url_accept_garbage) ||
      (options == base64_options::base64_default_accept_garbage);
  while (length > 0 &&
         scalar::base64::is_ascii_white_space(input[length - 1])) {
    length--;
  }
  size_t equallocation =
      length; // location of the first padding character if any
  size_t equalsigns = 0;
  if (length > 0 && input[length - 1] == '=') {
    equallocation = length - 1;
    length -= 1;
    equalsigns++;
    while (length > 0 &&
           scalar::base64::is_ascii_white_space(input[length - 1])) {
      length--;
    }
    if (length > 0 && input[length - 1] == '=') {
      equallocation = length - 1;
      equalsigns++;
      length -= 1;
    }
  }
  if (length == 0) {
    if (!ignore_garbage && equalsigns > 0) {
      if (last_chunk_options == last_chunk_handling_options::strict) {
        return {BASE64_INPUT_REMAINDER, 0, 0};
      } else if (last_chunk_options ==
                 last_chunk_handling_options::stop_before_partial) {
        return {SUCCESS, 0, 0};
      }
      return {INVALID_BASE64_CHARACTER, equallocation, 0};
    }
    return {SUCCESS, 0, 0};
  }
  full_result r = scalar::base64::base64_tail_decode(
      output, input, length, equalsigns, options, last_chunk_options);
  if (last_chunk_options != stop_before_partial &&
      r.error == error_code::SUCCESS && equalsigns > 0 && !ignore_garbage) {
    // additional checks
    if ((r.output_count % 3 == 0) ||
        ((r.output_count % 3) + 1 + equalsigns != 4)) {
      return {INVALID_BASE64_CHARACTER, equallocation, r.output_count};
    }
  }
  return r;
}

size_t implementation::binary_to_base64(const char *input, size_t length,
                                        char *output,
                                        base64_options options) const noexcept {
  return scalar::base64::tail_encode_base64(output, input, length, options);
}
#endif // SIMDUTF_FEATURE_BASE64

} // namespace rvv
} // namespace simdutf

/* begin file src/simdutf/rvv/end.h */
#if SIMDUTF_CAN_ALWAYS_RUN_RVV
// nothing needed.
#else
SIMDUTF_UNTARGET_REGION
#endif

/* end file src/simdutf/rvv/end.h */
/* end file src/rvv/implementation.cpp */
#endif
#if SIMDUTF_IMPLEMENTATION_WESTMERE
/* begin file src/westmere/implementation.cpp */
/* begin file src/simdutf/westmere/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "westmere"
// #define SIMDUTF_IMPLEMENTATION westmere
#define SIMDUTF_SIMD_HAS_BYTEMASK 1

#if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
// nothing needed.
#else
SIMDUTF_TARGET_WESTMERE
#endif
/* end file src/simdutf/westmere/begin.h */

namespace simdutf {
namespace westmere {
namespace {
#ifndef SIMDUTF_WESTMERE_H
#error "westmere.h must be included"
#endif
using namespace simd;

#if SIMDUTF_FEATURE_ASCII || SIMDUTF_FEATURE_DETECT_ENCODING ||                \
    SIMDUTF_FEATURE_UTF8
simdutf_really_inline bool is_ascii(const simd8x64<uint8_t> &input) {
  return input.reduce_or().is_ascii();
}
#endif // SIMDUTF_FEATURE_ASCII || SIMDUTF_FEATURE_DETECT_ENCODING ||
       // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
simdutf_really_inline simd8<bool>
must_be_2_3_continuation(const simd8<uint8_t> prev2,
                         const simd8<uint8_t> prev3) {
  simd8<uint8_t> is_third_byte =
      prev2.saturating_sub(0xe0u - 0x80); // Only 111_____ will be >= 0x80
  simd8<uint8_t> is_fourth_byte =
      prev3.saturating_sub(0xf0u - 0x80); // Only 1111____ will be >= 0x80
  return simd8<bool>(is_third_byte | is_fourth_byte);
}
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF8
/* begin file src/westmere/internal/loader.cpp */
namespace internal {
namespace westmere {

/* begin file src/westmere/internal/write_v_u16_11bits_to_utf8.cpp */
/*
 * reads a vector of uint16 values
 * bits after 11th are ignored
 * first 11 bits are encoded into utf8
 * !important! utf8_output must have at least 16 writable bytes
 */

inline void write_v_u16_11bits_to_utf8(const __m128i v_u16, char *&utf8_output,
                                       const __m128i one_byte_bytemask,
                                       const uint16_t one_byte_bitmask) {
  // 0b1100_0000_1000_0000
  const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080);
  // 0b0001_1111_0000_0000
  const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
  // 0b0000_0000_0011_1111
  const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);

  // 1. prepare 2-byte values
  // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
  // expected output   : [110a|aaaa|10bb|bbbb] x 8

  // t0 = [000a|aaaa|bbbb|bb00]
  const __m128i t0 = _mm_slli_epi16(v_u16, 2);
  // t1 = [000a|aaaa|0000|0000]
  const __m128i t1 = _mm_and_si128(t0, v_1f00);
  // t2 = [0000|0000|00bb|bbbb]
  const __m128i t2 = _mm_and_si128(v_u16, v_003f);
  // t3 = [000a|aaaa|00bb|bbbb]
  const __m128i t3 = _mm_or_si128(t1, t2);
  // t4 = [110a|aaaa|10bb|bbbb]
  const __m128i t4 = _mm_or_si128(t3, v_c080);
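  // e.g. for U+00E9: t0 = 0x03A4, t1 = 0x0300, t2 = 0x0029, t3 = 0x0329,
  // t4 = 0xC3A9 -- the two UTF-8 bytes 0xC3 0xA9 packed into one 16-bit lane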

  // 2. merge ASCII and 2-byte codewords
  const __m128i utf8_unpacked = _mm_blendv_epi8(t4, v_u16, one_byte_bytemask);

  // 3. prepare bitmask for 8-bit lookup
  // one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a
  // - LSB)
  const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a
  const uint16_t m1 = static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
  const uint8_t m2 = static_cast<uint8_t>((m0 | m1) & 0xff); // m2 = hdgcfbea
  // 4. pack the bytes
  const uint8_t *row =
      &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
  const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
  const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);

  // 5. store bytes
  _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);

  // 6. adjust pointers
  utf8_output += row[0];
}

inline void write_v_u16_11bits_to_utf8(const __m128i v_u16, char *&utf8_output,
                                       const __m128i v_0000,
                                       const __m128i v_ff80) {
  // no bits set above 7th bit
  const __m128i one_byte_bytemask =
      _mm_cmpeq_epi16(_mm_and_si128(v_u16, v_ff80), v_0000);
  const uint16_t one_byte_bitmask =
      static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));

  write_v_u16_11bits_to_utf8(v_u16, utf8_output, one_byte_bytemask,
                             one_byte_bitmask);
}
/* end file src/westmere/internal/write_v_u16_11bits_to_utf8.cpp */

} // namespace westmere
} // namespace internal
/* end file src/westmere/internal/loader.cpp */
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF16
/* begin file src/westmere/sse_utf16fix.cpp */
/*
 * Process one block of 8 characters. If in_place is false,
 * copy the block from in to out. If there is a sequencing
 * error in the block, overwrite the illsequenced characters
 * with the replacement character. This function reads one
 * character before the beginning of the buffer as a lookback.
 * If that character is illsequenced, it too is overwritten.
 */
template <endianness big_endian, bool in_place>
simdutf_really_inline void utf16fix_block_sse(char16_t *out,
                                              const char16_t *in) {
  const char16_t replacement = scalar::utf16::replacement<big_endian>();
  auto swap_if_needed = [](uint16_t c) -> uint16_t {
    return !simdutf::match_system(big_endian) ? scalar::u16_swap_bytes(c) : c;
  };

  __m128i lookback, block, lb_masked, block_masked, lb_is_high, block_is_low;
  __m128i illseq, lb_illseq, block_illseq;

  lookback = _mm_loadu_si128((const __m128i *)(in - 1));
  block = _mm_loadu_si128((const __m128i *)in);
  lb_masked = _mm_and_si128(lookback, _mm_set1_epi16(swap_if_needed(0xfc00U)));
  block_masked = _mm_and_si128(block, _mm_set1_epi16(swap_if_needed(0xfc00U)));
  lb_is_high =
      _mm_cmpeq_epi16(lb_masked, _mm_set1_epi16(swap_if_needed(0xd800U)));
  block_is_low =
      _mm_cmpeq_epi16(block_masked, _mm_set1_epi16(swap_if_needed(0xdc00U)));

  illseq = _mm_xor_si128(lb_is_high, block_is_low);
  if (_mm_movemask_epi8(illseq) != 0) {
    int lb;

    /* compute the cause of the illegal sequencing */
    lb_illseq = _mm_andnot_si128(block_is_low, lb_is_high);
    block_illseq = _mm_or_si128(_mm_andnot_si128(lb_is_high, block_is_low),
                                _mm_bsrli_si128(lb_illseq, 2));
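    /* explanatory note: lb_illseq is aligned to the lookback (it starts one
     * u16 earlier), so shifting it right by one lane converts the
     * unpaired-high-surrogate errors into block coordinates, where each
     * flags its own position */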
|
|
|
|
/* fix illegal sequencing in the lookback */
|
|
lb = _mm_cvtsi128_si32(lb_illseq);
|
|
lb = (lb & replacement) | (~lb & out[-1]);
|
|
out[-1] = char16_t(lb);
|
|
/* fix illegal sequencing in the main block */
|
|
block =
|
|
_mm_or_si128(_mm_andnot_si128(block_illseq, block),
|
|
_mm_and_si128(block_illseq, _mm_set1_epi16(replacement)));
|
|
_mm_storeu_si128((__m128i *)out, block);
|
|
} else if (!in_place) {
|
|
_mm_storeu_si128((__m128i *)out, block);
|
|
}
|
|
}
|
|
|
|
template <endianness big_endian>
|
|
void utf16fix_sse(const char16_t *in, size_t n, char16_t *out) {
|
|
const char16_t replacement = scalar::utf16::replacement<big_endian>();
|
|
size_t i;
|
|
if (n < 9) {
|
|
scalar::utf16::to_well_formed_utf16<big_endian>(in, n, out);
|
|
return;
|
|
}
|
|
|
|
out[0] =
|
|
scalar::utf16::is_low_surrogate<big_endian>(in[0]) ? replacement : in[0];
|
|
|
|
/* duplicate code to have the compiler specialise utf16fix_block() */
|
|
if (in == out) {
|
|
for (i = 1; i + 8 < n; i += 8) {
|
|
utf16fix_block_sse<big_endian, true>(out + i, in + i);
|
|
}
|
|
|
|
utf16fix_block_sse<big_endian, true>(out + n - 8, in + n - 8);
|
|
} else {
|
|
for (i = 1; i + 8 < n; i += 8) {
|
|
utf16fix_block_sse<big_endian, false>(out + i, in + i);
|
|
}
|
|
|
|
utf16fix_block_sse<big_endian, false>(out + n - 8, in + n - 8);
|
|
}
|
|
|
|
out[n - 1] = scalar::utf16::is_high_surrogate<big_endian>(out[n - 1])
|
|
? replacement
|
|
: out[n - 1];
|
|
}
|
|
/* end file src/westmere/sse_utf16fix.cpp */
|
|
#endif // SIMDUTF_FEATURE_UTF16
|
|
#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
|
|
/* begin file src/westmere/sse_validate_utf16.cpp */
|
|
template <endianness big_endian>
|
|
simd8<uint8_t> utf16_gather_high_bytes(const simd16<uint16_t> in0,
|
|
const simd16<uint16_t> in1) {
|
|
if (big_endian) {
|
|
// we want lower bytes
|
|
const auto mask = simd16<uint16_t>(0x00ff);
|
|
const auto t0 = in0 & mask;
|
|
const auto t1 = in1 & mask;
|
|
|
|
return simd16<uint16_t>::pack(t0, t1);
|
|
} else {
|
|
const auto t0 = in0.shr<8>();
|
|
const auto t1 = in1.shr<8>();
|
|
|
|
return simd16<uint16_t>::pack(t0, t1);
|
|
}
|
|
}
|
|
/* end file src/westmere/sse_validate_utf16.cpp */
|
|
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
/* begin file src/westmere/sse_convert_latin1_to_utf8.cpp */
std::pair<const char *const, char *const>
sse_convert_latin1_to_utf8(const char *latin_input,
                           const size_t latin_input_length, char *utf8_output) {
  const char *end = latin_input + latin_input_length;

  const __m128i v_0000 = _mm_setzero_si128();
  // 0b1000_0000
  const __m128i v_80 = _mm_set1_epi8((uint8_t)0x80);
  // 0b1111_1111_1000_0000
  const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80);

  const __m128i latin_1_half_into_u16_byte_mask =
      _mm_setr_epi8(0, '\x80', 1, '\x80', 2, '\x80', 3, '\x80', 4, '\x80', 5,
                    '\x80', 6, '\x80', 7, '\x80');

  const __m128i latin_2_half_into_u16_byte_mask =
      _mm_setr_epi8(8, '\x80', 9, '\x80', 10, '\x80', 11, '\x80', 12, '\x80',
                    13, '\x80', 14, '\x80', 15, '\x80');

  // Each Latin-1 character becomes 1-2 UTF-8 bytes. The slow path produces
  // 8-15 useful bytes per half, but eagerly stores 16 bytes and then adjusts
  // the pointer, so the last store may overshoot the useful output by up to
  // 8 bytes. By reserving 8 extra input bytes, we guarantee that the output
  // buffer always has 8-16 bytes of slack.
  while (end - latin_input >= 16 + 8) {
    // Load 16 Latin1 characters (16 bytes) into a 128-bit register
    __m128i v_latin = _mm_loadu_si128((__m128i *)latin_input);

    if (_mm_testz_si128(v_latin, v_80)) { // ASCII fast path!!!!
      _mm_storeu_si128((__m128i *)utf8_output, v_latin);
      latin_input += 16;
      utf8_output += 16;
      continue;
    }

    // assuming a/b are bytes and A/B are uint16 of the same value
    // aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA
    __m128i v_u16_latin_1_half =
        _mm_shuffle_epi8(v_latin, latin_1_half_into_u16_byte_mask);
    // aaaa_aaaa_bbbb_bbbb -> BBBB_BBBB
    __m128i v_u16_latin_2_half =
        _mm_shuffle_epi8(v_latin, latin_2_half_into_u16_byte_mask);

    internal::westmere::write_v_u16_11bits_to_utf8(v_u16_latin_1_half,
                                                   utf8_output, v_0000, v_ff80);
    internal::westmere::write_v_u16_11bits_to_utf8(v_u16_latin_2_half,
                                                   utf8_output, v_0000, v_ff80);
    latin_input += 16;
  }

  if (end - latin_input >= 16) {
    // Load 16 Latin1 characters (16 bytes) into a 128-bit register
    __m128i v_latin = _mm_loadu_si128((__m128i *)latin_input);

    if (_mm_testz_si128(v_latin, v_80)) { // ASCII fast path!!!!
      _mm_storeu_si128((__m128i *)utf8_output, v_latin);
      latin_input += 16;
      utf8_output += 16;
    } else {
      // assuming a/b are bytes and A/B are uint16 of the same value
      // aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA
      __m128i v_u16_latin_1_half =
          _mm_shuffle_epi8(v_latin, latin_1_half_into_u16_byte_mask);
      internal::westmere::write_v_u16_11bits_to_utf8(
          v_u16_latin_1_half, utf8_output, v_0000, v_ff80);
      latin_input += 8;
    }
  }

  return std::make_pair(latin_input, utf8_output);
}
/* end file src/westmere/sse_convert_latin1_to_utf8.cpp */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
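
// Illustrative scalar sketch (added for exposition; not part of simdutf):
// write_v_u16_11bits_to_utf8 performs, with SIMD, the same 1-2 byte encoding
// as this per-character helper. Kept disabled.
#if 0
inline void latin1_to_utf8_scalar(unsigned char c, char *&out) {
  if (c < 0x80) {
    *out++ = char(c); // ASCII: one byte
  } else {
    *out++ = char(0xC0 | (c >> 6));   // 110x_xxxx (always 0xC2 or 0xC3)
    *out++ = char(0x80 | (c & 0x3F)); // 10xx_xxxx
  }
}
#endif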

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
/* begin file src/westmere/sse_convert_latin1_to_utf16.cpp */
template <endianness big_endian>
std::pair<const char *, char16_t *>
sse_convert_latin1_to_utf16(const char *latin1_input, size_t len,
                            char16_t *utf16_output) {
  size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16
  for (size_t i = 0; i < rounded_len; i += 16) {
    // Load 16 Latin1 characters into a 128-bit register
    __m128i in =
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(&latin1_input[i]));
    __m128i out1 = big_endian ? _mm_unpacklo_epi8(_mm_setzero_si128(), in)
                              : _mm_unpacklo_epi8(in, _mm_setzero_si128());
    __m128i out2 = big_endian ? _mm_unpackhi_epi8(_mm_setzero_si128(), in)
                              : _mm_unpackhi_epi8(in, _mm_setzero_si128());
    // Zero extend each Latin1 character to 16-bit integers and store the
    // results back to memory
    _mm_storeu_si128(reinterpret_cast<__m128i *>(&utf16_output[i]), out1);
    _mm_storeu_si128(reinterpret_cast<__m128i *>(&utf16_output[i + 8]), out2);
  }
  // return pointers pointing to where we left off
  return std::make_pair(latin1_input + rounded_len, utf16_output + rounded_len);
}
/* end file src/westmere/sse_convert_latin1_to_utf16.cpp */
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
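
// Note (added for exposition): unpacking a byte vector against zero is the
// vector form of scalar zero-extension; for big-endian output the zero byte
// is interleaved first. A disabled per-character sketch (assumes a
// little-endian host, as the SSE kernels do):
#if 0
inline char16_t latin1_to_utf16_unit(unsigned char c, bool to_big_endian) {
  // native value is 0x00cc; byte-swapping it stores the zero byte first
  return to_big_endian ? char16_t(uint16_t(c) << 8) : char16_t(c);
}
#endif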

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
/* begin file src/westmere/sse_convert_latin1_to_utf32.cpp */
std::pair<const char *, char32_t *>
sse_convert_latin1_to_utf32(const char *buf, size_t len,
                            char32_t *utf32_output) {
  const char *end = buf + len;

  while (end - buf >= 16) {
    // Load 16 Latin1 characters (16 bytes) into a 128-bit register
    __m128i in = _mm_loadu_si128((__m128i *)buf);

    // Shift input to process next 4 bytes
    __m128i in_shifted1 = _mm_srli_si128(in, 4);
    __m128i in_shifted2 = _mm_srli_si128(in, 8);
    __m128i in_shifted3 = _mm_srli_si128(in, 12);

    // expand 8-bit units to 32-bit units
    __m128i out1 = _mm_cvtepu8_epi32(in);
    __m128i out2 = _mm_cvtepu8_epi32(in_shifted1);
    __m128i out3 = _mm_cvtepu8_epi32(in_shifted2);
    __m128i out4 = _mm_cvtepu8_epi32(in_shifted3);

    _mm_storeu_si128((__m128i *)utf32_output, out1);
    _mm_storeu_si128((__m128i *)(utf32_output + 4), out2);
    _mm_storeu_si128((__m128i *)(utf32_output + 8), out3);
    _mm_storeu_si128((__m128i *)(utf32_output + 12), out4);

    utf32_output += 16;
    buf += 16;
  }

  return std::make_pair(buf, utf32_output);
}
/* end file src/westmere/sse_convert_latin1_to_utf32.cpp */
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
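
// Note (added for exposition): _mm_cvtepu8_epi32 zero-extends the four
// lowest bytes of its argument to 32 bits, so the three shifted copies above
// route bytes 4..7, 8..11 and 12..15 through the same widening. The scalar
// equivalent is simply a cast, sketched here and kept disabled:
#if 0
inline char32_t latin1_to_utf32_unit(unsigned char c) { return char32_t(c); }
#endif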

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
/* begin file src/westmere/sse_convert_utf8_to_utf16.cpp */
// depends on "tables/utf8_to_utf16_tables.h"

// Convert up to 12 bytes from utf8 to utf16 using a mask indicating the
// end of the code points. Only the least significant 12 bits of the mask
// are accessed.
// It returns how many bytes were consumed (up to 12).
template <endianness big_endian>
size_t convert_masked_utf8_to_utf16(const char *input,
                                    uint64_t utf8_end_of_code_point_mask,
                                    char16_t *&utf16_output) {
  // we use an approach where we try to process up to 12 input bytes.
  // Why 12 input bytes and not 16? Because we are concerned with the size of
  // the lookup tables. Also 12 is nicely divisible by two and three.
  //
  //
  // Optimization note: our main path below is load-latency dependent. Thus it
  // is maybe beneficial to have fast paths that depend on branch prediction but
  // have less latency. This results in more instructions but, potentially, also
  // higher speeds.
  //
  // We first try a few fast paths.
  const __m128i swap =
      _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
  const __m128i in = _mm_loadu_si128((__m128i *)input);
  const uint16_t input_utf8_end_of_code_point_mask =
      utf8_end_of_code_point_mask & 0xfff;
  if (utf8_end_of_code_point_mask == 0xfff) {
    // We process the data in chunks of 12 bytes.
    // Note: using 16 bytes is unsafe, see issue_ossfuzz_71218
    __m128i ascii_first = _mm_cvtepu8_epi16(in);
    __m128i ascii_second = _mm_cvtepu8_epi16(_mm_srli_si128(in, 8));
    if (big_endian) {
      ascii_first = _mm_shuffle_epi8(ascii_first, swap);
      ascii_second = _mm_shuffle_epi8(ascii_second, swap);
    }
    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output), ascii_first);
    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + 8),
                     ascii_second);
    utf16_output += 12; // We wrote 12 16-bit characters.
    return 12;          // We consumed 12 bytes.
  }
  if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xaaaa) {
    // We want to take 8 2-byte UTF-8 code units and turn them into 8 2-byte
    // UTF-16 code units. There is probably a more efficient sequence, but the
    // following might do.
    const __m128i sh =
        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
    __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
    if (big_endian)
      composed = _mm_shuffle_epi8(composed, swap);
    _mm_storeu_si128((__m128i *)utf16_output, composed);
    utf16_output += 8; // We wrote 16 bytes, 8 code points.
    return 16;
  }
  if (input_utf8_end_of_code_point_mask == 0x924) {
    // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte
    // UTF-16 code units. There is probably a more efficient sequence, but the
    // following might do.
    const __m128i sh =
        _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii =
        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
    const __m128i middlebyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
    const __m128i highbyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
    const __m128i composed =
        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
    __m128i composed_repacked = _mm_packus_epi32(composed, composed);
    if (big_endian)
      composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
    _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
    utf16_output += 4;
    return 12;
  }
  /// We do not have a fast path available, so we fall back.

  const uint8_t idx =
      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
  const uint8_t consumed =
      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
  if (idx < 64) {
    // SIX (6) input code units
    // this is a relatively easy scenario
    // we process SIX (6) input code units. The max length in bytes of six
    // code units spanning between 1 and 2 bytes each is 12 bytes. On
    // processors where pdep/pext is fast, we might be able to use a small
    // lookup table.
    const __m128i sh =
        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
    __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
    if (big_endian)
      composed = _mm_shuffle_epi8(composed, swap);
    _mm_storeu_si128((__m128i *)utf16_output, composed);
    utf16_output += 6; // We wrote 12 bytes, 6 code points.
  } else if (idx < 145) {
    // FOUR (4) input code units
    const __m128i sh =
        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii =
        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
    const __m128i middlebyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
    const __m128i highbyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
    const __m128i composed =
        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
    __m128i composed_repacked = _mm_packus_epi32(composed, composed);
    if (big_endian)
      composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
    _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
    utf16_output += 4;
  } else if (idx < 209) {
    // TWO (2) input code units
    //////////////
    // There might be garbage inputs where a leading byte masquerades as a
    // four-byte leading byte (by being followed by 3 continuation bytes), but
    // is not greater than 0xf0. This could trigger a buffer overflow if we only
    // counted leading bytes of the form 0xf0 as generating surrogate pairs,
    // without further UTF-8 validation. Thus we must be careful to ensure that
    // only leading bytes at least as large as 0xf0 generate surrogate pairs. We
    // do so at the cost of an extra mask.
    /////////////
    const __m128i sh =
        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
    const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
    __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
    // correct for spurious high bit
    const __m128i correct =
        _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
    middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
    const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
    // We deliberately carry the leading four bits in highbyte if they are
    // present, we remove them later when computing hightenbits.
    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0xff000000));
    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
    // When we need to generate a surrogate pair (leading byte at least 0xF0),
    // the corresponding 32-bit value in 'composed' will be greater than
    // (0xf0000000 >> 6), i.e. greater than 0x3c00000. This can be used later
    // to identify the location of the surrogate pairs.
    const __m128i composed =
        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
                     _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
    const __m128i composedminus =
        _mm_sub_epi32(composed, _mm_set1_epi32(0x10000));
    const __m128i lowtenbits =
        _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff));
    // Notice the 0x3ff mask:
    const __m128i hightenbits =
        _mm_and_si128(_mm_srli_epi32(composedminus, 10), _mm_set1_epi32(0x3ff));
    const __m128i lowtenbitsadd =
        _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00));
    const __m128i hightenbitsadd =
        _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800));
    const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16);
    __m128i surrogates = _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted);
    uint32_t basic_buffer[4];
    uint32_t basic_buffer_swap[4];
    if (big_endian) {
      _mm_storeu_si128((__m128i *)basic_buffer_swap,
                       _mm_shuffle_epi8(composed, swap));
      surrogates = _mm_shuffle_epi8(surrogates, swap);
    }
    _mm_storeu_si128((__m128i *)basic_buffer, composed);
    uint32_t surrogate_buffer[4];
    _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates);
    for (size_t i = 0; i < 3; i++) {
      if (basic_buffer[i] > 0x3c00000) {
        utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
        utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
        utf16_output += 2;
      } else {
        utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i])
                                     : uint16_t(basic_buffer[i]);
        utf16_output++;
      }
    }
  } else {
    // here we know that there is an error but we do not handle errors
  }
  return consumed;
}
/* end file src/westmere/sse_convert_utf8_to_utf16.cpp */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
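
// Worked example for the two-byte path above (added for exposition):
// U+00E9 is encoded as 0xC3 0xA9; after the byte swap the 16-bit lane holds
// 0xC3A9, and the masks recover the code point. Kept disabled.
#if 0
static_assert((0xC3A9 & 0x007F) == 0x0029, "six low payload bits");
static_assert((0xC3A9 & 0x1F00) == 0x0300, "five lead payload bits");
static_assert(((0xC3A9 & 0x007F) | ((0xC3A9 & 0x1F00) >> 2)) == 0x00E9,
              "composed code point is U+00E9");
#endif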

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
/* begin file src/westmere/sse_convert_utf8_to_utf32.cpp */
// depends on "tables/utf8_to_utf16_tables.h"

// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
// end of the code points. Only the least significant 12 bits of the mask
// are accessed.
// It returns how many bytes were consumed (up to 12).
size_t convert_masked_utf8_to_utf32(const char *input,
                                    uint64_t utf8_end_of_code_point_mask,
                                    char32_t *&utf32_output) {
  // we use an approach where we try to process up to 12 input bytes.
  // Why 12 input bytes and not 16? Because we are concerned with the size of
  // the lookup tables. Also 12 is nicely divisible by two and three.
  //
  //
  // Optimization note: our main path below is load-latency dependent. Thus it
  // is maybe beneficial to have fast paths that depend on branch prediction but
  // have less latency. This results in more instructions but, potentially, also
  // higher speeds.
  //
  // We first try a few fast paths.
  const __m128i in = _mm_loadu_si128((__m128i *)input);
  const uint16_t input_utf8_end_of_code_point_mask =
      utf8_end_of_code_point_mask & 0xfff;
  if (utf8_end_of_code_point_mask == 0xfff) {
    // We process the data in chunks of 12 bytes.
    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output),
                     _mm_cvtepu8_epi32(in));
    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4),
                     _mm_cvtepu8_epi32(_mm_srli_si128(in, 4)));
    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 8),
                     _mm_cvtepu8_epi32(_mm_srli_si128(in, 8)));
    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 12),
                     _mm_cvtepu8_epi32(_mm_srli_si128(in, 12)));
    utf32_output += 12; // We wrote 12 32-bit characters.
    return 12;          // We consumed 12 bytes.
  }
  if ((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa) {
    // We want to take 8 2-byte UTF-8 code units and turn them into 8 4-byte
    // UTF-32 code units. There is probably a more efficient sequence, but the
    // following might do.
    const __m128i sh =
        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
    const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output),
                     _mm_cvtepu16_epi32(composed));
    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4),
                     _mm_cvtepu16_epi32(_mm_srli_si128(composed, 8)));
    utf32_output += 8; // We wrote 32 bytes, 8 code points.
    return 16;
  }
  if (input_utf8_end_of_code_point_mask == 0x924) {
    // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte
    // UTF-32 code units. There is probably a more efficient sequence, but the
    // following might do.
    const __m128i sh =
        _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii =
        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
    const __m128i middlebyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
    const __m128i highbyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
    const __m128i composed =
        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
    _mm_storeu_si128((__m128i *)utf32_output, composed);
    utf32_output += 4;
    return 12;
  }
  /// We do not have a fast path available, so we fall back.

  const uint8_t idx =
      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
  const uint8_t consumed =
      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
  if (idx < 64) {
    // SIX (6) input code units
    // this is a relatively easy scenario
    // we process SIX (6) input code units. The max length in bytes of six
    // code units spanning between 1 and 2 bytes each is 12 bytes. On
    // processors where pdep/pext is fast, we might be able to use a small
    // lookup table.
    const __m128i sh =
        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
    const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output),
                     _mm_cvtepu16_epi32(composed));
    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4),
                     _mm_cvtepu16_epi32(_mm_srli_si128(composed, 8)));
    utf32_output += 6; // We wrote 12 bytes, 6 code points.
  } else if (idx < 145) {
    // FOUR (4) input code units
    const __m128i sh =
        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii =
        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
    const __m128i middlebyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
    const __m128i highbyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
    const __m128i composed =
        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
    _mm_storeu_si128((__m128i *)utf32_output, composed);
    utf32_output += 4;
  } else if (idx < 209) {
    // TWO (2) input code units
    const __m128i sh =
        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
    const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
    __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
    // correct for spurious high bit
    const __m128i correct =
        _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
    middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
    const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
    const __m128i composed =
        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
                     _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
    _mm_storeu_si128((__m128i *)utf32_output, composed);
    utf32_output += 3;
  } else {
    // here we know that there is an error but we do not handle errors
  }
  return consumed;
}
/* end file src/westmere/sse_convert_utf8_to_utf32.cpp */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
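
// Worked example for the three-byte path above (added for exposition):
// U+20AC (the euro sign) is encoded as 0xE2 0x82 0xAC; the shuffled 32-bit
// lane holds 0x00E282AC and the masks recover the code point. Kept disabled.
#if 0
static_assert((0x00E282ACu & 0x7Fu) == 0x2Cu, "six low payload bits");
static_assert(((0x00E282ACu & 0x3F00u) >> 2) == 0x80u, "six middle bits");
static_assert(((0x00E282ACu & 0x0F0000u) >> 4) == 0x2000u, "four high bits");
static_assert((0x2Cu | 0x80u | 0x2000u) == 0x20ACu, "composed: U+20AC");
#endif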

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
/* begin file src/westmere/sse_convert_utf8_to_latin1.cpp */
// depends on "tables/utf8_to_utf16_tables.h"

// Convert up to 12 bytes from utf8 to latin1 using a mask indicating the
// end of the code points. Only the least significant 12 bits of the mask
// are accessed.
// It returns how many bytes were consumed (up to 12).
size_t convert_masked_utf8_to_latin1(const char *input,
                                     uint64_t utf8_end_of_code_point_mask,
                                     char *&latin1_output) {
  // we use an approach where we try to process up to 12 input bytes.
  // Why 12 input bytes and not 16? Because we are concerned with the size of
  // the lookup tables. Also 12 is nicely divisible by two and three.
  //
  //
  // Optimization note: our main path below is load-latency dependent. Thus it
  // is maybe beneficial to have fast paths that depend on branch prediction but
  // have less latency. This results in more instructions but, potentially, also
  // higher speeds.
  //
  const __m128i in = _mm_loadu_si128((__m128i *)input);
  const uint16_t input_utf8_end_of_code_point_mask =
      utf8_end_of_code_point_mask &
      0xfff; // we are only processing 12 bytes in case it is not all ASCII
  if (utf8_end_of_code_point_mask == 0xfff) {
    // We process the data in chunks of 12 bytes.
    _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output), in);
    latin1_output += 12; // We wrote 12 characters.
    return 12;           // We consumed 12 bytes.
  }
  /// We do not have a fast path available, so we fall back.
  const uint8_t idx =
      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
  const uint8_t consumed =
      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
  // this indicates an invalid input:
  if (idx >= 64) {
    return consumed;
  }
  // Here we should have (idx < 64); if not, there is a bug in the validation
  // or elsewhere. SIX (6) input code units: this is a relatively easy
  // scenario; we process SIX (6) input code units. The max length in bytes of
  // six code units spanning between 1 and 2 bytes each is 12 bytes. On
  // processors where pdep/pext is fast, we might be able to use a small lookup
  // table.
  const __m128i sh =
      _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
  const __m128i perm = _mm_shuffle_epi8(in, sh);
  const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
  const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
  __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
  const __m128i latin1_packed = _mm_packus_epi16(composed, composed);
  // writing 8 bytes even though we only care about the first 6 bytes.
  // performance note: it would be faster to use _mm_storeu_si128, we should
  // investigate.
  _mm_storel_epi64((__m128i *)latin1_output, latin1_packed);
  latin1_output += 6; // We wrote 6 bytes.
  return consumed;
}
/* end file src/westmere/sse_convert_utf8_to_latin1.cpp */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
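
// Illustrative scalar sketch (added for exposition; not part of simdutf):
// only one- and two-byte UTF-8 sequences can encode Latin-1 (<= 0xFF), and
// the two-byte sequences use exactly the lead bytes 0xC2 and 0xC3. Assumes
// already-validated input; kept disabled.
#if 0
inline size_t utf8_to_latin1_unit(const unsigned char *p, unsigned char &out) {
  if (p[0] < 0x80) {
    out = p[0]; // ASCII
    return 1;
  }
  out = (unsigned char)(((p[0] & 0x1F) << 6) | (p[1] & 0x3F));
  return 2;
}
#endif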

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
/* begin file src/westmere/sse_convert_utf16_to_latin1.cpp */
template <endianness big_endian>
std::pair<const char16_t *, char *>
sse_convert_utf16_to_latin1(const char16_t *buf, size_t len,
                            char *latin1_output) {
  const char16_t *end = buf + len;
  while (end - buf >= 8) {
    // Load 8 UTF-16 characters into a 128-bit SSE register
    __m128i in = _mm_loadu_si128(reinterpret_cast<const __m128i *>(buf));

    if (!match_system(big_endian)) {
      const __m128i swap =
          _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
      in = _mm_shuffle_epi8(in, swap);
    }

    __m128i high_byte_mask = _mm_set1_epi16((int16_t)0xFF00);
    if (_mm_testz_si128(in, high_byte_mask)) {
      // Pack 16-bit characters into 8-bit and store in latin1_output
      __m128i latin1_packed = _mm_packus_epi16(in, in);
      _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output),
                       latin1_packed);
      // Adjust pointers for next iteration
      buf += 8;
      latin1_output += 8;
    } else {
      return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output));
    }
  } // while
  return std::make_pair(buf, latin1_output);
}

template <endianness big_endian>
std::pair<result, char *>
sse_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
                                        char *latin1_output) {
  const char16_t *start = buf;
  const char16_t *end = buf + len;
  while (end - buf >= 8) {
    __m128i in = _mm_loadu_si128(reinterpret_cast<const __m128i *>(buf));

    if (!match_system(big_endian)) {
      const __m128i swap =
          _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
      in = _mm_shuffle_epi8(in, swap);
    }

    __m128i high_byte_mask = _mm_set1_epi16((int16_t)0xFF00);
    if (_mm_testz_si128(in, high_byte_mask)) {
      __m128i latin1_packed = _mm_packus_epi16(in, in);
      _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output),
                       latin1_packed);
      buf += 8;
      latin1_output += 8;
    } else {
      // Fall back to scalar code to handle the error
      for (int k = 0; k < 8; k++) {
        uint16_t word =
            !match_system(big_endian) ? scalar::u16_swap_bytes(buf[k]) : buf[k];
        if (word <= 0xff) {
          *latin1_output++ = char(word);
        } else {
          return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
                                latin1_output);
        }
      }
      buf += 8;
    }
  } // while
  return std::make_pair(result(error_code::SUCCESS, buf - start),
                        latin1_output);
}
/* end file src/westmere/sse_convert_utf16_to_latin1.cpp */
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
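
// Note (added for exposition): the _mm_testz_si128 check above succeeds
// exactly when this scalar predicate holds for the eight loaded code units,
// i.e. every value fits in a single Latin-1 byte. Kept disabled.
#if 0
inline bool all_latin1(const uint16_t *w) {
  for (int k = 0; k < 8; k++) {
    if (w[k] > 0xFF) {
      return false;
    }
  }
  return true;
}
#endif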

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
/* begin file src/westmere/sse_convert_utf16_to_utf8.cpp */
/*
    The vectorized algorithm works on a single SSE register, i.e., it
    loads eight 16-bit code units.

    We consider three cases:
    1. an input register contains no surrogates and each value
       is in the range 0x0000 .. 0x07ff.
    2. an input register contains no surrogates and the values
       are in the range 0x0000 .. 0xffff.
    3. an input register contains surrogates --- i.e. code points
       can have 16 or 32 bits.

    Ad 1.

    When values are less than 0x0800, it means that a 16-bit code unit
    can be converted into: 1) a single UTF-8 byte (when it is an ASCII
    char) or 2) two UTF-8 bytes.

    For this case we do only some shuffle to obtain these 2-byte
    codes and finally compress the whole SSE register with a single
    shuffle.

    We need a 256-entry lookup table to get a compression pattern
    and the number of output bytes in the compressed vector register.
    Each entry occupies 17 bytes.

    Ad 2.

    When values fit in 16-bit code units, but are above 0x07ff, then
    a single word may produce one, two or three UTF-8 bytes.

    We prepare data for all these three cases in two registers.
    The first register contains the lower two UTF-8 bytes (used in all
    cases), while the second one contains just the third byte for
    the three-UTF-8-bytes case.

    Finally these two registers are interleaved, forming an eight-element
    array of 32-bit values. The array spans two SSE registers.
    The bytes from the registers are compressed using two shuffles.

    We need a 256-entry lookup table to get a compression pattern
    and the number of output bytes in the compressed vector register.
    Each entry occupies 17 bytes.


    To summarize:
    - We need two 256-entry tables that have 8704 bytes in total.
*/

/*
  Returns a pair: the first unprocessed byte from buf and utf8_output
  A scalar routine should carry on the conversion of the tail.
*/
template <endianness big_endian>
std::pair<const char16_t *, char *>
sse_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_output) {

  const char16_t *end = buf + len;

  const __m128i v_0000 = _mm_setzero_si128();
  const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
  const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
  const size_t safety_margin =
      12; // to avoid overruns, see issue
          // https://github.com/simdutf/simdutf/issues/92

  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
    __m128i in = _mm_loadu_si128((__m128i *)buf);
    if (big_endian) {
      const __m128i swap =
          _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
      in = _mm_shuffle_epi8(in, swap);
    }
    // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
    const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80);
    if (_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!!
      __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1);
      if (big_endian) {
        const __m128i swap =
            _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
        nextin = _mm_shuffle_epi8(nextin, swap);
      }
      if (!_mm_testz_si128(nextin, v_ff80)) {
        // 1. pack the bytes
        // obviously suboptimal.
        const __m128i utf8_packed = _mm_packus_epi16(in, in);
        // 2. store (16 bytes)
        _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
        // 3. adjust pointers
        buf += 8;
        utf8_output += 8;
        in = nextin;
      } else {
        // 1. pack the bytes
        // obviously suboptimal.
        const __m128i utf8_packed = _mm_packus_epi16(in, nextin);
        // 2. store (16 bytes)
        _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
        // 3. adjust pointers
        buf += 16;
        utf8_output += 16;
        continue; // we are done for this round!
      }
    }

    // no bits set above the 7th bit
    const __m128i one_byte_bytemask =
        _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000);
    const uint16_t one_byte_bitmask =
        static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));

    // no bits set above the 11th bit
    const __m128i one_or_two_bytes_bytemask =
        _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000);
    const uint16_t one_or_two_bytes_bitmask =
        static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));

    if (one_or_two_bytes_bitmask == 0xffff) {
      internal::westmere::write_v_u16_11bits_to_utf8(
          in, utf8_output, one_byte_bytemask, one_byte_bitmask);
      buf += 8;
      continue;
    }

    // 1. Check if there are any surrogate words in the input chunk.
    //    We must also deal with the situation where a surrogate word sits
    //    at the end of a chunk.
    const __m128i surrogates_bytemask =
        _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);

    // bitmask = 0x0000 if there are no surrogates
    //         = 0xc000 if the last word is a surrogate
    const uint16_t surrogates_bitmask =
        static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
    // It might seem like checking for surrogates_bitmask == 0xc000 could help.
    // However, it is likely an uncommon occurrence.
    if (surrogates_bitmask == 0x0000) {
      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
      const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);

      /* In this branch we handle three cases:
         1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
            a single UTF-8 byte
         2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two
            UTF-8 bytes
         3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
            three UTF-8 bytes

        We expand the input word (16-bit) into two code units (32-bit), thus
        we have room for four bytes. However, we need five distinct bit
        layouts. Note that the last byte in cases #2 and #3 is the same.

        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
        in register t2.

        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
        either byte 1 for case #2 or byte 2 for case #3. Note that they
        differ by exactly one bit.

        Finally from these two code units we build a proper UTF-8 sequence,
        taking into account the case (i.e., the number of bytes to write).
      */
      /**
       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
       * t2 => [0ccc|cccc] [10cc|cccc]
       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
       */
#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
      const __m128i t0 = _mm_shuffle_epi8(in, dup_even);
      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
      const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
      const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000));

      // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
      const __m128i s0 = _mm_srli_epi16(in, 4);
      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
      const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
      const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
      const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
      const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask,
                                          simdutf_vec(0b0100000000000000));
      const __m128i s4 = _mm_xor_si128(s3, m0);
#undef simdutf_vec
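      // Worked example (added for exposition): for the word 0x20AC (the euro
      // sign; aaaa=0010, bbbbbb=000010, cccccc=101100) the lanes evolve as
      //   t0 = 0xACAC, t1 = t0 & 0x3F7F = 0x2C2C, t2 = t1 | 0x8000 = 0xAC2C
      //   s0 = 0x020A, s1 = s0 & 0x0FFC = 0x0208, s2 = 0x0202,
      //   s3 = s2 | 0xC0E0 = 0xC2E2, and since this is a three-byte case,
      //   s4 = s3 ^ 0x4000 = 0x82E2.
      // Interleaving t2 and s4 yields the bytes 2C AC E2 82 in memory, from
      // which the three-byte shuffle below picks E2 82 AC -- the UTF-8
      // encoding of U+20AC.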

      // 4. expand code units 16-bit => 32-bit
      const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
      const __m128i out1 = _mm_unpackhi_epi16(t2, s4);

      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
      const uint16_t mask =
          (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
      if (mask == 0) {
        // We only have three-byte code units. Use fast path.
        const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14,
                                              15, 13, -1, -1, -1, -1);
        const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
        const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
        _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
        utf8_output += 12;
        _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
        utf8_output += 12;
        buf += 8;
        continue;
      }
      const uint8_t mask0 = uint8_t(mask);

      const uint8_t *row0 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
      const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
      const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);

      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);

      const uint8_t *row1 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
      const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
      const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);

      _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
      utf8_output += row0[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
      utf8_output += row1[0];

      buf += 8;
      // surrogate pair(s) in a register
    } else {
      // Let us do a scalar fallback.
      // It may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint16_t word = big_endian ? scalar::u16_swap_bytes(buf[k]) : buf[k];
        if ((word & 0xFF80) == 0) {
          *utf8_output++ = char(word);
        } else if ((word & 0xF800) == 0) {
          *utf8_output++ = char((word >> 6) | 0b11000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else if ((word & 0xF800) != 0xD800) {
          *utf8_output++ = char((word >> 12) | 0b11100000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else {
          // must be a surrogate pair
          uint16_t diff = uint16_t(word - 0xD800);
          uint16_t next_word =
              big_endian ? scalar::u16_swap_bytes(buf[k + 1]) : buf[k + 1];
          k++;
          uint16_t diff2 = uint16_t(next_word - 0xDC00);
          if ((diff | diff2) > 0x3FF) {
            return std::make_pair(nullptr, utf8_output);
          }
          uint32_t value = (diff << 10) + diff2 + 0x10000;
          *utf8_output++ = char((value >> 18) | 0b11110000);
          *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
          *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((value & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while

  return std::make_pair(buf, utf8_output);
}

/*
  Returns a pair: a result struct and utf8_output.
  If there is an error, the count field of the result is the position of the
  error. Otherwise, it is the position of the first unprocessed byte in buf
  (even if finished). A scalar routine should carry on the conversion of the
  tail if needed.
*/
template <endianness big_endian>
std::pair<result, char *>
sse_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len,
                                      char *utf8_output) {
  const char16_t *start = buf;
  const char16_t *end = buf + len;

  const __m128i v_0000 = _mm_setzero_si128();
  const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
  const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
  const size_t safety_margin =
      12; // to avoid overruns, see issue
          // https://github.com/simdutf/simdutf/issues/92

  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
    __m128i in = _mm_loadu_si128((__m128i *)buf);
    if (big_endian) {
      const __m128i swap =
          _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
      in = _mm_shuffle_epi8(in, swap);
    }
    // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
    const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80);
    if (_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!!
      __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1);
      if (big_endian) {
        const __m128i swap =
            _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
        nextin = _mm_shuffle_epi8(nextin, swap);
      }
      if (!_mm_testz_si128(nextin, v_ff80)) {
        // 1. pack the bytes
        // obviously suboptimal.
        const __m128i utf8_packed = _mm_packus_epi16(in, in);
        // 2. store (16 bytes)
        _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
        // 3. adjust pointers
        buf += 8;
        utf8_output += 8;
        in = nextin;
      } else {
        // 1. pack the bytes
        // obviously suboptimal.
        const __m128i utf8_packed = _mm_packus_epi16(in, nextin);
        // 2. store (16 bytes)
        _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
        // 3. adjust pointers
        buf += 16;
        utf8_output += 16;
        continue; // we are done for this round!
      }
    }

    // no bits set above the 7th bit
    const __m128i one_byte_bytemask =
        _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000);
    const uint16_t one_byte_bitmask =
        static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));

    // no bits set above the 11th bit
    const __m128i one_or_two_bytes_bytemask =
        _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000);
    const uint16_t one_or_two_bytes_bitmask =
        static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));

    if (one_or_two_bytes_bitmask == 0xffff) {
      internal::westmere::write_v_u16_11bits_to_utf8(
          in, utf8_output, one_byte_bytemask, one_byte_bitmask);
      buf += 8;
      continue;
    }

    // 1. Check if there are any surrogate words in the input chunk.
    //    We must also deal with the situation where a surrogate word sits
    //    at the end of a chunk.
    const __m128i surrogates_bytemask =
        _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);

    // bitmask = 0x0000 if there are no surrogates
    //         = 0xc000 if the last word is a surrogate
    const uint16_t surrogates_bitmask =
        static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
    // It might seem like checking for surrogates_bitmask == 0xc000 could help.
    // However, it is likely an uncommon occurrence.
    if (surrogates_bitmask == 0x0000) {
      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
      const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);

      /* In this branch we handle three cases:
         1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
            a single UTF-8 byte
         2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two
            UTF-8 bytes
         3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
            three UTF-8 bytes

        We expand the input word (16-bit) into two code units (32-bit), thus
        we have room for four bytes. However, we need five distinct bit
        layouts. Note that the last byte in cases #2 and #3 is the same.

        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
        in register t2.

        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
        either byte 1 for case #2 or byte 2 for case #3. Note that they
        differ by exactly one bit.

        Finally from these two code units we build a proper UTF-8 sequence,
        taking into account the case (i.e., the number of bytes to write).
      */
      /**
       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
       * t2 => [0ccc|cccc] [10cc|cccc]
       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
       */
#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
      const __m128i t0 = _mm_shuffle_epi8(in, dup_even);
      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
      const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
      const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000));

      // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
      const __m128i s0 = _mm_srli_epi16(in, 4);
      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
      const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
      const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
      const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
      const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask,
                                          simdutf_vec(0b0100000000000000));
      const __m128i s4 = _mm_xor_si128(s3, m0);
#undef simdutf_vec

      // 4. expand code units 16-bit => 32-bit
      const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
      const __m128i out1 = _mm_unpackhi_epi16(t2, s4);

      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
      const uint16_t mask =
          (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
      if (mask == 0) {
        // We only have three-byte code units. Use fast path.
        const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14,
                                              15, 13, -1, -1, -1, -1);
        const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
        const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
        _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
        utf8_output += 12;
        _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
        utf8_output += 12;
        buf += 8;
        continue;
      }
      const uint8_t mask0 = uint8_t(mask);

      const uint8_t *row0 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
      const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
      const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);

      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);

      const uint8_t *row1 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
      const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
      const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);

      _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
      utf8_output += row0[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
      utf8_output += row1[0];

      buf += 8;
      // surrogate pair(s) in a register
    } else {
      // Let us do a scalar fallback.
      // It may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint16_t word = big_endian ? scalar::u16_swap_bytes(buf[k]) : buf[k];
        if ((word & 0xFF80) == 0) {
          *utf8_output++ = char(word);
        } else if ((word & 0xF800) == 0) {
          *utf8_output++ = char((word >> 6) | 0b11000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else if ((word & 0xF800) != 0xD800) {
          *utf8_output++ = char((word >> 12) | 0b11100000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else {
          // must be a surrogate pair
          uint16_t diff = uint16_t(word - 0xD800);
          uint16_t next_word =
              big_endian ? scalar::u16_swap_bytes(buf[k + 1]) : buf[k + 1];
          k++;
          uint16_t diff2 = uint16_t(next_word - 0xDC00);
          if ((diff | diff2) > 0x3FF) {
            return std::make_pair(
                result(error_code::SURROGATE, buf - start + k - 1),
                utf8_output);
          }
          uint32_t value = (diff << 10) + diff2 + 0x10000;
          *utf8_output++ = char((value >> 18) | 0b11110000);
          *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
          *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((value & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while

  return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
}
/* end file src/westmere/sse_convert_utf16_to_utf8.cpp */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
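
// Worked example for the scalar surrogate branch above (added for
// exposition): U+1F600 is stored as the pair 0xD83D 0xDE00, and the
// diff/diff2 arithmetic reassembles it. Kept disabled.
#if 0
static_assert(((0xD83D - 0xD800) << 10) + (0xDE00 - 0xDC00) + 0x10000 ==
                  0x1F600,
              "U+1F600 round-trips through its surrogate pair");
#endif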

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
/* begin file src/westmere/sse_convert_utf16_to_utf32.cpp */
/*
    The vectorized algorithm works on a single SSE register, i.e., it
    loads eight 16-bit code units.

    We consider three cases:
    1. an input register contains no surrogates and each value
       is in the range 0x0000 .. 0x07ff.
    2. an input register contains no surrogates and the values
       are in the range 0x0000 .. 0xffff.
    3. an input register contains surrogates --- i.e. code points
       can have 16 or 32 bits.

    Ad 1.

    When values are less than 0x0800, it means that a 16-bit code unit
    can be converted into: 1) a single UTF-8 byte (when it is an ASCII
    char) or 2) two UTF-8 bytes.

    For this case we do only some shuffle to obtain these 2-byte
    codes and finally compress the whole SSE register with a single
    shuffle.

    We need a 256-entry lookup table to get a compression pattern
    and the number of output bytes in the compressed vector register.
    Each entry occupies 17 bytes.

    Ad 2.

    When values fit in 16-bit code units, but are above 0x07ff, then
    a single word may produce one, two or three UTF-8 bytes.

    We prepare data for all these three cases in two registers.
    The first register contains the lower two UTF-8 bytes (used in all
    cases), while the second one contains just the third byte for
    the three-UTF-8-bytes case.

    Finally these two registers are interleaved, forming an eight-element
    array of 32-bit values. The array spans two SSE registers.
    The bytes from the registers are compressed using two shuffles.

    We need a 256-entry lookup table to get a compression pattern
    and the number of output bytes in the compressed vector register.
    Each entry occupies 17 bytes.


    To summarize:
    - We need two 256-entry tables that have 8704 bytes in total.
*/

/*
  Returns a pair: the first unprocessed byte from buf and utf32_output
  A scalar routine should carry on the conversion of the tail.
*/
template <endianness big_endian>
std::pair<const char16_t *, char32_t *>
sse_convert_utf16_to_utf32(const char16_t *buf, size_t len,
                           char32_t *utf32_output) {
  const char16_t *end = buf + len;

  const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
  const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);

  while (end - buf >= 8) {
    __m128i in = _mm_loadu_si128((__m128i *)buf);

    if (big_endian) {
      const __m128i swap =
          _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
      in = _mm_shuffle_epi8(in, swap);
    }

    // 1. Check if there are any surrogate words in the input chunk.
    //    We must also deal with the situation where a surrogate word sits
    //    at the end of a chunk.
    const __m128i surrogates_bytemask =
        _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);

    // bitmask = 0x0000 if there are no surrogates
    //         = 0xc000 if the last word is a surrogate
    const uint16_t surrogates_bitmask =
        static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
    // It might seem like checking for surrogates_bitmask == 0xc000 could help.
    // However, it is likely an uncommon occurrence.
    if (surrogates_bitmask == 0x0000) {
      // case: no surrogate pairs, extend 16-bit code units to 32-bit code
      // units
      _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output),
                       _mm_cvtepu16_epi32(in));
      _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4),
                       _mm_cvtepu16_epi32(_mm_srli_si128(in, 8)));
      utf32_output += 8;
      buf += 8;
      // surrogate pair(s) in a register
    } else {
      // Let us do a scalar fallback.
      // It may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint16_t word = big_endian ? scalar::u16_swap_bytes(buf[k]) : buf[k];
        if ((word & 0xF800) != 0xD800) {
          *utf32_output++ = char32_t(word);
        } else {
          // must be a surrogate pair
          uint16_t diff = uint16_t(word - 0xD800);
          uint16_t next_word =
              big_endian ? scalar::u16_swap_bytes(buf[k + 1]) : buf[k + 1];
          k++;
          uint16_t diff2 = uint16_t(next_word - 0xDC00);
          if ((diff | diff2) > 0x3FF) {
            return std::make_pair(nullptr, utf32_output);
          }
          uint32_t value = (diff << 10) + diff2 + 0x10000;
          *utf32_output++ = char32_t(value);
        }
      }
      buf += k;
    }
  } // while
  return std::make_pair(buf, utf32_output);
}

/*
  Returns a pair: a result struct and utf32_output.
  If there is an error, the count field of the result is the position of the
  error. Otherwise, it is the position of the first unprocessed byte in buf
  (even if finished). A scalar routine should carry on the conversion of the
  tail if needed.
*/
template <endianness big_endian>
std::pair<result, char32_t *>
sse_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len,
                                       char32_t *utf32_output) {
  const char16_t *start = buf;
  const char16_t *end = buf + len;

  const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
  const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);

  while (end - buf >= 8) {
    __m128i in = _mm_loadu_si128((__m128i *)buf);

    if (big_endian) {
      const __m128i swap =
          _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
      in = _mm_shuffle_epi8(in, swap);
    }

    // 1. Check if there are any surrogate words in the input chunk.
    //    We must also deal with the situation where a surrogate word sits
    //    at the end of a chunk.
    const __m128i surrogates_bytemask =
        _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);

    // bitmask = 0x0000 if there are no surrogates
    //         = 0xc000 if the last word is a surrogate
    const uint16_t surrogates_bitmask =
        static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
    // It might seem like checking for surrogates_bitmask == 0xc000 could help.
    // However, it is likely an uncommon occurrence.
    if (surrogates_bitmask == 0x0000) {
      // case: no surrogate pairs, extend 16-bit code units to 32-bit code
      // units
      _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output),
                       _mm_cvtepu16_epi32(in));
      _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4),
                       _mm_cvtepu16_epi32(_mm_srli_si128(in, 8)));
      utf32_output += 8;
      buf += 8;
      // surrogate pair(s) in a register
    } else {
      // Let us do a scalar fallback.
      // It may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint16_t word = big_endian ? scalar::u16_swap_bytes(buf[k]) : buf[k];
        if ((word & 0xF800) != 0xD800) {
          *utf32_output++ = char32_t(word);
        } else {
          // must be a surrogate pair
          uint16_t diff = uint16_t(word - 0xD800);
          uint16_t next_word =
              big_endian ? scalar::u16_swap_bytes(buf[k + 1]) : buf[k + 1];
          k++;
          uint16_t diff2 = uint16_t(next_word - 0xDC00);
          if ((diff | diff2) > 0x3FF) {
            return std::make_pair(
                result(error_code::SURROGATE, buf - start + k - 1),
                utf32_output);
          }
          uint32_t value = (diff << 10) + diff2 + 0x10000;
          *utf32_output++ = char32_t(value);
        }
      }
      buf += k;
    }
  } // while
  return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output);
}
/* end file src/westmere/sse_convert_utf16_to_utf32.cpp */
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
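
// Note (added for exposition): in the scalar branch above, the single test
// (diff | diff2) > 0x3FF rejects every malformed pair at once, because a
// valid pair needs both differences to fit in 10 bits. For example, a high
// surrogate followed by 'A' (0x0041) gives diff2 = 0x0041 - 0xDC00 = 0x2441
// (as uint16_t), which exceeds 0x3FF, so the pair is flagged as an error.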

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
/* begin file src/westmere/sse_convert_utf32_to_latin1.cpp */
std::pair<const char32_t *, char *>
sse_convert_utf32_to_latin1(const char32_t *buf, size_t len,
                            char *latin1_output) {
  const size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16

  __m128i high_bytes_mask = _mm_set1_epi32(0xFFFFFF00);
  __m128i shufmask =
      _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0);

  for (size_t i = 0; i < rounded_len; i += 16) {
    __m128i in1 = _mm_loadu_si128((__m128i *)buf);
    __m128i in2 = _mm_loadu_si128((__m128i *)(buf + 4));
    __m128i in3 = _mm_loadu_si128((__m128i *)(buf + 8));
    __m128i in4 = _mm_loadu_si128((__m128i *)(buf + 12));

    __m128i check_combined = _mm_or_si128(in1, in2);
    check_combined = _mm_or_si128(check_combined, in3);
    check_combined = _mm_or_si128(check_combined, in4);

    if (!_mm_testz_si128(check_combined, high_bytes_mask)) {
      return std::make_pair(nullptr, latin1_output);
    }
    __m128i pack1 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in1, shufmask),
                                       _mm_shuffle_epi8(in2, shufmask));
    __m128i pack2 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in3, shufmask),
                                       _mm_shuffle_epi8(in4, shufmask));
    __m128i pack = _mm_unpacklo_epi64(pack1, pack2);
    _mm_storeu_si128((__m128i *)latin1_output, pack);
    latin1_output += 16;
    buf += 16;
  }

  return std::make_pair(buf, latin1_output);
}

std::pair<result, char *>
sse_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
                                        char *latin1_output) {
  const char32_t *start = buf;
  const size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16

  __m128i high_bytes_mask = _mm_set1_epi32(0xFFFFFF00);
  __m128i shufmask =
      _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0);

  for (size_t i = 0; i < rounded_len; i += 16) {
    __m128i in1 = _mm_loadu_si128((__m128i *)buf);
    __m128i in2 = _mm_loadu_si128((__m128i *)(buf + 4));
    __m128i in3 = _mm_loadu_si128((__m128i *)(buf + 8));
    __m128i in4 = _mm_loadu_si128((__m128i *)(buf + 12));

    __m128i check_combined = _mm_or_si128(in1, in2);
    check_combined = _mm_or_si128(check_combined, in3);
    check_combined = _mm_or_si128(check_combined, in4);

    if (!_mm_testz_si128(check_combined, high_bytes_mask)) {
      // Fall back to scalar code to handle the error
      for (int k = 0; k < 16; k++) {
        char32_t codepoint = buf[k];
        if (codepoint <= 0xff) {
          *latin1_output++ = char(codepoint);
        } else {
          return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
                                latin1_output);
        }
      }
      buf += 16;
      continue;
    }
    __m128i pack1 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in1, shufmask),
                                       _mm_shuffle_epi8(in2, shufmask));
    __m128i pack2 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in3, shufmask),
                                       _mm_shuffle_epi8(in4, shufmask));
    __m128i pack = _mm_unpacklo_epi64(pack1, pack2);
    _mm_storeu_si128((__m128i *)latin1_output, pack);
    latin1_output += 16;
    buf += 16;
  }

  return std::make_pair(result(error_code::SUCCESS, buf - start),
                        latin1_output);
}
/* end file src/westmere/sse_convert_utf32_to_latin1.cpp */
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
|
|
/* begin file src/westmere/sse_convert_utf32_to_utf8.cpp */
|
|
std::pair<const char32_t *, char *>
|
|
sse_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_output) {
|
|
const char32_t *end = buf + len;
|
|
|
|
const __m128i v_0000 = _mm_setzero_si128(); // __m128i = 128-bit integer vector
|
|
const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); // 1111 1000 0000
|
|
// 0000
|
|
const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080); // 1100 0000 1000
|
|
// 0000
|
|
const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80); // 1111 1111 1000
|
|
// 0000
|
|
const __m128i v_ffff0000 = _mm_set1_epi32(
|
|
(uint32_t)0xffff0000); // 1111 1111 1111 1111 0000 0000 0000 0000
|
|
const __m128i v_7fffffff = _mm_set1_epi32(
|
|
(uint32_t)0x7fffffff); // 0111 1111 1111 1111 1111 1111 1111 1111
|
|
__m128i running_max = _mm_setzero_si128();
|
|
__m128i forbidden_bytemask = _mm_setzero_si128();
|
|
const size_t safety_margin =
|
|
12; // to avoid overruns, see issue
|
|
// https://github.com/simdutf/simdutf/issues/92
|
|
|
|
while (end - buf >=
|
|
std::ptrdiff_t(
|
|
16 + safety_margin)) { // buf is a char32_t pointer; each char32_t
// has 4 bytes or 32 bits, thus buf + 16 *
// char32_t = 512 bits = 64 bytes
// We load two 16-byte registers for a total of 32 bytes or 8 characters.
|
|
__m128i in = _mm_loadu_si128((__m128i *)buf);
|
|
__m128i nextin = _mm_loadu_si128(
    (__m128i *)buf + 1); // These two registers can hold only 8 UTF-32 chars
|
|
running_max = _mm_max_epu32(
|
|
_mm_max_epu32(in, running_max), // take element-wise max char32_t from
|
|
// in and running_max vector
|
|
nextin); // and take element-wise max element from nextin and
|
|
// running_max vector
|
|
|
|
// Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
|
|
// saturation
|
|
__m128i in_16 = _mm_packus_epi32(
|
|
_mm_and_si128(in, v_7fffffff),
|
|
_mm_and_si128(
|
|
nextin,
|
|
v_7fffffff)); // in this context, pack the two __m128i into a single vector
// By clearing the highest bit (& v_7fffffff) we make sure all values are
// interpreted as non-negative; remember that in two's complement, a leading
// 0 bit means a non-negative number. Valid Unicode code points lie well
// below the range where this would cause issues, so that's OK.
|
|
|
|
// Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp
|
|
|
|
// Check for ASCII fast path
|
|
|
|
// ASCII fast path!!!!
|
|
// We eagerly load another 32 bytes, hoping that they will be ASCII too.
|
|
// The intuition is that we try to collect 16 ASCII characters which
|
|
// requires a total of 64 bytes of input. If we fail, we just pass thirdin
|
|
// and fourthin as our new inputs.
|
|
if (_mm_testz_si128(in_16, v_ff80)) { // if the first two blocks are ASCII
|
|
__m128i thirdin = _mm_loadu_si128((__m128i *)buf + 2);
|
|
__m128i fourthin = _mm_loadu_si128((__m128i *)buf + 3);
|
|
running_max = _mm_max_epu32(
|
|
_mm_max_epu32(thirdin, running_max),
|
|
fourthin); // take the running max of all 4 vectors thus far
|
|
__m128i nextin_16 = _mm_packus_epi32(
|
|
_mm_and_si128(thirdin, v_7fffffff),
|
|
_mm_and_si128(fourthin,
|
|
v_7fffffff)); // pack into 1 vector, now you have two
|
|
if (!_mm_testz_si128(
|
|
nextin_16,
|
|
v_ff80)) { // checks if the second packed vector is ASCII, if not:
|
|
// 1. pack the bytes
|
|
// obviously suboptimal.
|
|
const __m128i utf8_packed = _mm_packus_epi16(
    in_16, in_16); // duplicates in_16 within one vector
|
|
// 2. store (16 bytes)
|
|
_mm_storeu_si128((__m128i *)utf8_output,
|
|
utf8_packed); // put them into the output
|
|
// 3. adjust pointers
|
|
buf += 8; // the char32_t buffer pointer advances by 8 char32_t * 32
// bits = 256 bits
utf8_output +=
    8; // advance the output by the 8 ASCII bytes just written
|
|
// Proceed with next input
|
|
in_16 = nextin_16;
|
|
// We need to update in and nextin because they are used later.
|
|
in = thirdin;
|
|
nextin = fourthin;
|
|
} else {
|
|
// 1. pack the bytes
|
|
const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16);
|
|
// 2. store (16 bytes)
|
|
_mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
|
|
// 3. adjust pointers
|
|
buf += 16;
|
|
utf8_output += 16;
|
|
continue; // we are done for this round!
|
|
}
|
|
}
|
|
|
|
// no bits set above 7th bit -- find out all the ASCII characters
|
|
const __m128i one_byte_bytemask =
|
|
_mm_cmpeq_epi16( // this compares 16-bit units at a time:
    _mm_and_si128(in_16, v_ff80), // keep only the upper 9 bits of each
                                  // 16-bit/2-byte unit
    v_0000 //
); // they should all be zero if they are ASCII. E.g. ASCII as a 16-bit
// unit has the format 0000 0000 0XXX XXXX
// _mm_cmpeq_epi16 returns 1111 1111 1111 1111 for equal 16-bit/2-byte
// units, and 0000 0000 0000 0000 otherwise
const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(
    one_byte_bytemask)); // collect the MSB of each byte from the previous
                         // vector and put them into a uint16_t mask
|
|
|
|
// no bits set above 11th bit
|
|
const __m128i one_or_two_bytes_bytemask =
|
|
_mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000);
|
|
const uint16_t one_or_two_bytes_bitmask =
|
|
static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
|
|
|
|
if (one_or_two_bytes_bitmask == 0xffff) {
|
|
// case: all code units either produce 1 or 2 UTF-8 bytes (at least one
|
|
// produces 2 bytes)
|
|
// 1. prepare 2-byte values
|
|
// input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
|
|
// expected output : [110a|aaaa|10bb|bbbb] x 8
|
|
const __m128i v_1f00 =
|
|
_mm_set1_epi16((int16_t)0x1f00); // 0001 1111 0000 0000
|
|
const __m128i v_003f =
|
|
_mm_set1_epi16((int16_t)0x003f); // 0000 0000 0011 1111
|
|
|
|
// t0 = [000a|aaaa|bbbb|bb00]
|
|
const __m128i t0 = _mm_slli_epi16(in_16, 2); // shift packed vector by two
|
|
// t1 = [000a|aaaa|0000|0000]
|
|
const __m128i t1 =
    _mm_and_si128(t0, v_1f00); // potential first utf8 byte
|
|
// t2 = [0000|0000|00bb|bbbb]
|
|
const __m128i t2 =
|
|
_mm_and_si128(in_16, v_003f); // potential second utf8 byte
|
|
// t3 = [000a|aaaa|00bb|bbbb]
|
|
const __m128i t3 =
|
|
_mm_or_si128(t1, t2); // first and second potential utf8 byte together
|
|
// t4 = [110a|aaaa|10bb|bbbb]
|
|
const __m128i t4 = _mm_or_si128(
|
|
t3,
|
|
v_c080); // t3 | 1100 0000 1000 0000 = full potential 2-byte utf8 unit
|
|
|
|
// 2. merge ASCII and 2-byte codewords
|
|
const __m128i utf8_unpacked =
|
|
_mm_blendv_epi8(t4, in_16, one_byte_bytemask);
|
|
|
|
// 3. prepare bitmask for 8-bit lookup
|
|
// one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h -
|
|
// MSB, a - LSB)
|
|
const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a
|
|
const uint16_t m1 =
|
|
static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
|
|
const uint8_t m2 =
|
|
static_cast<uint8_t>((m0 | m1) & 0xff); // m2 = hdgcfbea
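// Worked example (illustrative): if only the first 16-bit unit is ASCII,
// one_byte_bitmask = 0x0003 (each unit contributes two bits), so
// m0 = 0x0001, m1 = 0x0000 and m2 = 0x01, selecting the row of the
// lookup table for "first unit needs a single byte".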
|
|
// 4. pack the bytes
|
|
const uint8_t *row =
|
|
&simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
|
|
const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
|
|
const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
|
|
|
|
// 5. store bytes
|
|
_mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
|
|
|
|
// 6. adjust pointers
|
|
buf += 8;
|
|
utf8_output += row[0];
|
|
continue;
|
|
}
|
|
|
|
// Check for overflow in packing
|
|
|
|
const __m128i saturation_bytemask = _mm_cmpeq_epi32(
|
|
_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
|
|
const uint32_t saturation_bitmask =
|
|
static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
|
|
if (saturation_bitmask == 0xffff) {
|
|
// case: code units from register produce either 1, 2 or 3 UTF-8 bytes
|
|
const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
|
|
forbidden_bytemask =
|
|
_mm_or_si128(forbidden_bytemask,
|
|
_mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800));
|
|
|
|
const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
|
|
0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
|
|
|
|
/* In this branch we handle three cases:
|
|
1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
   single UTF-8 byte
|
|
2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] -
|
|
two UTF-8 bytes
|
|
3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
|
|
three UTF-8 bytes
|
|
|
|
We expand the input word (16-bit) into two code units (32-bit), thus
|
|
we have room for four bytes. However, we need five distinct bit
|
|
layouts. Note that the last byte in cases #2 and #3 is the same.
|
|
|
|
We precompute byte 1 for case #1 and the common byte for cases #2 & #3
|
|
in register t2.
|
|
|
|
We precompute byte 1 for case #3 and -- **conditionally** -- precompute
|
|
either byte 1 for case #2 or byte 2 for case #3. Note that they
|
|
differ by exactly one bit.
|
|
|
|
Finally, from these two code units we build a proper UTF-8 sequence, taking
into account the case (i.e., the number of bytes to write).
|
|
*/
|
|
/**
|
|
* Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
|
|
* t2 => [0ccc|cccc] [10cc|cccc]
|
|
* s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
|
|
*/
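// Worked example (illustrative): for U+20AC (EURO SIGN), the 16-bit input is
// [0010|0000|1010|1100], i.e. aaaa = 0010, bbbbbb = 000010, cccccc = 101100,
// which yields the three UTF-8 bytes 0xE2, 0x82, 0xAC.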
|
|
#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
|
|
// [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
|
|
const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even);
|
|
// [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
|
|
const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
|
|
// [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
|
|
const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000));
|
|
|
|
// [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
|
|
const __m128i s0 = _mm_srli_epi16(in_16, 4);
|
|
// [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
|
|
const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
|
|
// [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
|
|
const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
|
|
// [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
|
|
const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
|
|
const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask,
|
|
simdutf_vec(0b0100000000000000));
|
|
const __m128i s4 = _mm_xor_si128(s3, m0);
|
|
#undef simdutf_vec
|
|
|
|
// 4. expand code units 16-bit => 32-bit
|
|
const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
|
|
const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
|
|
|
|
// 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
|
|
const uint16_t mask =
|
|
(one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
|
|
if (mask == 0) {
|
|
// We only have three-byte code units. Use fast path.
|
|
const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14,
|
|
15, 13, -1, -1, -1, -1);
|
|
const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
|
|
const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
|
|
_mm_storeu_si128((__m128i *)utf8_output, utf8_0);
|
|
utf8_output += 12;
|
|
_mm_storeu_si128((__m128i *)utf8_output, utf8_1);
|
|
utf8_output += 12;
|
|
buf += 8;
|
|
continue;
|
|
}
|
|
const uint8_t mask0 = uint8_t(mask);
|
|
|
|
const uint8_t *row0 =
|
|
&simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
|
|
const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
|
|
const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
|
|
|
|
const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
|
|
|
|
const uint8_t *row1 =
|
|
&simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
|
|
const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
|
|
const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
|
|
|
|
_mm_storeu_si128((__m128i *)utf8_output, utf8_0);
|
|
utf8_output += row0[0];
|
|
_mm_storeu_si128((__m128i *)utf8_output, utf8_1);
|
|
utf8_output += row1[0];
|
|
|
|
buf += 8;
|
|
} else {
|
|
// case: at least one 32-bit word produces a surrogate pair in UTF-16, i.e.,
// it will produce four UTF-8 bytes. Let us do a scalar fallback. It may seem
// wasteful to use scalar code, but being efficient with SIMD in the
// presence of surrogate pairs may require non-trivial tables.
|
|
size_t forward = 15;
|
|
size_t k = 0;
|
|
if (size_t(end - buf) < forward + 1) {
|
|
forward = size_t(end - buf - 1);
|
|
}
|
|
for (; k < forward; k++) {
|
|
uint32_t word = buf[k];
|
|
if ((word & 0xFFFFFF80) == 0) {
|
|
*utf8_output++ = char(word);
|
|
} else if ((word & 0xFFFFF800) == 0) {
|
|
*utf8_output++ = char((word >> 6) | 0b11000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else if ((word & 0xFFFF0000) == 0) {
|
|
if (word >= 0xD800 && word <= 0xDFFF) {
|
|
return std::make_pair(nullptr, utf8_output);
|
|
}
|
|
*utf8_output++ = char((word >> 12) | 0b11100000);
|
|
*utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else {
|
|
if (word > 0x10FFFF) {
|
|
return std::make_pair(nullptr, utf8_output);
|
|
}
|
|
*utf8_output++ = char((word >> 18) | 0b11110000);
|
|
*utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
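// Worked example (illustrative): word = 0x1F600 emits
// 0xF0, 0x9F, 0x98, 0x80 -- the UTF-8 encoding of U+1F600.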
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
} // while
|
|
|
|
// check for invalid input
|
|
const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff);
|
|
if (static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(
|
|
_mm_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffff) {
|
|
return std::make_pair(nullptr, utf8_output);
|
|
}
|
|
|
|
if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
|
|
return std::make_pair(nullptr, utf8_output);
|
|
}
|
|
|
|
return std::make_pair(buf, utf8_output);
|
|
}
|
|
|
|
std::pair<result, char *>
|
|
sse_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
|
|
char *utf8_output) {
|
|
const char32_t *end = buf + len;
|
|
const char32_t *start = buf;
|
|
|
|
const __m128i v_0000 = _mm_setzero_si128();
|
|
const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
|
|
const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080);
|
|
const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80);
|
|
const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
|
|
const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff);
|
|
const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff);
|
|
|
|
const size_t safety_margin =
|
|
12; // to avoid overruns, see issue
|
|
// https://github.com/simdutf/simdutf/issues/92
|
|
|
|
while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
|
|
// We load two 16-byte registers for a total of 32 bytes or 8 characters.
|
|
__m128i in = _mm_loadu_si128((__m128i *)buf);
|
|
__m128i nextin = _mm_loadu_si128((__m128i *)buf + 1);
|
|
// Check for too large input
|
|
__m128i max_input = _mm_max_epu32(_mm_max_epu32(in, nextin), v_10ffff);
|
|
if (static_cast<uint16_t>(_mm_movemask_epi8(
|
|
_mm_cmpeq_epi32(max_input, v_10ffff))) != 0xffff) {
|
|
return std::make_pair(result(error_code::TOO_LARGE, buf - start),
|
|
utf8_output);
|
|
}
|
|
|
|
// Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
|
|
// saturation
|
|
__m128i in_16 = _mm_packus_epi32(_mm_and_si128(in, v_7fffffff),
|
|
_mm_and_si128(nextin, v_7fffffff));
|
|
|
|
// Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp
|
|
|
|
// Check for ASCII fast path
|
|
if (_mm_testz_si128(in_16, v_ff80)) { // ASCII fast path!!!!
|
|
// 1. pack the bytes
|
|
// obviously suboptimal.
|
|
const __m128i utf8_packed = _mm_packus_epi16(in_16, in_16);
|
|
// 2. store (16 bytes)
|
|
_mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
|
|
// 3. adjust pointers
|
|
buf += 8;
|
|
utf8_output += 8;
|
|
continue;
|
|
}
|
|
|
|
// no bits set above 7th bit
|
|
const __m128i one_byte_bytemask =
|
|
_mm_cmpeq_epi16(_mm_and_si128(in_16, v_ff80), v_0000);
|
|
const uint16_t one_byte_bitmask =
|
|
static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
|
|
|
|
// no bits set above 11th bit
|
|
const __m128i one_or_two_bytes_bytemask =
|
|
_mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000);
|
|
const uint16_t one_or_two_bytes_bitmask =
|
|
static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
|
|
|
|
if (one_or_two_bytes_bitmask == 0xffff) {
|
|
// case: all code units either produce 1 or 2 UTF-8 bytes (at least one
|
|
// produces 2 bytes)
|
|
// 1. prepare 2-byte values
|
|
// input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
|
|
// expected output : [110a|aaaa|10bb|bbbb] x 8
|
|
const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
|
|
const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
|
|
|
|
// t0 = [000a|aaaa|bbbb|bb00]
|
|
const __m128i t0 = _mm_slli_epi16(in_16, 2);
|
|
// t1 = [000a|aaaa|0000|0000]
|
|
const __m128i t1 = _mm_and_si128(t0, v_1f00);
|
|
// t2 = [0000|0000|00bb|bbbb]
|
|
const __m128i t2 = _mm_and_si128(in_16, v_003f);
|
|
// t3 = [000a|aaaa|00bb|bbbb]
|
|
const __m128i t3 = _mm_or_si128(t1, t2);
|
|
// t4 = [110a|aaaa|10bb|bbbb]
|
|
const __m128i t4 = _mm_or_si128(t3, v_c080);
|
|
|
|
// 2. merge ASCII and 2-byte codewords
|
|
const __m128i utf8_unpacked =
|
|
_mm_blendv_epi8(t4, in_16, one_byte_bytemask);
|
|
|
|
// 3. prepare bitmask for 8-bit lookup
|
|
// one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h -
|
|
// MSB, a - LSB)
|
|
const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a
|
|
const uint16_t m1 =
|
|
static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
|
|
const uint8_t m2 =
|
|
static_cast<uint8_t>((m0 | m1) & 0xff); // m2 = hdgcfbea
|
|
// 4. pack the bytes
|
|
const uint8_t *row =
|
|
&simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
|
|
const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
|
|
const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
|
|
|
|
// 5. store bytes
|
|
_mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
|
|
|
|
// 6. adjust pointers
|
|
buf += 8;
|
|
utf8_output += row[0];
|
|
continue;
|
|
}
|
|
|
|
// Check for overflow in packing
|
|
const __m128i saturation_bytemask = _mm_cmpeq_epi32(
|
|
_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
|
|
const uint32_t saturation_bitmask =
|
|
static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
|
|
|
|
if (saturation_bitmask == 0xffff) {
|
|
// case: code units from register produce either 1, 2 or 3 UTF-8 bytes
|
|
|
|
// Check for illegal surrogate code units
|
|
const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
|
|
const __m128i forbidden_bytemask =
|
|
_mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800);
|
|
if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
|
|
return std::make_pair(result(error_code::SURROGATE, buf - start),
|
|
utf8_output);
|
|
}
|
|
|
|
const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
|
|
0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
|
|
|
|
/* In this branch we handle three cases:
|
|
1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
   single UTF-8 byte
|
|
2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] -
|
|
two UTF-8 bytes
|
|
3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
|
|
three UTF-8 bytes
|
|
|
|
We expand the input word (16-bit) into two code units (32-bit), thus
|
|
we have room for four bytes. However, we need five distinct bit
|
|
layouts. Note that the last byte in cases #2 and #3 is the same.
|
|
|
|
We precompute byte 1 for case #1 and the common byte for cases #2 & #3
|
|
in register t2.
|
|
|
|
We precompute byte 1 for case #3 and -- **conditionally** -- precompute
|
|
either byte 1 for case #2 or byte 2 for case #3. Note that they
|
|
differ by exactly one bit.
|
|
|
|
Finally, from these two code units we build a proper UTF-8 sequence, taking
into account the case (i.e., the number of bytes to write).
|
|
*/
|
|
/**
|
|
* Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
|
|
* t2 => [0ccc|cccc] [10cc|cccc]
|
|
* s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
|
|
*/
|
|
#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
|
|
// [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
|
|
const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even);
|
|
// [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
|
|
const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
|
|
// [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
|
|
const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000));
|
|
|
|
// [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
|
|
const __m128i s0 = _mm_srli_epi16(in_16, 4);
|
|
// [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
|
|
const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
|
|
// [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
|
|
const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
|
|
// [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
|
|
const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
|
|
const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask,
|
|
simdutf_vec(0b0100000000000000));
|
|
const __m128i s4 = _mm_xor_si128(s3, m0);
|
|
#undef simdutf_vec
|
|
|
|
// 4. expand code units 16-bit => 32-bit
|
|
const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
|
|
const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
|
|
|
|
// 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
|
|
const uint16_t mask =
|
|
(one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
|
|
if (mask == 0) {
|
|
// We only have three-byte code units. Use fast path.
|
|
const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14,
|
|
15, 13, -1, -1, -1, -1);
|
|
const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
|
|
const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
|
|
_mm_storeu_si128((__m128i *)utf8_output, utf8_0);
|
|
utf8_output += 12;
|
|
_mm_storeu_si128((__m128i *)utf8_output, utf8_1);
|
|
utf8_output += 12;
|
|
buf += 8;
|
|
continue;
|
|
}
|
|
const uint8_t mask0 = uint8_t(mask);
|
|
|
|
const uint8_t *row0 =
|
|
&simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
|
|
const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
|
|
const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
|
|
|
|
const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
|
|
|
|
const uint8_t *row1 =
|
|
&simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
|
|
const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
|
|
const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
|
|
|
|
_mm_storeu_si128((__m128i *)utf8_output, utf8_0);
|
|
utf8_output += row0[0];
|
|
_mm_storeu_si128((__m128i *)utf8_output, utf8_1);
|
|
utf8_output += row1[0];
|
|
|
|
buf += 8;
|
|
} else {
|
|
// case: at least one 32-bit word produces a surrogate pair in UTF-16, i.e.,
// it will produce four UTF-8 bytes. Let us do a scalar fallback. It may seem
// wasteful to use scalar code, but being efficient with SIMD in the
// presence of surrogate pairs may require non-trivial tables.
|
|
size_t forward = 15;
|
|
size_t k = 0;
|
|
if (size_t(end - buf) < forward + 1) {
|
|
forward = size_t(end - buf - 1);
|
|
}
|
|
for (; k < forward; k++) {
|
|
uint32_t word = buf[k];
|
|
if ((word & 0xFFFFFF80) == 0) {
|
|
*utf8_output++ = char(word);
|
|
} else if ((word & 0xFFFFF800) == 0) {
|
|
*utf8_output++ = char((word >> 6) | 0b11000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else if ((word & 0xFFFF0000) == 0) {
|
|
if (word >= 0xD800 && word <= 0xDFFF) {
|
|
return std::make_pair(
|
|
result(error_code::SURROGATE, buf - start + k), utf8_output);
|
|
}
|
|
*utf8_output++ = char((word >> 12) | 0b11100000);
|
|
*utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else {
|
|
if (word > 0x10FFFF) {
|
|
return std::make_pair(
|
|
result(error_code::TOO_LARGE, buf - start + k), utf8_output);
|
|
}
|
|
*utf8_output++ = char((word >> 18) | 0b11110000);
|
|
*utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
} // while
|
|
return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
|
|
}
|
|
/* end file src/westmere/sse_convert_utf32_to_utf8.cpp */
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
|
|
|
|
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
|
|
/* begin file src/westmere/sse_convert_utf32_to_utf16.cpp */
|
|
struct expansion_result_t {
|
|
size_t u16count;
|
|
__m128i compressed;
|
|
};
|
|
|
|
// Function sse_expand_surrogate takes four **valid** UTF-32 characters,
// at least one of which produces a surrogate pair.
|
|
template <endianness byte_order>
|
|
expansion_result_t sse_expand_surrogate(const __m128i x) {
|
|
using vector_u32 = simd32<uint32_t>;
|
|
using vector_u8 = simd8<uint8_t>;
|
|
|
|
const auto in = vector_u32(x);
|
|
|
|
const auto non_surrogate_mask = (in & uint32_t(0xffff0000)) == uint32_t(0);
|
|
const auto mask = (~non_surrogate_mask.to_4bit_bitmask()) & 0xf;
|
|
|
|
const auto t0 = in - uint32_t(0x00010000);
|
|
const auto hi = t0.shr<10>() & uint32_t(0x000003ff);
|
|
const auto lo = t0.shl<16>() & uint32_t(0x03ff0000);
|
|
const auto surrogates = (lo | hi) | uint32_t(0xdc00d800);
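// Worked example (illustrative): for U+10437, t0 = 0x00000437, so
// hi = 0x00000001 and lo = 0x00370000; OR-ing with 0xdc00d800 produces
// 0xdc37d801, i.e. the high surrogate 0xD801 in the low 16 bits followed
// by the low surrogate 0xDC37 -- the UTF-16 sequence D801 DC37.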
|
|
|
|
const auto merged = as_vector_u8(select(non_surrogate_mask, in, surrogates));
|
|
|
|
const auto shuffle = vector_u8::load(
|
|
(byte_order == endianness::LITTLE)
|
|
? tables::utf32_to_utf16::pack_utf32_to_utf16le[mask]
|
|
: tables::utf32_to_utf16::pack_utf32_to_utf16be[mask]);
|
|
|
|
const size_t u16count = (4 + count_ones(mask));
|
|
const auto compressed = shuffle.lookup_16(merged);
|
|
|
|
return {u16count, compressed};
|
|
}
|
|
|
|
// Function `validate_utf32` checks 2 x 4 UTF-32 characters for their validity.
|
|
simdutf_really_inline bool validate_utf32(const __m128i a, const __m128i b) {
|
|
using vector_u32 = simd32<uint32_t>;
|
|
|
|
const auto in0 = vector_u32(a);
|
|
const auto in1 = vector_u32(b);
|
|
|
|
const auto standardmax = vector_u32::splat(0x10ffff);
|
|
const auto offset = vector_u32::splat(0xffff2000);
|
|
const auto standardoffsetmax = vector_u32::splat(0xfffff7ff);
|
|
|
|
const auto too_large = max(in0, in1) > standardmax;
|
|
const auto surrogate0 = (in0 + offset) > standardoffsetmax;
|
|
const auto surrogate1 = (in1 + offset) > standardoffsetmax;
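// Note (added for clarity): the offset trick maps the surrogate range onto
// the top of the unsigned 32-bit range: 0xD800 + 0xffff2000 = 0xfffff800 >
// 0xfffff7ff, while 0xE000 + 0xffff2000 wraps around to 0x00000000, so only
// words in [0xD800, 0xDFFF] compare greater than standardoffsetmax.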
|
|
|
|
const auto combined = too_large | surrogate0 | surrogate1;
|
|
return !combined.any();
|
|
}
|
|
|
|
template <endianness big_endian>
|
|
std::pair<const char32_t *, char16_t *>
|
|
sse_convert_utf32_to_utf16(const char32_t *buf, size_t len,
|
|
char16_t *utf16_output) {
|
|
|
|
const char32_t *end = buf + len;
|
|
|
|
const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000);
|
|
__m128i forbidden_bytemask = _mm_setzero_si128();
|
|
|
|
while (end - buf >= 16 + 8) {
|
|
const __m128i *ptr = reinterpret_cast<const __m128i *>(buf);
|
|
const __m128i in0 = _mm_loadu_si128(ptr + 0);
|
|
const __m128i in1 = _mm_loadu_si128(ptr + 1);
|
|
const __m128i in2 = _mm_loadu_si128(ptr + 2);
|
|
const __m128i in3 = _mm_loadu_si128(ptr + 3);
|
|
|
|
const __m128i combined =
|
|
_mm_or_si128(_mm_or_si128(in2, in3), _mm_or_si128(in0, in1));
|
|
if (simdutf_likely(_mm_testz_si128(combined, v_ffff0000))) {
|
|
// No bits set in the upper 16 bits; directly pack UTF-32 to UTF-16
|
|
__m128i utf16_packed0 = _mm_packus_epi32(in0, in1);
|
|
__m128i utf16_packed1 = _mm_packus_epi32(in2, in3);
|
|
|
|
const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
|
|
const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
|
|
forbidden_bytemask = _mm_or_si128(
|
|
forbidden_bytemask,
|
|
_mm_or_si128(
|
|
_mm_cmpeq_epi16(_mm_and_si128(utf16_packed0, v_f800), v_d800),
|
|
_mm_cmpeq_epi16(_mm_and_si128(utf16_packed1, v_f800), v_d800)));
|
|
|
|
if (big_endian) {
|
|
const __m128i swap =
|
|
_mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
utf16_packed0 = _mm_shuffle_epi8(utf16_packed0, swap);
|
|
utf16_packed1 = _mm_shuffle_epi8(utf16_packed1, swap);
|
|
}
|
|
|
|
_mm_storeu_si128((__m128i *)utf16_output + 0, utf16_packed0);
|
|
_mm_storeu_si128((__m128i *)utf16_output + 1, utf16_packed1);
|
|
utf16_output += 16;
|
|
buf += 16;
|
|
} else {
|
|
if (!validate_utf32(in0, in1) || !validate_utf32(in2, in3)) {
|
|
return std::make_pair(nullptr, utf16_output);
|
|
}
|
|
|
|
const auto ret0 = sse_expand_surrogate<big_endian>(in0);
|
|
_mm_storeu_si128((__m128i *)utf16_output, ret0.compressed);
|
|
utf16_output += ret0.u16count;
|
|
|
|
const auto ret1 = sse_expand_surrogate<big_endian>(in1);
|
|
_mm_storeu_si128((__m128i *)utf16_output, ret1.compressed);
|
|
utf16_output += ret1.u16count;
|
|
|
|
const auto ret2 = sse_expand_surrogate<big_endian>(in2);
|
|
_mm_storeu_si128((__m128i *)utf16_output, ret2.compressed);
|
|
utf16_output += ret2.u16count;
|
|
|
|
const auto ret3 = sse_expand_surrogate<big_endian>(in3);
|
|
_mm_storeu_si128((__m128i *)utf16_output, ret3.compressed);
|
|
utf16_output += ret3.u16count;
|
|
|
|
buf += 16;
|
|
}
|
|
}
|
|
|
|
// check for invalid input
|
|
if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
|
|
return std::make_pair(nullptr, utf16_output);
|
|
}
|
|
|
|
return std::make_pair(buf, utf16_output);
|
|
}
|
|
|
|
template <endianness big_endian>
|
|
std::pair<result, char16_t *>
|
|
sse_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len,
|
|
char16_t *utf16_output) {
|
|
const char32_t *start = buf;
|
|
const char32_t *end = buf + len;
|
|
|
|
const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000);
|
|
|
|
while (end - buf >= 8) {
|
|
const __m128i in = _mm_loadu_si128((__m128i *)buf);
|
|
const __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1);
|
|
|
|
const __m128i combined = _mm_or_si128(in, nextin);
|
|
if (simdutf_likely(_mm_testz_si128(combined, v_ffff0000))) {
|
|
// No bits set in the upper 16 bits; directly pack UTF-32 to UTF-16
|
|
__m128i utf16_packed = _mm_packus_epi32(in, nextin);
|
|
|
|
const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
|
|
const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
|
|
const __m128i forbidden_bytemask =
|
|
_mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800);
|
|
if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
|
|
return std::make_pair(result(error_code::SURROGATE, buf - start),
|
|
utf16_output);
|
|
}
|
|
|
|
if (big_endian) {
|
|
const __m128i swap =
|
|
_mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
|
|
}
|
|
|
|
_mm_storeu_si128((__m128i *)utf16_output, utf16_packed);
|
|
utf16_output += 8;
|
|
buf += 8;
|
|
} else {
|
|
size_t forward = 7;
|
|
size_t k = 0;
|
|
if (size_t(end - buf) < forward + 1) {
|
|
forward = size_t(end - buf - 1);
|
|
}
|
|
for (; k < forward; k++) {
|
|
uint32_t word = buf[k];
|
|
if ((word & 0xFFFF0000) == 0) {
|
|
// will not generate a surrogate pair
|
|
if (word >= 0xD800 && word <= 0xDFFF) {
|
|
return std::make_pair(
|
|
result(error_code::SURROGATE, buf - start + k), utf16_output);
|
|
}
|
|
*utf16_output++ =
|
|
big_endian
|
|
? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8))
|
|
: char16_t(word);
|
|
} else {
|
|
// will generate a surrogate pair
|
|
if (word > 0x10FFFF) {
|
|
return std::make_pair(
|
|
result(error_code::TOO_LARGE, buf - start + k), utf16_output);
|
|
}
|
|
word -= 0x10000;
|
|
uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
|
|
uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
|
|
if (big_endian) {
|
|
high_surrogate =
|
|
uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
|
|
low_surrogate =
|
|
uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
|
|
}
|
|
*utf16_output++ = char16_t(high_surrogate);
|
|
*utf16_output++ = char16_t(low_surrogate);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
}
|
|
|
|
return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
|
|
}
|
|
/* end file src/westmere/sse_convert_utf32_to_utf16.cpp */
|
|
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
|
|
|
|
#if SIMDUTF_FEATURE_BASE64
|
|
/* begin file src/westmere/sse_base64.cpp */
|
|
/**
|
|
* References and further reading:
|
|
*
|
|
* Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
|
|
* speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
|
|
* https://arxiv.org/abs/1910.05109
|
|
*
|
|
* Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
|
|
* Instructions, ACM Transactions on the Web 12 (3), 2018.
|
|
* https://arxiv.org/abs/1704.00605
|
|
*
|
|
* Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
|
|
* https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
|
|
* Request for Comments: 4648.
|
|
*
|
|
* Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
|
|
* http://www.alfredklomp.com/programming/sse-base64/. (2014).
|
|
*
|
|
* Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
|
|
* acceleration. https://github.com/aklomp/base64. (2014).
|
|
*
|
|
* Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
|
|
* https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
|
|
*
|
|
* Nick Kopp. 2013. Base64 Encoding on a GPU.
|
|
* https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
|
|
*/
|
|
|
|
// --- encoding ----------------------------------------------------
|
|
|
|
template <bool base64_url> __m128i lookup_pshufb_improved(const __m128i input) {
|
|
// credit: Wojciech Muła
|
|
// reduce 0..51 -> 0
|
|
// 52..61 -> 1 .. 10
|
|
// 62 -> 11
|
|
// 63 -> 12
|
|
__m128i result = _mm_subs_epu8(input, _mm_set1_epi8(51));
|
|
|
|
// distinguish between ranges 0..25 and 26..51:
|
|
// 0 .. 25 -> remains 0
|
|
// 26 .. 51 -> becomes 13
|
|
const __m128i less = _mm_cmpgt_epi8(_mm_set1_epi8(26), input);
|
|
result = _mm_or_si128(result, _mm_and_si128(less, _mm_set1_epi8(13)));
|
|
|
|
__m128i shift_LUT;
|
|
if (base64_url) {
|
|
shift_LUT = _mm_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
|
|
'0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
|
|
'0' - 52, '-' - 62, '_' - 63, 'A', 0, 0);
|
|
} else {
|
|
shift_LUT = _mm_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
|
|
'0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
|
|
'0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
|
|
}
|
|
|
|
// read shift
|
|
result = _mm_shuffle_epi8(shift_LUT, result);
|
|
|
|
return _mm_add_epi8(result, input);
|
|
}
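// Worked example (illustrative): for the 6-bit value 30, _mm_subs_epu8
// saturates to 0 and 30 >= 26, so the shift 'a' - 26 is selected and the
// output byte is 30 + 'a' - 26 = 'e', matching the standard alphabet.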
|
|
|
|
template <bool isbase64url>
|
|
size_t encode_base64(char *dst, const char *src, size_t srclen,
|
|
base64_options options) {
|
|
// credit: Wojciech Muła
|
|
// SSE (lookup: pshufb improved unrolled)
|
|
const uint8_t *input = (const uint8_t *)src;
|
|
|
|
uint8_t *out = (uint8_t *)dst;
|
|
const __m128i shuf =
|
|
_mm_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
|
|
|
|
size_t i = 0;
|
|
for (; i + 52 <= srclen; i += 48) {
|
|
__m128i in0 = _mm_loadu_si128(
|
|
reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 0));
|
|
__m128i in1 = _mm_loadu_si128(
|
|
reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 1));
|
|
__m128i in2 = _mm_loadu_si128(
|
|
reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 2));
|
|
__m128i in3 = _mm_loadu_si128(
|
|
reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 3));
|
|
|
|
in0 = _mm_shuffle_epi8(in0, shuf);
|
|
in1 = _mm_shuffle_epi8(in1, shuf);
|
|
in2 = _mm_shuffle_epi8(in2, shuf);
|
|
in3 = _mm_shuffle_epi8(in3, shuf);
|
|
|
|
const __m128i t0_0 = _mm_and_si128(in0, _mm_set1_epi32(0x0fc0fc00));
|
|
const __m128i t0_1 = _mm_and_si128(in1, _mm_set1_epi32(0x0fc0fc00));
|
|
const __m128i t0_2 = _mm_and_si128(in2, _mm_set1_epi32(0x0fc0fc00));
|
|
const __m128i t0_3 = _mm_and_si128(in3, _mm_set1_epi32(0x0fc0fc00));
|
|
|
|
const __m128i t1_0 = _mm_mulhi_epu16(t0_0, _mm_set1_epi32(0x04000040));
|
|
const __m128i t1_1 = _mm_mulhi_epu16(t0_1, _mm_set1_epi32(0x04000040));
|
|
const __m128i t1_2 = _mm_mulhi_epu16(t0_2, _mm_set1_epi32(0x04000040));
|
|
const __m128i t1_3 = _mm_mulhi_epu16(t0_3, _mm_set1_epi32(0x04000040));
|
|
|
|
const __m128i t2_0 = _mm_and_si128(in0, _mm_set1_epi32(0x003f03f0));
|
|
const __m128i t2_1 = _mm_and_si128(in1, _mm_set1_epi32(0x003f03f0));
|
|
const __m128i t2_2 = _mm_and_si128(in2, _mm_set1_epi32(0x003f03f0));
|
|
const __m128i t2_3 = _mm_and_si128(in3, _mm_set1_epi32(0x003f03f0));
|
|
|
|
const __m128i t3_0 = _mm_mullo_epi16(t2_0, _mm_set1_epi32(0x01000010));
|
|
const __m128i t3_1 = _mm_mullo_epi16(t2_1, _mm_set1_epi32(0x01000010));
|
|
const __m128i t3_2 = _mm_mullo_epi16(t2_2, _mm_set1_epi32(0x01000010));
|
|
const __m128i t3_3 = _mm_mullo_epi16(t2_3, _mm_set1_epi32(0x01000010));
|
|
|
|
const __m128i input0 = _mm_or_si128(t1_0, t3_0);
|
|
const __m128i input1 = _mm_or_si128(t1_1, t3_1);
|
|
const __m128i input2 = _mm_or_si128(t1_2, t3_2);
|
|
const __m128i input3 = _mm_or_si128(t1_3, t3_3);
|
|
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(out),
|
|
lookup_pshufb_improved<isbase64url>(input0));
|
|
out += 16;
|
|
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(out),
|
|
lookup_pshufb_improved<isbase64url>(input1));
|
|
out += 16;
|
|
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(out),
|
|
lookup_pshufb_improved<isbase64url>(input2));
|
|
out += 16;
|
|
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(out),
|
|
lookup_pshufb_improved<isbase64url>(input3));
|
|
out += 16;
|
|
}
|
|
for (; i + 16 <= srclen; i += 12) {
|
|
|
|
__m128i in = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i));
|
|
|
|
// bytes from groups A, B and C are needed in separate 32-bit lanes
|
|
// in = [DDDD|CCCC|BBBB|AAAA]
|
|
//
|
|
// an input triplet has layout
|
|
// [????????|ccdddddd|bbbbcccc|aaaaaabb]
|
|
// byte 3 byte 2 byte 1 byte 0 -- byte 3 comes from the next
|
|
// triplet
|
|
//
|
|
// shuffling changes the order of bytes: 1, 0, 2, 1
|
|
// [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc]
|
|
// ^^^^ ^^^^^^^^ ^^^^^^^^ ^^^^
|
|
// processed bits
|
|
in = _mm_shuffle_epi8(in, shuf);
|
|
|
|
// unpacking
|
|
|
|
// t0 = [0000cccc|cc000000|aaaaaa00|00000000]
|
|
const __m128i t0 = _mm_and_si128(in, _mm_set1_epi32(0x0fc0fc00));
|
|
// t1 = [00000000|00cccccc|00000000|00aaaaaa]
|
|
// (c * (1 << 10), a * (1 << 6)) >> 16 (note: an unsigned
|
|
// multiplication)
|
|
const __m128i t1 = _mm_mulhi_epu16(t0, _mm_set1_epi32(0x04000040));
|
|
|
|
// t2 = [00000000|00dddddd|000000bb|bbbb0000]
|
|
const __m128i t2 = _mm_and_si128(in, _mm_set1_epi32(0x003f03f0));
|
|
// t3 = [00dddddd|00000000|00bbbbbb|00000000]
//      (d * (1 << 8), b * (1 << 4))
|
|
const __m128i t3 = _mm_mullo_epi16(t2, _mm_set1_epi32(0x01000010));
|
|
|
|
// res = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] = t1 | t3
|
|
const __m128i indices = _mm_or_si128(t1, t3);
|
|
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(out),
|
|
lookup_pshufb_improved<isbase64url>(indices));
|
|
out += 16;
|
|
}
|
|
|
|
return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i,
|
|
srclen - i, options);
|
|
}
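// Usage note (illustrative): encode_base64 emits 4 output characters per 3
// input bytes; for instance, the 3-byte input "Man" encodes to "TWFu".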
|
|
|
|
// --- decoding -----------------------------------------------
|
|
|
|
static simdutf_really_inline void compress(__m128i data, uint16_t mask,
|
|
char *output) {
|
|
if (mask == 0) {
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(output), data);
|
|
return;
|
|
}
|
|
|
|
// this particular implementation was inspired by work done by @animetosho
|
|
// we do it in two steps, first 8 bytes and then second 8 bytes
|
|
uint8_t mask1 = uint8_t(mask); // least significant 8 bits
|
|
uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
|
|
// next line just loads the 64-bit values thintable_epi8[mask1] and
|
|
// thintable_epi8[mask2] into a 128-bit register, using only
|
|
// two instructions on most compilers.
|
|
|
|
__m128i shufmask = _mm_set_epi64x(tables::base64::thintable_epi8[mask2],
|
|
tables::base64::thintable_epi8[mask1]);
|
|
// we increment by 0x08 the second half of the mask
|
|
shufmask =
|
|
_mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
|
|
// this is the version "nearly pruned"
|
|
__m128i pruned = _mm_shuffle_epi8(data, shufmask);
|
|
// we still need to put the two halves together.
|
|
// we compute the popcount of the first half:
|
|
int pop1 = tables::base64::BitsSetTable256mul2[mask1];
|
|
// then load the corresponding mask, what it does is to write
|
|
// only the first pop1 bytes from the first 8 bytes, and then
|
|
// it fills in with the bytes from the second 8 bytes + some filling
|
|
// at the end.
|
|
__m128i compactmask = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
|
|
tables::base64::pshufb_combine_table + pop1 * 8));
|
|
__m128i answer = _mm_shuffle_epi8(pruned, compactmask);
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer);
|
|
}
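// Usage note (illustrative): a set bit in `mask` marks a byte to discard.
// For example, with mask == 0x0004 the byte at index 2 is dropped and the
// remaining 15 bytes are written out contiguously.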
|
|
|
|
static simdutf_really_inline void base64_decode(char *out, __m128i str) {
|
|
// credit: aqrit
|
|
|
|
const __m128i pack_shuffle =
|
|
_mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1);
|
|
|
|
const __m128i t0 = _mm_maddubs_epi16(str, _mm_set1_epi32(0x01400140));
|
|
const __m128i t1 = _mm_madd_epi16(t0, _mm_set1_epi32(0x00011000));
|
|
const __m128i t2 = _mm_shuffle_epi8(t1, pack_shuffle);
|
|
// Store the output:
|
|
// this writes 16 bytes, but we only need 12.
|
|
_mm_storeu_si128((__m128i *)out, t2);
|
|
}
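// Worked example (illustrative): decoding "TWFu" yields the 6-bit values
// 19, 22, 5, 46. _mm_maddubs_epi16 merges pairs: 19*64 + 22 = 0x04D6 and
// 5*64 + 46 = 0x016E; _mm_madd_epi16 then merges those into the 24-bit
// value 0x4D6 * 0x1000 + 0x16E = 0x4D616E, i.e. the bytes 'M', 'a', 'n'.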
|
|
|
|
// decode 64 bytes and output 48 bytes
|
|
static inline void base64_decode_block(char *out, const char *src) {
|
|
base64_decode(out, _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
|
|
base64_decode(out + 12,
|
|
_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 16)));
|
|
base64_decode(out + 24,
|
|
_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 32)));
|
|
base64_decode(out + 36,
|
|
_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 48)));
|
|
}
|
|
|
|
static inline void base64_decode_block_safe(char *out, const char *src) {
|
|
base64_decode(out, _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
|
|
base64_decode(out + 12,
|
|
_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 16)));
|
|
base64_decode(out + 24,
|
|
_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 32)));
|
|
char buffer[16];
|
|
base64_decode(buffer,
|
|
_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 48)));
|
|
std::memcpy(out + 36, buffer, 12);
|
|
}
|
|
|
|
// --- decoding - base64 class --------------------------------
|
|
|
|
class block64 {
|
|
__m128i chunks[4];
|
|
|
|
public:
|
|
// The caller of this function is responsible for ensuring that there are 64
// bytes available for reading at src.
|
|
simdutf_really_inline block64(const char *src) {
|
|
chunks[0] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
|
|
chunks[1] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 16));
|
|
chunks[2] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 32));
|
|
chunks[3] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 48));
|
|
}
|
|
|
|
public:
|
|
// The caller of this function is responsible for ensuring that there are 128
// bytes available for reading at src. The data is read into a block64
// structure.
|
|
simdutf_really_inline block64(const char16_t *src) {
|
|
const auto m1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
|
|
const auto m2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 8));
|
|
const auto m3 =
|
|
_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 16));
|
|
const auto m4 =
|
|
_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 24));
|
|
const auto m5 =
|
|
_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 32));
|
|
const auto m6 =
|
|
_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 40));
|
|
const auto m7 =
|
|
_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 48));
|
|
const auto m8 =
|
|
_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 56));
|
|
chunks[0] = _mm_packus_epi16(m1, m2);
|
|
chunks[1] = _mm_packus_epi16(m3, m4);
|
|
chunks[2] = _mm_packus_epi16(m5, m6);
|
|
chunks[3] = _mm_packus_epi16(m7, m8);
|
|
}
|
|
|
|
public:
|
|
simdutf_really_inline void copy_block(char *output) {
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(output), chunks[0]);
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(output + 16), chunks[1]);
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(output + 32), chunks[2]);
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(output + 48), chunks[3]);
|
|
}
|
|
|
|
public:
|
|
simdutf_really_inline uint64_t compress_block(uint64_t mask, char *output) {
|
|
if (is_power_of_two(mask)) {
|
|
return compress_block_single(mask, output);
|
|
}
|
|
|
|
uint64_t nmask = ~mask;
|
|
compress(chunks[0], uint16_t(mask), output);
|
|
compress(chunks[1], uint16_t(mask >> 16),
|
|
output + count_ones(nmask & 0xFFFF));
|
|
compress(chunks[2], uint16_t(mask >> 32),
|
|
output + count_ones(nmask & 0xFFFFFFFF));
|
|
compress(chunks[3], uint16_t(mask >> 48),
|
|
output + count_ones(nmask & 0xFFFFFFFFFFFFULL));
|
|
return count_ones(nmask);
|
|
}
|
|
|
|
private:
|
|
simdutf_really_inline size_t compress_block_single(uint64_t mask,
|
|
char *output) {
|
|
const size_t pos64 = trailing_zeroes(mask);
|
|
const int8_t pos = pos64 & 0xf;
|
|
switch (pos64 >> 4) {
|
|
case 0b00: {
|
|
const __m128i v0 = _mm_set1_epi8(char(pos - 1));
|
|
const __m128i v1 =
|
|
_mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
|
const __m128i v2 = _mm_cmpgt_epi8(v1, v0);
|
|
const __m128i sh = _mm_sub_epi8(v1, v2);
|
|
const __m128i compressed = _mm_shuffle_epi8(chunks[0], sh);
|
|
|
|
_mm_storeu_si128((__m128i *)(output + 0 * 16), compressed);
|
|
_mm_storeu_si128((__m128i *)(output + 1 * 16 - 1), chunks[1]);
|
|
_mm_storeu_si128((__m128i *)(output + 2 * 16 - 1), chunks[2]);
|
|
_mm_storeu_si128((__m128i *)(output + 3 * 16 - 1), chunks[3]);
|
|
} break;
|
|
case 0b01: {
|
|
_mm_storeu_si128((__m128i *)(output + 0 * 16), chunks[0]);
|
|
|
|
const __m128i v0 = _mm_set1_epi8(char(pos - 1));
|
|
const __m128i v1 =
|
|
_mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
|
const __m128i v2 = _mm_cmpgt_epi8(v1, v0);
|
|
const __m128i sh = _mm_sub_epi8(v1, v2);
|
|
const __m128i compressed = _mm_shuffle_epi8(chunks[1], sh);
|
|
|
|
_mm_storeu_si128((__m128i *)(output + 1 * 16), compressed);
|
|
_mm_storeu_si128((__m128i *)(output + 2 * 16 - 1), chunks[2]);
|
|
_mm_storeu_si128((__m128i *)(output + 3 * 16 - 1), chunks[3]);
|
|
} break;
|
|
case 0b10: {
|
|
_mm_storeu_si128((__m128i *)(output + 0 * 16), chunks[0]);
|
|
_mm_storeu_si128((__m128i *)(output + 1 * 16), chunks[1]);
|
|
|
|
const __m128i v0 = _mm_set1_epi8(char(pos - 1));
|
|
const __m128i v1 =
|
|
_mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
|
const __m128i v2 = _mm_cmpgt_epi8(v1, v0);
|
|
const __m128i sh = _mm_sub_epi8(v1, v2);
|
|
const __m128i compressed = _mm_shuffle_epi8(chunks[2], sh);
|
|
|
|
_mm_storeu_si128((__m128i *)(output + 2 * 16), compressed);
|
|
_mm_storeu_si128((__m128i *)(output + 3 * 16 - 1), chunks[3]);
|
|
} break;
|
|
case 0b11: {
|
|
_mm_storeu_si128((__m128i *)(output + 0 * 16), chunks[0]);
|
|
_mm_storeu_si128((__m128i *)(output + 1 * 16), chunks[1]);
|
|
_mm_storeu_si128((__m128i *)(output + 2 * 16), chunks[2]);
|
|
|
|
const __m128i v0 = _mm_set1_epi8(char(pos - 1));
|
|
const __m128i v1 =
|
|
_mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
|
const __m128i v2 = _mm_cmpgt_epi8(v1, v0);
|
|
const __m128i sh = _mm_sub_epi8(v1, v2);
|
|
const __m128i compressed = _mm_shuffle_epi8(chunks[3], sh);
|
|
|
|
_mm_storeu_si128((__m128i *)(output + 3 * 16), compressed);
|
|
} break;
|
|
}
|
|
|
|
return 63;
|
|
}
|
|
|
|
public:
|
|
template <bool base64_url, bool ignore_garbage>
|
|
simdutf_really_inline uint64_t to_base64_mask(uint64_t *error) {
|
|
uint32_t err0 = 0;
|
|
uint32_t err1 = 0;
|
|
uint32_t err2 = 0;
|
|
uint32_t err3 = 0;
|
|
uint64_t m0 = to_base64_mask<base64_url, ignore_garbage>(&chunks[0], &err0);
|
|
uint64_t m1 = to_base64_mask<base64_url, ignore_garbage>(&chunks[1], &err1);
|
|
uint64_t m2 = to_base64_mask<base64_url, ignore_garbage>(&chunks[2], &err2);
|
|
uint64_t m3 = to_base64_mask<base64_url, ignore_garbage>(&chunks[3], &err3);
|
|
if (!ignore_garbage) {
|
|
*error = (err0) | ((uint64_t)err1 << 16) | ((uint64_t)err2 << 32) |
|
|
((uint64_t)err3 << 48);
|
|
}
|
|
return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48);
|
|
}
|
|
|
|
private:
|
|
template <bool base64_url, bool ignore_garbage>
|
|
simdutf_really_inline uint16_t to_base64_mask(__m128i *src, uint32_t *error) {
|
|
const __m128i ascii_space_tbl =
|
|
_mm_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa,
|
|
0x0, 0xc, 0xd, 0x0, 0x0);
|
|
// credit: aqrit
|
|
__m128i delta_asso;
|
|
if (base64_url) {
|
|
delta_asso = _mm_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0,
|
|
0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF);
|
|
} else {
|
|
|
|
delta_asso =
|
|
_mm_setr_epi8(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00,
|
|
0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
|
|
}
|
|
__m128i delta_values;
|
|
if (base64_url) {
|
|
delta_values = _mm_setr_epi8(0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF),
|
|
uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9),
|
|
0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF),
|
|
uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9));
|
|
} else {
|
|
delta_values =
|
|
_mm_setr_epi8(int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
|
|
int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
|
|
int8_t(0xB9), int8_t(0x00), int8_t(0x10), int8_t(0xC3),
|
|
int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9));
|
|
}
|
|
__m128i check_asso;
|
|
if (base64_url) {
|
|
check_asso = _mm_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
|
|
0x1, 0x3, 0x7, 0xB, 0xE, 0xB, 0x6);
|
|
} else {
|
|
check_asso =
|
|
_mm_setr_epi8(0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
|
0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
|
|
}
|
|
__m128i check_values;
|
|
if (base64_url) {
|
|
check_values = _mm_setr_epi8(uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
|
|
uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF),
|
|
uint8_t(0xB6), uint8_t(0xA6), uint8_t(0xB5),
|
|
uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0,
|
|
uint8_t(0x80), 0x0, uint8_t(0x80));
|
|
} else {
|
|
check_values =
|
|
_mm_setr_epi8(int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
|
|
int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6),
|
|
int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80),
|
|
int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80));
|
|
}
|
|
const __m128i shifted = _mm_srli_epi32(*src, 3);
|
|
|
|
const __m128i delta_hash =
|
|
_mm_avg_epu8(_mm_shuffle_epi8(delta_asso, *src), shifted);
|
|
const __m128i check_hash =
|
|
_mm_avg_epu8(_mm_shuffle_epi8(check_asso, *src), shifted);
|
|
|
|
const __m128i out =
|
|
_mm_adds_epi8(_mm_shuffle_epi8(delta_values, delta_hash), *src);
|
|
const __m128i chk =
|
|
_mm_adds_epi8(_mm_shuffle_epi8(check_values, check_hash), *src);
|
|
const int mask = _mm_movemask_epi8(chk);
|
|
if (!ignore_garbage && mask) {
|
|
__m128i ascii_space =
|
|
_mm_cmpeq_epi8(_mm_shuffle_epi8(ascii_space_tbl, *src), *src);
|
|
*error = (mask ^ _mm_movemask_epi8(ascii_space));
|
|
}
|
|
*src = out;
|
|
return (uint16_t)mask;
|
|
}
|
|
|
|
public:
|
|
simdutf_really_inline void base64_decode_block(char *out) {
|
|
base64_decode(out, chunks[0]);
|
|
base64_decode(out + 12, chunks[1]);
|
|
base64_decode(out + 24, chunks[2]);
|
|
base64_decode(out + 36, chunks[3]);
|
|
}
|
|
|
|
public:
|
|
simdutf_really_inline void base64_decode_block_safe(char *out) {
|
|
base64_decode(out, chunks[0]);
|
|
base64_decode(out + 12, chunks[1]);
|
|
base64_decode(out + 24, chunks[2]);
|
|
char buffer[16];
|
|
base64_decode(buffer, chunks[3]);
|
|
std::memcpy(out + 36, buffer, 12);
|
|
}
|
|
};
|
|
/* end file src/westmere/sse_base64.cpp */
|
|
#endif // SIMDUTF_FEATURE_BASE64
|
|
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdutf
|
|
|
|
/* begin file src/generic/buf_block_reader.h */
|
|
namespace simdutf {
|
|
namespace westmere {
|
|
namespace {
|
|
|
|
// Walks through a buffer in block-sized increments, loading the last part with
|
|
// spaces
|
|
template <size_t STEP_SIZE> struct buf_block_reader {
|
|
public:
|
|
simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
|
|
simdutf_really_inline size_t block_index();
|
|
simdutf_really_inline bool has_full_block() const;
|
|
simdutf_really_inline const uint8_t *full_block() const;
|
|
/**
|
|
* Get the last block, padded with spaces.
|
|
*
|
|
* There will always be a last block, with at least 1 byte, unless len == 0
* (in which case this function fills the buffer with spaces and returns 0). In
* particular, if len == STEP_SIZE there will be 0 full_blocks and 1 remainder
* block with STEP_SIZE bytes and no spaces for padding.
|
|
*
|
|
* @return the number of effective characters in the last block.
|
|
*/
|
|
simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
|
|
simdutf_really_inline void advance();
|
|
|
|
private:
|
|
const uint8_t *buf;
|
|
const size_t len;
|
|
const size_t lenminusstep;
|
|
size_t idx;
|
|
};
|
|
|
|
// Routines to print masks and text for debugging bitmask operations
|
|
simdutf_unused static char *format_input_text_64(const uint8_t *text) {
|
|
static char *buf =
|
|
reinterpret_cast<char *>(malloc(sizeof(simd8x64<uint8_t>) + 1));
|
|
for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
|
|
buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
|
|
}
|
|
buf[sizeof(simd8x64<uint8_t>)] = '\0';
|
|
return buf;
|
|
}
|
|
|
|
// Routines to print masks and text for debugging bitmask operations
|
|
simdutf_unused static char *format_input_text(const simd8x64<uint8_t> &in) {
|
|
static char *buf =
|
|
reinterpret_cast<char *>(malloc(sizeof(simd8x64<uint8_t>) + 1));
|
|
in.store(reinterpret_cast<uint8_t *>(buf));
|
|
for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
|
|
if (buf[i] < ' ') {
|
|
buf[i] = '_';
|
|
}
|
|
}
|
|
buf[sizeof(simd8x64<uint8_t>)] = '\0';
|
|
return buf;
|
|
}
|
|
|
|
simdutf_unused static char *format_mask(uint64_t mask) {
|
|
static char *buf = reinterpret_cast<char *>(malloc(64 + 1));
|
|
for (size_t i = 0; i < 64; i++) {
|
|
buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
|
|
}
|
|
buf[64] = '\0';
|
|
return buf;
|
|
}
|
|
|
|
template <size_t STEP_SIZE>
|
|
simdutf_really_inline
|
|
buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len)
|
|
: buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE},
|
|
idx{0} {}
|
|
|
|
template <size_t STEP_SIZE>
|
|
simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() {
|
|
return idx;
|
|
}
|
|
|
|
template <size_t STEP_SIZE>
|
|
simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
|
|
return idx < lenminusstep;
|
|
}
|
|
|
|
template <size_t STEP_SIZE>
|
|
simdutf_really_inline const uint8_t *
|
|
buf_block_reader<STEP_SIZE>::full_block() const {
|
|
return &buf[idx];
|
|
}
|
|
|
|
template <size_t STEP_SIZE>
|
|
simdutf_really_inline size_t
|
|
buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
|
|
if (len == idx) {
|
|
return 0;
|
|
} // memcpy(dst, null, 0) will trigger an error with some sanitizers
|
|
std::memset(dst, 0x20,
|
|
STEP_SIZE); // std::memset STEP_SIZE because it is more efficient
|
|
// to write out 8 or 16 bytes at once.
|
|
std::memcpy(dst, buf + idx, len - idx);
|
|
return len - idx;
|
|
}
|
|
|
|
template <size_t STEP_SIZE>
|
|
simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
|
|
idx += STEP_SIZE;
|
|
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdutf
|
|
/* end file src/generic/buf_block_reader.h */
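
// Usage sketch for buf_block_reader (this mirrors how the validators below
// drive the reader; `process` stands in for any 64-byte kernel):
//
//   buf_block_reader<64> reader(data, len);
//   while (reader.has_full_block()) {
//     process(reader.full_block()); // 64 fresh input bytes
//     reader.advance();
//   }
//   uint8_t block[64]{};
//   reader.get_remainder(block); // e.g. len == 100: returns 36, the tail
//   process(block);              // is padded to 64 bytes with spaces (0x20)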
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
namespace simdutf {
namespace westmere {
namespace {
namespace utf8_validation {

using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              CARRY | TOO_LARGE,
              // ____0101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____011_ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,

              // ____1___ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____1101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
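// Worked example of the three-table classification above, using the bit
// constants just defined: for the byte pair 0xED 0xA0 (which would encode a
// UTF-16 surrogate) we get
//   byte_1_high[0xE] = TOO_SHORT | OVERLONG_3 | SURROGATE
//   byte_1_low[0xD]  = CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE
//   byte_2_high[0xA] = TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE
// and the AND of the three values is exactly SURROGATE, so the pair is
// flagged. For ASCII followed by ASCII, the three lookups share no common
// bit and the result is zero.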
simdutf_really_inline simd8<uint8_t>
check_multibyte_lengths(const simd8<uint8_t> input,
                        const simd8<uint8_t> prev_input,
                        const simd8<uint8_t> sc) {
  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
  simd8<uint8_t> must23 =
      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
  return must23_80 ^ sc;
}

//
// Return nonzero if there are incomplete multibyte characters at the end of the
// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end.
//
simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
  // If the previous input's last 3 bytes match this, they're too short (they
  // ended at EOF):
  // ... 1111____ 111_____ 11______
  static const uint8_t max_array[32] = {
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 0b11110000u - 1,
      0b11100000u - 1, 0b11000000u - 1};
  const simd8<uint8_t> max_value(
      &max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]);
  return input.gt_bits(max_value);
}
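// Worked example for is_incomplete above: only the last three lanes get a
// threshold below 255. A block ending in ... 0xF0 0x9F 0x92 (the first three
// bytes of a 4-byte sequence) has 0xF0 in the third-to-last lane, and
// 0xF0 > 0b11110000 - 1, so gt_bits flags it: a 4-byte lead is incomplete
// unless at least three more bytes follow. Likewise, 3-byte leads (0xE0 and
// up) are flagged in the last two lanes, and any lead byte (0xC0 and up) in
// the last lane.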

struct utf8_checker {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;
  // The last input we received
  simd8<uint8_t> prev_input_block;
  // Whether the last input we received was incomplete (used for ASCII fast
  // path)
  simd8<uint8_t> prev_incomplete;

  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    simd8<uint8_t> sc = check_special_cases(input, prev1);
    this->error |= check_multibyte_lengths(input, prev_input, sc);
  }

  // The only problem that can happen at EOF is that a multibyte character is
  // too short or a byte value too large in the last bytes: check_special_cases
  // only checks for bytes too large in the first of two bytes.
  simdutf_really_inline void check_eof() {
    // If the previous block had incomplete UTF-8 characters at the end, an
    // ASCII block can't possibly finish them.
    this->error |= this->prev_incomplete;
  }

  simdutf_really_inline void check_next_input(const simd8x64<uint8_t> &input) {
    if (simdutf_likely(is_ascii(input))) {
      this->error |= this->prev_incomplete;
    } else {
      // you might think that a for-loop would work, but under Visual Studio, it
      // is not good enough.
      static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                        (simd8x64<uint8_t>::NUM_CHUNKS == 4),
                    "We support either two or four chunks per 64-byte block.");
      if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
      } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
        this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
      }
      this->prev_incomplete =
          is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]);
      this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1];
    }
  }

  // do not forget to call check_eof!
  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct utf8_checker
} // namespace utf8_validation

using utf8_validation::utf8_checker;

} // unnamed namespace
} // namespace westmere
} // namespace simdutf
/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
/* begin file src/generic/utf8_validation/utf8_validator.h */
namespace simdutf {
namespace westmere {
namespace {
namespace utf8_validation {

/**
 * Validates that the string is actual UTF-8.
 */
template <class checker>
bool generic_validate_utf8(const uint8_t *input, size_t length) {
  checker c{};
  buf_block_reader<64> reader(input, length);
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    c.check_next_input(in);
    reader.advance();
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  c.check_next_input(in);
  reader.advance();
  c.check_eof();
  return !c.errors();
}

bool generic_validate_utf8(const char *input, size_t length) {
  return generic_validate_utf8<utf8_checker>(
      reinterpret_cast<const uint8_t *>(input), length);
}

/**
 * Validates that the string is actual UTF-8 and stops on errors.
 */
template <class checker>
result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) {
  checker c{};
  buf_block_reader<64> reader(input, length);
  size_t count{0};
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    c.check_next_input(in);
    if (c.errors()) {
      if (count != 0) {
        count--;
      } // Sometimes the error is only detected in the next chunk
      result res = scalar::utf8::rewind_and_validate_with_errors(
          reinterpret_cast<const char *>(input),
          reinterpret_cast<const char *>(input + count), length - count);
      res.count += count;
      return res;
    }
    reader.advance();
    count += 64;
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  c.check_next_input(in);
  reader.advance();
  c.check_eof();
  if (c.errors()) {
    if (count != 0) {
      count--;
    } // Sometimes the error is only detected in the next chunk
    result res = scalar::utf8::rewind_and_validate_with_errors(
        reinterpret_cast<const char *>(input),
        reinterpret_cast<const char *>(input) + count, length - count);
    res.count += count;
    return res;
  } else {
    return result(error_code::SUCCESS, length);
  }
}

result generic_validate_utf8_with_errors(const char *input, size_t length) {
  return generic_validate_utf8_with_errors<utf8_checker>(
      reinterpret_cast<const uint8_t *>(input), length);
}

} // namespace utf8_validation
} // unnamed namespace
} // namespace westmere
} // namespace simdutf
/* end file src/generic/utf8_validation/utf8_validator.h */
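
// Usage sketch via the public entry points that dispatch to these kernels
// (declared in simdutf.h; the buffer contents are illustrative):
//
//   const char *text = "valid \xC3\xA9 UTF-8";
//   bool ok = simdutf::validate_utf8(text, strlen(text));
//   simdutf::result r = simdutf::validate_utf8_with_errors(text, strlen(text));
//   // on success r.error == simdutf::error_code::SUCCESS and
//   // r.count == strlen(text); on failure r.count is the error position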
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
#if SIMDUTF_FEATURE_ASCII
/* begin file src/generic/ascii_validation.h */
namespace simdutf {
namespace westmere {
namespace {
namespace ascii_validation {

bool generic_validate_ascii(const char *input, size_t length) {
  buf_block_reader<64> reader(reinterpret_cast<const uint8_t *>(input), length);
  uint8_t blocks[64]{};
  simd::simd8x64<uint8_t> running_or(blocks);
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    running_or |= in;
    reader.advance();
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  running_or |= in;
  return running_or.is_ascii();
}

result generic_validate_ascii_with_errors(const char *input, size_t length) {
  buf_block_reader<64> reader(reinterpret_cast<const uint8_t *>(input), length);
  size_t count{0};
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    if (!in.is_ascii()) {
      result res = scalar::ascii::validate_with_errors(
          reinterpret_cast<const char *>(input + count), length - count);
      return result(res.error, count + res.count);
    }
    reader.advance();

    count += 64;
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  if (!in.is_ascii()) {
    result res = scalar::ascii::validate_with_errors(
        reinterpret_cast<const char *>(input + count), length - count);
    return result(res.error, count + res.count);
  } else {
    return result(error_code::SUCCESS, length);
  }
}

} // namespace ascii_validation
} // unnamed namespace
} // namespace westmere
} // namespace simdutf
/* end file src/generic/ascii_validation.h */
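
// Note on the fast path above: ASCII validation needs no per-byte state, so
// the kernel just ORs all blocks together and checks the top bits once at
// the end. A scalar sketch of the same idea (illustrative only):
//
//   uint8_t acc = 0;
//   for (size_t i = 0; i < length; i++) { acc |= uint8_t(input[i]); }
//   bool is_ascii = (acc & 0x80) == 0;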
#endif // SIMDUTF_FEATURE_ASCII

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
// transcoding from UTF-8 to UTF-16
/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
namespace simdutf {
namespace westmere {
namespace {
namespace utf8_to_utf16 {

using namespace simd;

template <endianness endian>
simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
                                         char16_t *utf16_output) noexcept {
  // The implementation is not specific to haswell and should be moved to the
  // generic directory.
  size_t pos = 0;
  char16_t *start{utf16_output};
  const size_t safety_margin = 16; // to avoid overruns!
  while (pos + 64 + safety_margin <= size) {
    // this loop could be unrolled further. For example, we could process a
    // mask covering far more than 64 bytes.
    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
    if (in.is_ascii()) {
      in.store_ascii_as_utf16<endian>(utf16_output);
      utf16_output += 64;
      pos += 64;
    } else {
      // Slow path. We hope that the compiler will recognize that this is a slow
      // path. Anything that is not a continuation byte is a 'leading byte',
      // that is, the start of a new code point.
      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
      // -65 is 0b10111111 in two's complement, so it is the largest possible
      // continuation byte
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      // The *start* of code points is not so useful, rather, we want the *end*
      // of code points.
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
      // We process in blocks of up to 12 bytes except possibly
      // for fast paths which may process up to 16 bytes. For the
      // slow path to work, we should have at least 12 input bytes left.
      size_t max_starting_point = (pos + 64) - 12;
      // Next loop is going to run at least five times when using solely
      // the slow/regular path, and at least four times if there are fast paths.
      while (pos < max_starting_point) {
        // Performance note: our ability to compute 'consumed' and
        // then shift and recompute is critical. If there is a
        // latency of, say, 4 cycles on getting 'consumed', then
        // the inner loop might have a total latency of about 6 cycles.
        // Yet we process between 6 and 12 input bytes, thus we get
        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
        // for this section of the code. Hence, there is a limit
        // to how much we can further increase this latency before
        // it seriously harms performance.
        //
        // Thus we may allow convert_masked_utf8_to_utf16 to process
        // more bytes at a time under a fast-path mode where 16 bytes
        // are consumed at once (e.g., when encountering ASCII).
        size_t consumed = convert_masked_utf8_to_utf16<endian>(
            input + pos, utf8_end_of_code_point_mask, utf16_output);
        pos += consumed;
        utf8_end_of_code_point_mask >>= consumed;
      }
      // At this point there may remain between 0 and 12 bytes in the
      // 64-byte block. These bytes will be processed again. So we have an
      // 80% efficiency (in the worst case). In practice we expect an
      // 85% to 90% efficiency.
    }
  }
  utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(
      input + pos, size - pos, utf16_output);
  return utf16_output - start;
}

} // namespace utf8_to_utf16
} // unnamed namespace
} // namespace westmere
} // namespace simdutf
/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
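
// Worked example of the end-of-code-point mask used above (showing only the
// low bits of the 64-bit masks; bit i corresponds to byte i): for the bytes
// 0x61 0xC3 0xA9 ("a" followed by a two-byte é),
//   utf8_continuation_mask = 0b100  (only 0xA9 is a continuation byte)
//   utf8_leading_mask      = 0b011
//   utf8_end_of_code_point_mask = utf8_leading_mask >> 1 = 0b001, plus the
//   bits shifted down from later leading bytes.
// Each set bit marks the byte on which a code point ends, which is the
// shape convert_masked_utf8_to_utf16 expects.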
/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
namespace simdutf {
namespace westmere {
namespace {
namespace utf8_to_utf16 {
using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              CARRY | TOO_LARGE,
              // ____0101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____011_ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,

              // ____1___ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____1101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
simdutf_really_inline simd8<uint8_t>
check_multibyte_lengths(const simd8<uint8_t> input,
                        const simd8<uint8_t> prev_input,
                        const simd8<uint8_t> sc) {
  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
  simd8<uint8_t> must23 =
      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
  return must23_80 ^ sc;
}

struct validating_transcoder {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;

  validating_transcoder() : error(uint8_t(0)) {}
  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    simd8<uint8_t> sc = check_special_cases(input, prev1);
    this->error |= check_multibyte_lengths(input, prev_input, sc);
  }

  template <endianness endian>
  simdutf_really_inline size_t convert(const char *in, size_t size,
                                       char16_t *utf16_output) {
    size_t pos = 0;
    char16_t *start{utf16_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then margin - 1 is the index of the
    // eighth-to-last leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf16<endian>(utf16_output);
        utf16_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (utf8_continuation_mask & 1) {
          return 0; // error
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 and 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf16<endian>(
              in + pos, utf8_end_of_code_point_mask, utf16_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      return 0;
    }
    if (pos < size) {
      size_t howmany = scalar::utf8_to_utf16::convert<endian>(
          in + pos, size - pos, utf16_output);
      if (howmany == 0) {
        return 0;
      }
      utf16_output += howmany;
    }
    return utf16_output - start;
  }

  template <endianness endian>
  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
                                                   char16_t *utf16_output) {
    size_t pos = 0;
    char16_t *start{utf16_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then margin - 1 is the index of the
    // eighth-to-last leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf16<endian>(utf16_output);
        utf16_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (errors() || (utf8_continuation_mask & 1)) {
          // rewind_and_convert_with_errors will seek a potential error from
          // in+pos onward, with the ability to go back up to pos bytes, and
          // read size-pos bytes forward.
          result res =
              scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
                  pos, in + pos, size - pos, utf16_output);
          res.count += pos;
          return res;
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 and 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf16<endian>(
              in + pos, utf8_end_of_code_point_mask, utf16_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      // rewind_and_convert_with_errors will seek a potential error from in+pos
      // onward, with the ability to go back up to pos bytes, and read size-pos
      // bytes forward.
      result res =
          scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
              pos, in + pos, size - pos, utf16_output);
      res.count += pos;
      return res;
    }
    if (pos < size) {
      // rewind_and_convert_with_errors will seek a potential error from in+pos
      // onward, with the ability to go back up to pos bytes, and read size-pos
      // bytes forward.
      result res =
          scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
              pos, in + pos, size - pos, utf16_output);
      if (res.error) { // In case of error, we want the error position
        res.count += pos;
        return res;
      } else { // In case of success, we want the number of words written
        utf16_output += res.count;
      }
    }
    return result(error_code::SUCCESS, utf16_output - start);
  }

  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct validating_transcoder
} // namespace utf8_to_utf16
} // unnamed namespace
} // namespace westmere
} // namespace simdutf
/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
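
// Usage sketch via the public API that dispatches to this transcoder
// (declared in simdutf.h; the output buffer is sized with the library's own
// helper):
//
//   size_t len16 = simdutf::utf16_length_from_utf8(text, len);
//   std::vector<char16_t> out(len16);
//   size_t written = simdutf::convert_utf8_to_utf16le(text, len, out.data());
//   // written == 0 signals invalid UTF-8, mirroring convert() above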
/* begin file src/generic/utf8/utf16_length_from_utf8_bytemask.h */
namespace simdutf {
namespace westmere {
namespace {
namespace utf8 {

using namespace simd;

simdutf_really_inline size_t utf16_length_from_utf8_bytemask(const char *in,
                                                             size_t size) {
  using vector_i8 = simd8<int8_t>;
  using vector_u8 = simd8<uint8_t>;
  using vector_u64 = simd64<uint64_t>;

  constexpr size_t N = vector_i8::SIZE;

  constexpr size_t max_iterations = 255 / 2;

  auto counters = vector_u64::zero();

  auto local = vector_u8::zero();

  size_t iterations = 0;
  size_t pos = 0;
  size_t count = 0;
  for (; pos + N <= size; pos += N) {
    const auto input =
        vector_i8::load(reinterpret_cast<const int8_t *>(in + pos));

    const auto continuation = input > int8_t(-65);

    const auto utf_4bytes = vector_u8(input.value) >= uint8_t(240);

    local -= vector_u8(continuation);

    local -= vector_u8(utf_4bytes);

    iterations += 1;
    if (iterations == max_iterations) {
      counters += sum_8bytes(local);
      local = vector_u8::zero();
      iterations = 0;
    }
  }

  if (iterations > 0) {
    count += local.sum_bytes();
  }

  count += counters.sum();

  return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
}

} // namespace utf8
} // unnamed namespace
} // namespace westmere
} // namespace simdutf
/* end file src/generic/utf8/utf16_length_from_utf8_bytemask.h */
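
// Worked example for the count above: the UTF-16 length equals the number of
// non-continuation bytes (one code unit per code point) plus one extra unit
// per 4-byte sequence (which becomes a surrogate pair). For "a€𝄞", which is
// 1 + 3 + 4 UTF-8 bytes:
//   non-continuation bytes: 3 ('a', 0xE2, 0xF0)
//   bytes >= 240 (4-byte leads): 1 (0xF0)
//   UTF-16 length = 3 + 1 = 4 code units.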
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
namespace simdutf {
namespace westmere {
namespace {
namespace utf8_to_utf32 {

using namespace simd;

simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
                                         char32_t *utf32_output) noexcept {
  size_t pos = 0;
  char32_t *start{utf32_output};
  const size_t safety_margin = 16; // to avoid overruns!
  while (pos + 64 + safety_margin <= size) {
    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
    if (in.is_ascii()) {
      in.store_ascii_as_utf32(utf32_output);
      utf32_output += 64;
      pos += 64;
    } else {
      // -65 is 0b10111111 in two's complement, so it is the largest possible
      // continuation byte
      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
      size_t max_starting_point = (pos + 64) - 12;
      while (pos < max_starting_point) {
        size_t consumed = convert_masked_utf8_to_utf32(
            input + pos, utf8_end_of_code_point_mask, utf32_output);
        pos += consumed;
        utf8_end_of_code_point_mask >>= consumed;
      }
    }
  }
  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos,
                                                       utf32_output);
  return utf32_output - start;
}

} // namespace utf8_to_utf32
} // unnamed namespace
} // namespace westmere
} // namespace simdutf
/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
namespace simdutf {
namespace westmere {
namespace {
namespace utf8_to_utf32 {
using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              CARRY | TOO_LARGE,
              // ____0101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____011_ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,

              // ____1___ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____1101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
simdutf_really_inline simd8<uint8_t>
check_multibyte_lengths(const simd8<uint8_t> input,
                        const simd8<uint8_t> prev_input,
                        const simd8<uint8_t> sc) {
  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
  simd8<uint8_t> must23 =
      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
  return must23_80 ^ sc;
}

struct validating_transcoder {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;

  validating_transcoder() : error(uint8_t(0)) {}
  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    simd8<uint8_t> sc = check_special_cases(input, prev1);
    this->error |= check_multibyte_lengths(input, prev_input, sc);
  }

  simdutf_really_inline size_t convert(const char *in, size_t size,
                                       char32_t *utf32_output) {
    size_t pos = 0;
    char32_t *start{utf32_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then margin - 1 is the index of the
    // eighth-to-last leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf32(utf32_output);
        utf32_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (utf8_continuation_mask & 1) {
          return 0; // we have an error
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 and 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf32(
              in + pos, utf8_end_of_code_point_mask, utf32_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      return 0;
    }
    if (pos < size) {
      size_t howmany =
          scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
      if (howmany == 0) {
        return 0;
      }
      utf32_output += howmany;
    }
    return utf32_output - start;
  }

  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
                                                   char32_t *utf32_output) {
    size_t pos = 0;
    char32_t *start{utf32_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then margin - 1 is the index of the
    // eighth-to-last leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf32(utf32_output);
        utf32_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (errors() || (utf8_continuation_mask & 1)) {
          result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
              pos, in + pos, size - pos, utf32_output);
          res.count += pos;
          return res;
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 and 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf32(
              in + pos, utf8_end_of_code_point_mask, utf32_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, utf32_output);
      res.count += pos;
      return res;
    }
    if (pos < size) {
      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, utf32_output);
      if (res.error) { // In case of error, we want the error position
        res.count += pos;
        return res;
      } else { // In case of success, we want the number of words written
        utf32_output += res.count;
      }
    }
    return result(error_code::SUCCESS, utf32_output - start);
  }

  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct validating_transcoder
} // namespace utf8_to_utf32
} // unnamed namespace
} // namespace westmere
} // namespace simdutf
/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
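
// Usage sketch via the public API backed by this transcoder (declared in
// simdutf.h):
//
//   size_t len32 = simdutf::utf32_length_from_utf8(text, len);
//   std::vector<char32_t> out(len32);
//   size_t written = simdutf::convert_utf8_to_utf32(text, len, out.data());
//   // written == 0 signals invalid UTF-8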
/* begin file src/generic/utf32.h */
#include <limits>

namespace simdutf {
namespace westmere {
namespace {
namespace utf32 {

template <typename T> T min(T a, T b) { return a <= b ? a : b; }

simdutf_really_inline size_t utf8_length_from_utf32(const char32_t *input,
                                                    size_t length) {
  using vector_u32 = simd32<uint32_t>;

  const char32_t *start = input;

  // we add up to three ones in a single iteration (see the vectorized loop in
  // section #2 below)
  const size_t max_increment = 3;

  const size_t N = vector_u32::ELEMENTS;

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
  const auto v_0000007f = vector_u32::splat(0x0000007f);
  const auto v_000007ff = vector_u32::splat(0x000007ff);
  const auto v_0000ffff = vector_u32::splat(0x0000ffff);
#else
  const auto v_ffffff80 = vector_u32::splat(0xffffff80);
  const auto v_fffff800 = vector_u32::splat(0xfffff800);
  const auto v_ffff0000 = vector_u32::splat(0xffff0000);
  const auto one = vector_u32::splat(1);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

  size_t counter = 0;

  // 1. vectorized loop unrolled 4 times
  {
    // we use vector of uint32 counters, this is why this limit is used
    const size_t max_iterations =
        std::numeric_limits<uint32_t>::max() / (max_increment * 4);
    size_t blocks = length / (N * 4);
    length -= blocks * (N * 4);
    while (blocks != 0) {
      const size_t iterations = min(blocks, max_iterations);
      blocks -= iterations;

      simd32<uint32_t> acc = vector_u32::zero();
      for (size_t i = 0; i < iterations; i++) {
        const auto in0 = vector_u32(input + 0 * N);
        const auto in1 = vector_u32(input + 1 * N);
        const auto in2 = vector_u32(input + 2 * N);
        const auto in3 = vector_u32(input + 3 * N);

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
        acc -= as_vector_u32(in0 > v_0000007f);
        acc -= as_vector_u32(in1 > v_0000007f);
        acc -= as_vector_u32(in2 > v_0000007f);
        acc -= as_vector_u32(in3 > v_0000007f);

        acc -= as_vector_u32(in0 > v_000007ff);
        acc -= as_vector_u32(in1 > v_000007ff);
        acc -= as_vector_u32(in2 > v_000007ff);
        acc -= as_vector_u32(in3 > v_000007ff);

        acc -= as_vector_u32(in0 > v_0000ffff);
        acc -= as_vector_u32(in1 > v_0000ffff);
        acc -= as_vector_u32(in2 > v_0000ffff);
        acc -= as_vector_u32(in3 > v_0000ffff);
#else
        acc += min(one, in0 & v_ffffff80);
        acc += min(one, in1 & v_ffffff80);
        acc += min(one, in2 & v_ffffff80);
        acc += min(one, in3 & v_ffffff80);

        acc += min(one, in0 & v_fffff800);
        acc += min(one, in1 & v_fffff800);
        acc += min(one, in2 & v_fffff800);
        acc += min(one, in3 & v_fffff800);

        acc += min(one, in0 & v_ffff0000);
        acc += min(one, in1 & v_ffff0000);
        acc += min(one, in2 & v_ffff0000);
        acc += min(one, in3 & v_ffff0000);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

        input += 4 * N;
      }

      counter += acc.sum();
    }
  }

  // 2. vectorized loop for tail
  {
    const size_t max_iterations =
        std::numeric_limits<uint32_t>::max() / max_increment;
    size_t blocks = length / N;
    length -= blocks * N;
    while (blocks != 0) {
      const size_t iterations = min(blocks, max_iterations);
      blocks -= iterations;

      auto acc = vector_u32::zero();
      for (size_t i = 0; i < iterations; i++) {
        const auto in = vector_u32(input);

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
        acc -= as_vector_u32(in > v_0000007f);
        acc -= as_vector_u32(in > v_000007ff);
        acc -= as_vector_u32(in > v_0000ffff);
#else
        acc += min(one, in & v_ffffff80);
        acc += min(one, in & v_fffff800);
        acc += min(one, in & v_ffff0000);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

        input += N;
      }

      counter += acc.sum();
    }
  }

  const size_t consumed = input - start;
  if (consumed != 0) {
    // The vectorized loops above do not count the first byte of each code
    // point, so we add one byte per consumed char32_t here.
    counter += consumed;
  }

  return counter + scalar::utf32::utf8_length_from_utf32(input, length);
}

} // namespace utf32
} // unnamed namespace
} // namespace westmere
} // namespace simdutf
/* end file src/generic/utf32.h */
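
// Worked example for the thresholds above: each char32_t contributes one
// UTF-8 byte by default (the `consumed` term), plus one byte for each
// threshold it exceeds. For U+20AC (the euro sign):
//   0x20AC > 0x7f    -> +1
//   0x20AC > 0x7ff   -> +1
//   0x20AC <= 0xffff -> +0
// for a total of 1 + 2 = 3 UTF-8 bytes, matching the encoding E2 82 AC.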
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8
/* begin file src/generic/utf8.h */
namespace simdutf {
namespace westmere {
namespace {
namespace utf8 {

using namespace simd;

simdutf_really_inline size_t count_code_points(const char *in, size_t size) {
  size_t pos = 0;
  size_t count = 0;
  for (; pos + 64 <= size; pos += 64) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    uint64_t utf8_continuation_mask = input.gt(-65);
    count += count_ones(utf8_continuation_mask);
  }
  return count + scalar::utf8::count_code_points(in + pos, size - pos);
}

#ifdef SIMDUTF_SIMD_HAS_BYTEMASK
simdutf_really_inline size_t count_code_points_bytemask(const char *in,
                                                        size_t size) {
  using vector_i8 = simd8<int8_t>;
  using vector_u8 = simd8<uint8_t>;
  using vector_u64 = simd64<uint64_t>;

  constexpr size_t N = vector_i8::SIZE;
  constexpr size_t max_iterations = 255 / 4;

  size_t pos = 0;
  size_t count = 0;

  auto counters = vector_u64::zero();
  auto local = vector_u8::zero();
  size_t iterations = 0;
  for (; pos + 4 * N <= size; pos += 4 * N) {
    const auto input0 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 0 * N));
    const auto input1 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 1 * N));
    const auto input2 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 2 * N));
    const auto input3 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 3 * N));
    const auto mask0 = input0 > int8_t(-65);
    const auto mask1 = input1 > int8_t(-65);
    const auto mask2 = input2 > int8_t(-65);
    const auto mask3 = input3 > int8_t(-65);

    local -= vector_u8(mask0);
    local -= vector_u8(mask1);
    local -= vector_u8(mask2);
    local -= vector_u8(mask3);

    iterations += 1;
    if (iterations == max_iterations) {
      counters += sum_8bytes(local);
      local = vector_u8::zero();
      iterations = 0;
    }
  }

  if (iterations > 0) {
    count += local.sum_bytes();
  }

  count += counters.sum();

  return count + scalar::utf8::count_code_points(in + pos, size - pos);
}
#endif // SIMDUTF_SIMD_HAS_BYTEMASK

simdutf_really_inline size_t utf16_length_from_utf8(const char *in,
                                                    size_t size) {
  size_t pos = 0;
  size_t count = 0;
  // This algorithm could no doubt be improved!
  for (; pos + 64 <= size; pos += 64) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    uint64_t utf8_continuation_mask = input.lt(-65 + 1);
    // We count one word for anything that is not a continuation (so
    // leading bytes).
    count += 64 - count_ones(utf8_continuation_mask);
    int64_t utf8_4byte = input.gteq_unsigned(240);
    count += count_ones(utf8_4byte);
  }
  return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
}

} // namespace utf8
} // unnamed namespace
} // namespace westmere
} // namespace simdutf
/* end file src/generic/utf8.h */
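
// Usage sketch via the public API (declared in simdutf.h):
//
//   // number of Unicode code points in a valid UTF-8 buffer
//   size_t points = simdutf::count_utf8(text, len);
//   // number of UTF-16 code units needed to transcode it
//   size_t units = simdutf::utf16_length_from_utf8(text, len);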
|
|
#endif // SIMDUTF_FEATURE_UTF8
|
|
#if SIMDUTF_FEATURE_UTF16
|
|
/* begin file src/generic/utf16.h */
|
|
namespace simdutf {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace utf16 {
|
|
|
|
template <endianness big_endian>
|
|
simdutf_really_inline size_t count_code_points(const char16_t *in,
|
|
size_t size) {
|
|
size_t pos = 0;
|
|
size_t count = 0;
|
|
for (; pos < size / 32 * 32; pos += 32) {
|
|
simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
|
|
if (!match_system(big_endian)) {
|
|
input.swap_bytes();
|
|
}
|
|
uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
|
|
count += count_ones(not_pair) / 2;
|
|
}
|
|
return count +
|
|
scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
|
|
}
|
|
|
|
template <endianness big_endian>
|
|
simdutf_really_inline size_t utf8_length_from_utf16(const char16_t *in,
|
|
size_t size) {
|
|
size_t pos = 0;
|
|
size_t count = 0;
|
|
// This algorithm could no doubt be improved!
|
|
for (; pos < size / 32 * 32; pos += 32) {
|
|
simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
|
|
if (!match_system(big_endian)) {
|
|
input.swap_bytes();
|
|
}
|
|
uint64_t ascii_mask = input.lteq(0x7F);
|
|
uint64_t twobyte_mask = input.lteq(0x7FF);
|
|
uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
|
|
|
|
size_t ascii_count = count_ones(ascii_mask) / 2;
|
|
size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2;
|
|
size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2;
|
|
size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
|
|
count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count +
|
|
ascii_count;
|
|
}
|
|
return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos,
|
|
size - pos);
|
|
}
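// For exposition only: per 16-bit code unit, utf8_length_from_utf16 charges
// 1 byte for ASCII, 2 for U+0080..U+07FF, 3 for other non-surrogate units,
// and 2 per surrogate unit, so a full surrogate pair totals the expected 4
// UTF-8 bytes. A minimal scalar sketch of the same rule (native-endian `w`):
//
//   size_t bytes = 0;
//   for (size_t i = 0; i < size; i++) {
//     const uint16_t w = uint16_t(in[i]);
//     if (w <= 0x7F)                     bytes += 1; // ASCII
//     else if (w <= 0x7FF)               bytes += 2; // two-byte scalar
//     else if (w < 0xD800 || w > 0xDFFF) bytes += 3; // three-byte scalar
//     else                               bytes += 2; // half of a pair
//   }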

template <endianness big_endian>
simdutf_really_inline size_t utf32_length_from_utf16(const char16_t *in,
                                                     size_t size) {
  return count_code_points<big_endian>(in, size);
}

simdutf_really_inline void
change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) {
  size_t pos = 0;

  while (pos < size / 32 * 32) {
    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
    input.swap_bytes();
    input.store(reinterpret_cast<uint16_t *>(output));
    pos += 32;
    output += 32;
  }

  scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
}

} // namespace utf16
} // unnamed namespace
} // namespace westmere
} // namespace simdutf
/* end file src/generic/utf16.h */
/* begin file src/generic/utf16/utf8_length_from_utf16_bytemask.h */
namespace simdutf {
namespace westmere {
namespace {
namespace utf16 {

using namespace simd;

template <endianness big_endian>
simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in,
                                                             size_t size) {
  size_t pos = 0;

  using vector_u16 = simd16<uint16_t>;
  constexpr size_t N = vector_u16::ELEMENTS;

  const auto one = vector_u16::splat(1);

  auto v_count = vector_u16::zero();

  // each char16 yields at least one byte
  size_t count = size / N * N;

  // in a single iteration the increment is 0, 1 or 2, even though we
  // perform three additions
  constexpr size_t max_iterations = 65535 / 2;
  size_t iteration = max_iterations;

  for (; pos < size / N * N; pos += N) {
    auto input = vector_u16::load(reinterpret_cast<const uint16_t *>(in + pos));
    if (!match_system(big_endian)) {
      input = input.swap_bytes();
    }
    // 0xd800 .. 0xdbff - low surrogate
    // 0xdc00 .. 0xdfff - high surrogate
    const auto is_surrogate = ((input & uint16_t(0xf800)) == uint16_t(0xd800));

    // c0 - chars that yield 2- or 3-byte UTF-8 codes
    const auto c0 = min(input & uint16_t(0xff80), one);

    // c1 - chars that yield 3-byte UTF-8 codes (including surrogates)
    const auto c1 = min(input & uint16_t(0xf800), one);

    /*
      Explanation of how the counting works.

      In the case of a non-surrogate character we count:
      * always 1 -- see how `count` is initialized above;
      * c0 = 1 if the current char yields 2 or 3 bytes;
      * c1 = 1 if the current char yields 3 bytes.

      Thus, we always have the correct count for the current char:
      1, 2 or 3 bytes.

      A trickier part is how we count surrogate pairs. Whenever we
      encounter a surrogate (low or high), we count it as 3 bytes and
      then subtract 1 (`is_surrogate` is -1 or 0), so each surrogate
      char yields 2. A surrogate pair, that is a low surrogate followed
      by a high one, yields the expected 4 bytes.

      It also correctly handles the case where a low surrogate is
      processed by this loop, but the high surrogate is counted by the
      scalar procedure. The scalar procedure uses exactly the approach
      described here, so for valid UTF-16 strings the total is always
      correct.
    */
    v_count += c0;
    v_count += c1;
    v_count += vector_u16(is_surrogate);

    iteration -= 1;
    if (iteration == 0) {
      count += v_count.sum();
      v_count = vector_u16::zero();
      iteration = max_iterations;
    }
  }

  if (iteration > 0) {
    count += v_count.sum();
  }

  return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos,
                                                                   size - pos);
}
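// For exposition only: the vector loop above computes, per code unit, the
// identity bytes = 1 + c0 + c1 - is_surrogate. A scalar model of a single
// unit, with a hypothetical helper name:
//
//   size_t utf8_bytes_for_unit(uint16_t w) {
//     size_t n = 1;                // matches the initialization of `count`
//     n += (w & 0xff80) != 0;      // c0: the unit needs 2 or 3 bytes
//     n += (w & 0xf800) != 0;      // c1: the unit needs 3 bytes (or is a
//                                  //     surrogate, counted as 3 so far)
//     n -= (w & 0xf800) == 0xd800; // each surrogate half then yields 2
//     return n;                    // 1, 2 or 3; 2 per surrogate half
//   }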

} // namespace utf16
} // unnamed namespace
} // namespace westmere
} // namespace simdutf
/* end file src/generic/utf16/utf8_length_from_utf16_bytemask.h */
#endif // SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
/* begin file src/generic/validate_utf16.h */
namespace simdutf {
namespace westmere {
namespace {
namespace utf16 {
/*
  UTF-16 validation
  --------------------------------------------------

  In UTF-16, code units in the range 0xD800 to 0xDFFF have a special meaning.

  In a vectorized algorithm we want to examine the most significant
  nibble in order to select a fast path. If none of the highest nibbles
  is 0xD (13), then we are sure that the UTF-16 chunk in a vector
  register is valid.

  Let us analyze what we need to check if the nibble is 0xD. The
  value of the following nibble determines what we have:

  0xd000 .. 0xd7ff - a valid word
  0xd800 .. 0xdbff - low surrogate
  0xdc00 .. 0xdfff - high surrogate

  Other constraints we have to consider:
  - there must not be two consecutive low surrogates (0xd800 .. 0xdbff)
  - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff)
  - there must not be a sole low surrogate nor a sole high surrogate

  We are going to build three bitmasks based on the 3rd nibble:
  - V = valid word,
  - L = low surrogate (0xd800 .. 0xdbff)
  - H = high surrogate (0xdc00 .. 0xdfff)

    0   1   2   3   4   5   6   7   <--- word index
  [ V | L | H | L | H | V | V | L ]
    1   0   0   0   0   1   1   0   - V = valid masks
    0   1   0   1   0   0   0   1   - L = low surrogate
    0   0   1   0   1   0   0   0   - H = high surrogate

    1   0   0   0   0   1   1   0   V = valid masks
    0   1   0   1   0   0   0   0   a = L & (H >> 1)
    0   0   1   0   1   0   0   0   b = a << 1
    1   1   1   1   1   1   1   0   c = V | a | b
                                ^
                                the last bit can be zero, we just consume 7
                                code units and recheck this word in the next
                                iteration
*/
template <endianness big_endian>
const result validate_utf16_with_errors(const char16_t *input, size_t size) {
  if (simdutf_unlikely(size == 0)) {
    return result(error_code::SUCCESS, 0);
  }

  const char16_t *start = input;
  const char16_t *end = input + size;

  const auto v_d8 = simd8<uint8_t>::splat(0xd8);
  const auto v_f8 = simd8<uint8_t>::splat(0xf8);
  const auto v_fc = simd8<uint8_t>::splat(0xfc);
  const auto v_dc = simd8<uint8_t>::splat(0xdc);

  while (input + simd16<uint16_t>::SIZE * 2 < end) {
    // 0. Load data: since the validation takes into account only the higher
    //    byte of each word, we compress the two vectors into one which
    //    consists of only the higher bytes.
    auto in0 = simd16<uint16_t>(input);
    auto in1 =
        simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));

    // Function `utf16_gather_high_bytes` consumes two vectors of UTF-16
    // and yields a single vector having only higher bytes of characters.
    const auto in = utf16_gather_high_bytes<big_endian>(in0, in1);

    // 1. Check whether we have any 0xD800..0xDFFF word (0b1101'1xxx'yyyy'yyyy).
    const auto surrogates_wordmask = (in & v_f8) == v_d8;
    const uint16_t surrogates_bitmask =
        static_cast<uint16_t>(surrogates_wordmask.to_bitmask());
    if (surrogates_bitmask == 0x0000) {
      input += 16;
    } else {
      // 2. We have some surrogates that have to be distinguished:
      //    - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
      //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
      //
      // Fact: high surrogate has 11th bit set (3rd bit in the higher byte)

      // V - non-surrogate code units
      //     V = not surrogates_wordmask
      const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);

      // H - word-mask for high surrogates: the six highest bits are 0b1101'11
      const auto vH = (in & v_fc) == v_dc;
      const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());

      // L - word mask for low surrogates
      //     L = not H and surrogates_wordmask
      const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);

      const uint16_t a = static_cast<uint16_t>(
          L & (H >> 1)); // A low surrogate must be followed by a high one.
                         // (A low surrogate placed in the 7th register's word
                         // is an exception we handle.)
      const uint16_t b = static_cast<uint16_t>(
          a << 1); // Just mark that the opposite fact holds,
                   // thanks to that we have only two masks for valid case.
      const uint16_t c = static_cast<uint16_t>(
          V | a | b); // Combine all the masks into the final one.

      if (c == 0xffff) {
        // The whole input register contains valid UTF-16, i.e.,
        // either single code units or proper surrogate pairs.
        input += 16;
      } else if (c == 0x7fff) {
        // The 15 lower code units of the input register contain valid UTF-16.
        // The 15th word may be either a low or high surrogate. In the next
        // iteration we 1) check if the low surrogate is followed by a high
        // one, 2) reject a sole high surrogate.
        input += 15;
      } else {
        return result(error_code::SURROGATE, input - start);
      }
    }
  }

  return result(error_code::SUCCESS, input - start);
}
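// For exposition only: a worked example of the mask algebra above on a
// 4-unit chunk [V, L, H, V], with bit i describing unit i:
//
//   V = 0b1001                 // non-surrogates
//   L = 0b0010                 // low surrogates (0xd800 .. 0xdbff)
//   H = 0b0100                 // high surrogates (0xdc00 .. 0xdfff)
//   a = L & (H >> 1) = 0b0010  // low surrogates followed by a high one
//   b = a << 1       = 0b0100  // the matching high surrogates
//   c = V | a | b    = 0b1111  // every unit accounted for: chunk is valid
//
// Any zero bit in c flags an unpaired surrogate, except in the topmost
// position, which may be a pair split across chunks and is re-checked in
// the next iteration.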

} // namespace utf16
} // unnamed namespace
} // namespace westmere
} // namespace simdutf
/* end file src/generic/validate_utf16.h */
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */
namespace simdutf {
namespace westmere {
namespace {
namespace utf8_to_latin1 {
using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // For UTF-8 to Latin 1, we can allow any ASCII character, and any
  // continuation byte, but the non-ASCII leading bytes must be 0b11000011 or
  // 0b11000010 and nothing else.
  //
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
  constexpr const uint8_t FORBIDDEN = 0xff;

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      FORBIDDEN,
      // 1110____ ________ <three byte lead in byte 1>
      FORBIDDEN,
      // 1111____ ________ <four+ byte lead in byte 1>
      FORBIDDEN);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1.
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              FORBIDDEN,
              // ____0101 ________
              FORBIDDEN,
              // ____011_ ________
              FORBIDDEN, FORBIDDEN,

              // ____1___ ________
              FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN,
              // ____1101 ________
              FORBIDDEN, FORBIDDEN, FORBIDDEN);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
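// For exposition only: check_special_cases classifies every adjacent pair of
// bytes with three nibble lookups. Each table maps one nibble (high and low
// nibble of the previous byte, high nibble of the current byte) to the set
// of error classes compatible with that nibble; ANDing the three sets is
// nonzero exactly when some error class is consistent with all three
// nibbles. For example, the overlong pair 0xE0 0x80 yields:
//
//   byte_1_high[0xE] == FORBIDDEN         (any 3-byte lead is invalid here)
//   byte_1_low[0x0]  contains OVERLONG_3
//   byte_2_high[0x8] contains OVERLONG_3
//
// so the AND is nonzero and the sticky `error` register lights up.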

struct validating_transcoder {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;

  validating_transcoder() : error(uint8_t(0)) {}
  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    this->error |= check_special_cases(input, prev1);
  }

  simdutf_really_inline size_t convert(const char *in, size_t size,
                                       char *latin1_output) {
    size_t pos = 0;
    char *start{latin1_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 16 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 16; margin--) {
      leading_byte += (int8_t(in[margin - 1]) >
                       -65); // twos complement of -65 is 1011 1111 ...
    }
    // If the input is long enough, then in[margin] is the sixteenth-to-last
    // leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store((int8_t *)latin1_output);
        latin1_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask =
            input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
                               // this case, we also have ASCII to account for.
        if (utf8_continuation_mask & 1) {
          return 0; // error
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 and 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_latin1(
              in + pos, utf8_end_of_code_point_mask, latin1_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      return 0;
    }
    if (pos < size) {
      size_t howmany =
          scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output);
      if (howmany == 0) {
        return 0;
      }
      latin1_output += howmany;
    }
    return latin1_output - start;
  }
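  // For exposition only: the safety margin above exists because
  // convert_masked_utf8_to_latin1 may write a few bytes beyond what it
  // actually consumed. Stopping the vectorized loop short of the last
  // counted leading bytes guarantees any overwrite lands in space that the
  // scalar tail rewrites with the correct bytes. A sketch of the margin
  // scan on its own, with a hypothetical helper name:
  //
  //   size_t margin_start(const char *in, size_t size, size_t wanted) {
  //     size_t counted = 0, margin = size;
  //     for (; margin > 0 && counted < wanted; margin--) {
  //       counted += (int8_t(in[margin - 1]) > -65); // leading or ASCII byte
  //     }
  //     return margin; // the vectorized loop must not run past in + margin
  //   }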

  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
                                                   char *latin1_output) {
    size_t pos = 0;
    char *start{latin1_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then in[margin] is the eighth-to-last
    // leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store((int8_t *)latin1_output);
        latin1_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        if (errors()) {
          // rewind_and_convert_with_errors will seek a potential error from
          // in+pos onward, with the ability to go back up to pos bytes, and
          // read size-pos bytes forward.
          result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
              pos, in + pos, size - pos, latin1_output);
          res.count += pos;
          return res;
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 and 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_latin1(
              in + pos, utf8_end_of_code_point_mask, latin1_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      // rewind_and_convert_with_errors will seek a potential error from in+pos
      // onward, with the ability to go back up to pos bytes, and read size-pos
      // bytes forward.
      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, latin1_output);
      res.count += pos;
      return res;
    }
    if (pos < size) {
      // rewind_and_convert_with_errors will seek a potential error from in+pos
      // onward, with the ability to go back up to pos bytes, and read size-pos
      // bytes forward.
      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, latin1_output);
      if (res.error) { // In case of error, we want the error position
        res.count += pos;
        return res;
      } else { // In case of success, we want the number of words written
        latin1_output += res.count;
      }
    }
    return result(error_code::SUCCESS, latin1_output - start);
  }

  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct validating_transcoder
} // namespace utf8_to_latin1
} // unnamed namespace
} // namespace westmere
} // namespace simdutf
/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */
/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
namespace simdutf {
namespace westmere {
namespace {
namespace utf8_to_latin1 {
using namespace simd;

simdutf_really_inline size_t convert_valid(const char *in, size_t size,
                                           char *latin1_output) {
  size_t pos = 0;
  char *start{latin1_output};
  // In the worst case, we have the haswell kernel which can cause an overflow
  // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last
  // 16 bytes, and if the data is valid, then it is entirely safe because 16
  // UTF-8 bytes generate much more than 8 bytes. However, you cannot generally
  // assume that you have valid UTF-8 input, so we are going to go back from the
  // end counting 8 leading bytes, to give us a good margin.
  size_t leading_byte = 0;
  size_t margin = size;
  for (; margin > 0 && leading_byte < 8; margin--) {
    leading_byte += (int8_t(in[margin - 1]) >
                     -65); // twos complement of -65 is 1011 1111 ...
  }
  // If the input is long enough, then in[margin] is the eighth-to-last
  // leading byte.
  const size_t safety_margin = size - margin + 1; // to avoid overruns!
  while (pos + 64 + safety_margin <= size) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    if (input.is_ascii()) {
      input.store((int8_t *)latin1_output);
      latin1_output += 64;
      pos += 64;
    } else {
      // you might think that a for-loop would work, but under Visual Studio, it
      // is not good enough.
      uint64_t utf8_continuation_mask =
          input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
                             // this case, we also have ASCII to account for.
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
      // We process in blocks of up to 12 bytes except possibly
      // for fast paths which may process up to 16 bytes. For the
      // slow path to work, we should have at least 12 input bytes left.
      size_t max_starting_point = (pos + 64) - 12;
      // Next loop is going to run at least five times.
      while (pos < max_starting_point) {
        // Performance note: our ability to compute 'consumed' and
        // then shift and recompute is critical. If there is a
        // latency of, say, 4 cycles on getting 'consumed', then
        // the inner loop might have a total latency of about 6 cycles.
        // Yet we process between 6 and 12 input bytes, thus we get
        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
        // for this section of the code. Hence, there is a limit
        // to how much we can further increase this latency before
        // it seriously harms performance.
        size_t consumed = convert_masked_utf8_to_latin1(
            in + pos, utf8_end_of_code_point_mask, latin1_output);
        pos += consumed;
        utf8_end_of_code_point_mask >>= consumed;
      }
      // At this point there may remain between 0 and 12 bytes in the
      // 64-byte block. These bytes will be processed again. So we have an
      // 80% efficiency (in the worst case). In practice we expect an
      // 85% to 90% efficiency.
    }
  }
  if (pos < size) {
    size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos,
                                                           latin1_output);
    latin1_output += howmany;
  }
  return latin1_output - start;
}

} // namespace utf8_to_latin1
} // unnamed namespace
} // namespace westmere
} // namespace simdutf
/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
/* begin file src/generic/validate_utf32.h */
namespace simdutf {
namespace westmere {
namespace {
namespace utf32 {

simdutf_really_inline bool validate(const char32_t *input, size_t size) {
  if (simdutf_unlikely(size == 0)) {
    // empty input is valid UTF-32. protect the implementation from
    // handling nullptr
    return true;
  }

  const char32_t *end = input + size;

  using vector_u32 = simd32<uint32_t>;

  const auto standardmax = vector_u32::splat(0x10ffff);
  const auto offset = vector_u32::splat(0xffff2000);
  const auto standardoffsetmax = vector_u32::splat(0xfffff7ff);
  auto currentmax = vector_u32::zero();
  auto currentoffsetmax = vector_u32::zero();

  constexpr size_t N = vector_u32::ELEMENTS;

  while (input + N < end) {
    auto in = vector_u32(input);
    if (!match_system(endianness::BIG)) {
      in.swap_bytes();
    }

    currentmax = max(currentmax, in);
    currentoffsetmax = max(currentoffsetmax, in + offset);
    input += N;
  }

  const auto too_large = currentmax > standardmax;
  if (too_large.any()) {
    return false;
  }

  const auto surrogate = currentoffsetmax > standardoffsetmax;
  if (surrogate.any()) {
    return false;
  }

  return scalar::utf32::validate(input, end - input);
}
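// For exposition only: the offset trick above detects surrogates with a
// single unsigned max. Adding 0xffff2000 maps 0xd800 .. 0xdfff onto
// 0xfffff800 .. 0xffffffff, strictly above 0xfffff7ff, while every other
// code point value at most 0x10ffff lands at or below that bound (larger
// sums simply wrap around to small values). A scalar check of the same
// predicate, with a hypothetical helper name:
//
//   bool is_surrogate_by_offset(uint32_t c) {
//     return uint32_t(c + 0xffff2000u) > 0xfffff7ffu; // iff 0xd800..0xdfff
//   }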

simdutf_really_inline result validate_with_errors(const char32_t *input,
                                                  size_t size) {
  if (simdutf_unlikely(size == 0)) {
    // empty input is valid UTF-32. protect the implementation from
    // handling nullptr
    return result(error_code::SUCCESS, 0);
  }

  const char32_t *start = input;
  const char32_t *end = input + size;

  using vector_u32 = simd32<uint32_t>;

  const auto standardmax = vector_u32::splat(0x10ffff + 1);
  const auto surrogate_mask = vector_u32::splat(0xfffff800);
  const auto surrogate_byte = vector_u32::splat(0x0000d800);

  constexpr size_t N = vector_u32::ELEMENTS;

  while (input + N < end) {
    auto in = vector_u32(input);
    if (!match_system(endianness::BIG)) {
      in.swap_bytes();
    }

    const auto too_large = in >= standardmax;
    const auto surrogate = (in & surrogate_mask) == surrogate_byte;

    const auto combined = too_large | surrogate;
    if (simdutf_unlikely(combined.any())) {
      const size_t consumed = input - start;
      auto sr = scalar::utf32::validate_with_errors(input, end - input);
      sr.count += consumed;

      return sr;
    }

    input += N;
  }

  const size_t consumed = input - start;
  auto sr = scalar::utf32::validate_with_errors(input, end - input);
  sr.count += consumed;

  return sr;
}

} // namespace utf32
} // unnamed namespace
} // namespace westmere
} // namespace simdutf
/* end file src/generic/validate_utf32.h */
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_BASE64
/* begin file src/generic/base64.h */
/**
 * References and further reading:
 *
 * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
 * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
 * https://arxiv.org/abs/1910.05109
 *
 * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
 * Instructions, ACM Transactions on the Web 12 (3), 2018.
 * https://arxiv.org/abs/1704.00605
 *
 * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
 * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
 * Request for Comments: 4648.
 *
 * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
 * http://www.alfredklomp.com/programming/sse-base64/. (2014).
 *
 * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
 * acceleration. https://github.com/aklomp/base64. (2014).
 *
 * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
 * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
 *
 * Nick Kopp. 2013. Base64 Encoding on a GPU.
 * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
 */
namespace simdutf {
namespace westmere {
namespace {
namespace base64 {

/*
  The following template function implements the API for Base64 decoding.

  An implementation is responsible for providing the `block64` type and
  associated methods that perform the actual conversion. Please refer
  to any vectorized implementation to learn the API of these procedures.
*/
template <bool base64_url, bool ignore_garbage, typename chartype>
full_result
compress_decode_base64(char *dst, const chartype *src, size_t srclen,
                       base64_options options,
                       last_chunk_handling_options last_chunk_options) {
  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
                                        : tables::base64::to_base64_value;
  size_t equallocation =
      srclen; // location of the first padding character if any
  // skip trailing spaces
  while (!ignore_garbage && srclen > 0 &&
         scalar::base64::is_eight_byte(src[srclen - 1]) &&
         to_base64[uint8_t(src[srclen - 1])] == 64) {
    srclen--;
  }
  size_t equalsigns = 0;
  if (!ignore_garbage && srclen > 0 && src[srclen - 1] == '=') {
    equallocation = srclen - 1;
    srclen--;
    equalsigns = 1;
    // skip trailing spaces
    while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
           to_base64[uint8_t(src[srclen - 1])] == 64) {
      srclen--;
    }
    if (srclen > 0 && src[srclen - 1] == '=') {
      equallocation = srclen - 1;
      srclen--;
      equalsigns = 2;
    }
  }
  if (srclen == 0) {
    if (!ignore_garbage && equalsigns > 0) {
      if (last_chunk_options == last_chunk_handling_options::strict) {
        return {BASE64_INPUT_REMAINDER, 0, 0};
      } else if (last_chunk_options ==
                 last_chunk_handling_options::stop_before_partial) {
        return {SUCCESS, 0, 0};
      }
      return {INVALID_BASE64_CHARACTER, equallocation, 0};
    }
    return {SUCCESS, 0, 0};
  }
  char *end_of_safe_64byte_zone =
      (srclen + 3) / 4 * 3 >= 63 ? dst + (srclen + 3) / 4 * 3 - 63 : dst;

  const chartype *const srcinit = src;
  const char *const dstinit = dst;
  const chartype *const srcend = src + srclen;

  constexpr size_t block_size = 6;
  static_assert(block_size >= 2, "block_size must be at least two");
  char buffer[block_size * 64];
  char *bufferptr = buffer;
  if (srclen >= 64) {
    const chartype *const srcend64 = src + srclen - 64;
    while (src <= srcend64) {
      block64 b(src);
      src += 64;
      uint64_t error = 0;
      const uint64_t badcharmask =
          b.to_base64_mask<base64_url, ignore_garbage>(&error);
      if (!ignore_garbage && error) {
        src -= 64;
        const size_t error_offset = trailing_zeroes(error);
        return {error_code::INVALID_BASE64_CHARACTER,
                size_t(src - srcinit + error_offset), size_t(dst - dstinit)};
      }
      if (badcharmask != 0) {
        bufferptr += b.compress_block(badcharmask, bufferptr);
      } else if (bufferptr != buffer) {
        b.copy_block(bufferptr);
        bufferptr += 64;
      } else {
        if (dst >= end_of_safe_64byte_zone) {
          b.base64_decode_block_safe(dst);
        } else {
          b.base64_decode_block(dst);
        }
        dst += 48;
      }
      if (bufferptr >= (block_size - 1) * 64 + buffer) {
        for (size_t i = 0; i < (block_size - 2); i++) {
          base64_decode_block(dst, buffer + i * 64);
          dst += 48;
        }
        if (dst >= end_of_safe_64byte_zone) {
          base64_decode_block_safe(dst, buffer + (block_size - 2) * 64);
        } else {
          base64_decode_block(dst, buffer + (block_size - 2) * 64);
        }
        dst += 48;
        std::memcpy(buffer, buffer + (block_size - 1) * 64,
                    64); // 64 might be too much
        bufferptr -= (block_size - 1) * 64;
      }
    }
  }

  char *buffer_start = buffer;
  // Optimization note: if this is almost full, then it is worth our
  // time, otherwise, we should just decode directly.
  int last_block = (int)((bufferptr - buffer_start) % 64);
  if (last_block != 0 && srcend - src + last_block >= 64) {

    while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
      uint8_t val = to_base64[uint8_t(*src)];
      *bufferptr = char(val);
      if (!ignore_garbage &&
          (!scalar::base64::is_eight_byte(*src) || val > 64)) {
        return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
                size_t(dst - dstinit)};
      }
      bufferptr += (val <= 63);
      src++;
    }
  }

  for (; buffer_start + 64 <= bufferptr; buffer_start += 64) {
    if (dst >= end_of_safe_64byte_zone) {
      base64_decode_block_safe(dst, buffer_start);
    } else {
      base64_decode_block(dst, buffer_start);
    }
    dst += 48;
  }
  if ((bufferptr - buffer_start) % 64 != 0) {
    while (buffer_start + 4 < bufferptr) {
      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
                        << 8;
#if !SIMDUTF_IS_BIG_ENDIAN
      triple = scalar::u32_swap_bytes(triple);
#endif
      std::memcpy(dst, &triple, 3);

      dst += 3;
      buffer_start += 4;
    }
    if (buffer_start + 4 <= bufferptr) {
      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
                        << 8;
#if !SIMDUTF_IS_BIG_ENDIAN
      triple = scalar::u32_swap_bytes(triple);
#endif
      std::memcpy(dst, &triple, 3);

      dst += 3;
      buffer_start += 4;
    }
    // we may have 1, 2 or 3 bytes left and we need to decode them so let us
    // backtrack
    int leftover = int(bufferptr - buffer_start);
    while (leftover > 0) {
      if (!ignore_garbage) {
        while (to_base64[uint8_t(*(src - 1))] == 64) {
          src--;
        }
      } else {
        while (to_base64[uint8_t(*(src - 1))] >= 64) {
          src--;
        }
      }
      src--;
      leftover--;
    }
  }
  if (src < srcend + equalsigns) {
    full_result r = scalar::base64::base64_tail_decode(
        dst, src, srcend - src, equalsigns, options, last_chunk_options);
    r.input_count += size_t(src - srcinit);
    if (r.error == error_code::INVALID_BASE64_CHARACTER ||
        r.error == error_code::BASE64_EXTRA_BITS) {
      return r;
    } else {
      r.output_count += size_t(dst - dstinit);
    }
    if (!ignore_garbage && last_chunk_options != stop_before_partial &&
        r.error == error_code::SUCCESS && equalsigns > 0) {
      // additional checks
      if ((r.output_count % 3 == 0) ||
          ((r.output_count % 3) + 1 + equalsigns != 4)) {
        r.error = error_code::INVALID_BASE64_CHARACTER;
        r.input_count = equallocation;
      }
    }
    return r;
  }
  if (!ignore_garbage && equalsigns > 0) {
    if ((size_t(dst - dstinit) % 3 == 0) ||
        ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
      return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)};
    }
  }
  return {SUCCESS, srclen, size_t(dst - dstinit)};
}
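// For exposition only: the `triple` expression in the tail above packs four
// 6-bit values into 24 bits, shifts them into the upper bytes of a 32-bit
// word, byte-swaps on little-endian targets so the three payload bytes come
// first in memory, and then stores exactly three bytes. A standalone sketch
// with a hypothetical helper name (explicit shifts instead of the swap):
//
//   void decode_quad(char *dst, const uint8_t idx[4]) {
//     const uint32_t triple = (uint32_t(idx[0]) << 18) |
//                             (uint32_t(idx[1]) << 12) |
//                             (uint32_t(idx[2]) << 6) | uint32_t(idx[3]);
//     dst[0] = char(triple >> 16);
//     dst[1] = char(triple >> 8);
//     dst[2] = char(triple);
//   }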

} // namespace base64
} // unnamed namespace
} // namespace westmere
} // namespace simdutf
/* end file src/generic/base64.h */
#endif // SIMDUTF_FEATURE_BASE64

//
// Implementation-specific overrides
//

namespace simdutf {
namespace westmere {

#if SIMDUTF_FEATURE_DETECT_ENCODING
simdutf_warn_unused int
implementation::detect_encodings(const char *input,
                                 size_t length) const noexcept {
  // If there is a BOM, then we trust it.
  auto bom_encoding = simdutf::BOM::check_bom(input, length);
  if (bom_encoding != encoding_type::unspecified) {
    return bom_encoding;
  }

  int out = 0;
  uint32_t utf16_err = (length % 2);
  uint32_t utf32_err = (length % 4);
  uint32_t ends_with_high = 0;
  const auto v_d8 = simd8<uint8_t>::splat(0xd8);
  const auto v_f8 = simd8<uint8_t>::splat(0xf8);
  const auto v_fc = simd8<uint8_t>::splat(0xfc);
  const auto v_dc = simd8<uint8_t>::splat(0xdc);
  const __m128i standardmax = _mm_set1_epi32(0x10ffff);
  const __m128i offset = _mm_set1_epi32(0xffff2000);
  const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff);
  __m128i currentmax = _mm_setzero_si128();
  __m128i currentoffsetmax = _mm_setzero_si128();

  utf8_checker c{};
  buf_block_reader<64> reader(reinterpret_cast<const uint8_t *>(input), length);
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    // utf8 checks
    c.check_next_input(in);

    // utf16le checks
    auto in0 = simd16<uint16_t>(in.chunks[0]);
    auto in1 = simd16<uint16_t>(in.chunks[1]);
    const auto t0 = in0.shr<8>();
    const auto t1 = in1.shr<8>();
    const auto packed1 = simd16<uint16_t>::pack(t0, t1);
    auto in2 = simd16<uint16_t>(in.chunks[2]);
    auto in3 = simd16<uint16_t>(in.chunks[3]);
    const auto t2 = in2.shr<8>();
    const auto t3 = in3.shr<8>();
    const auto packed2 = simd16<uint16_t>::pack(t2, t3);

    const auto surrogates_wordmask_lo = (packed1 & v_f8) == v_d8;
    const auto surrogates_wordmask_hi = (packed2 & v_f8) == v_d8;
    const uint32_t surrogates_bitmask =
        (surrogates_wordmask_hi.to_bitmask() << 16) |
        surrogates_wordmask_lo.to_bitmask();
    const auto vL_lo = (packed1 & v_fc) == v_dc;
    const auto vL_hi = (packed2 & v_fc) == v_dc;
    const uint32_t L = (vL_hi.to_bitmask() << 16) | vL_lo.to_bitmask();
    const uint32_t H = L ^ surrogates_bitmask;
    utf16_err |= (((H << 1) | ends_with_high) != L);
    ends_with_high = (H & 0x80000000) != 0;

    // utf32le checks
    currentmax = _mm_max_epu32(in.chunks[0], currentmax);
    currentoffsetmax =
        _mm_max_epu32(_mm_add_epi32(in.chunks[0], offset), currentoffsetmax);
    currentmax = _mm_max_epu32(in.chunks[1], currentmax);
    currentoffsetmax =
        _mm_max_epu32(_mm_add_epi32(in.chunks[1], offset), currentoffsetmax);
    currentmax = _mm_max_epu32(in.chunks[2], currentmax);
    currentoffsetmax =
        _mm_max_epu32(_mm_add_epi32(in.chunks[2], offset), currentoffsetmax);
    currentmax = _mm_max_epu32(in.chunks[3], currentmax);
    currentoffsetmax =
        _mm_max_epu32(_mm_add_epi32(in.chunks[3], offset), currentoffsetmax);

    reader.advance();
  }

  uint8_t block[64]{};
  size_t idx = reader.block_index();
  std::memcpy(block, &input[idx], length - idx);
  simd::simd8x64<uint8_t> in(block);
  c.check_next_input(in);

  // utf16le last block check
  auto in0 = simd16<uint16_t>(in.chunks[0]);
  auto in1 = simd16<uint16_t>(in.chunks[1]);
  const auto t0 = in0.shr<8>();
  const auto t1 = in1.shr<8>();
  const auto packed1 = simd16<uint16_t>::pack(t0, t1);
  auto in2 = simd16<uint16_t>(in.chunks[2]);
  auto in3 = simd16<uint16_t>(in.chunks[3]);
  const auto t2 = in2.shr<8>();
  const auto t3 = in3.shr<8>();
  const auto packed2 = simd16<uint16_t>::pack(t2, t3);

  const auto surrogates_wordmask_lo = (packed1 & v_f8) == v_d8;
  const auto surrogates_wordmask_hi = (packed2 & v_f8) == v_d8;
  const uint32_t surrogates_bitmask =
      (surrogates_wordmask_hi.to_bitmask() << 16) |
      surrogates_wordmask_lo.to_bitmask();
  const auto vL_lo = (packed1 & v_fc) == v_dc;
  const auto vL_hi = (packed2 & v_fc) == v_dc;
  const uint32_t L = (vL_hi.to_bitmask() << 16) | vL_lo.to_bitmask();
  const uint32_t H = L ^ surrogates_bitmask;
  utf16_err |= (((H << 1) | ends_with_high) != L);
  // This is required to flag the case where the last code unit is the first
  // half of a surrogate pair and the end of the input has been reached.
  ends_with_high = (H & 0x80000000) != 0;
  utf16_err |= ends_with_high;

  // utf32le last block check
  currentmax = _mm_max_epu32(in.chunks[0], currentmax);
  currentoffsetmax =
      _mm_max_epu32(_mm_add_epi32(in.chunks[0], offset), currentoffsetmax);
  currentmax = _mm_max_epu32(in.chunks[1], currentmax);
  currentoffsetmax =
      _mm_max_epu32(_mm_add_epi32(in.chunks[1], offset), currentoffsetmax);
  currentmax = _mm_max_epu32(in.chunks[2], currentmax);
  currentoffsetmax =
      _mm_max_epu32(_mm_add_epi32(in.chunks[2], offset), currentoffsetmax);
  currentmax = _mm_max_epu32(in.chunks[3], currentmax);
  currentoffsetmax =
      _mm_max_epu32(_mm_add_epi32(in.chunks[3], offset), currentoffsetmax);

  reader.advance();

  c.check_eof();
  bool is_valid_utf8 = !c.errors();
  __m128i is_zero =
      _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax);
  utf32_err |= (_mm_test_all_zeros(is_zero, is_zero) == 0);

  is_zero = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax),
                          standardoffsetmax);
  utf32_err |= (_mm_test_all_zeros(is_zero, is_zero) == 0);
  if (is_valid_utf8) {
    out |= encoding_type::UTF8;
  }
  if (utf16_err == 0) {
    out |= encoding_type::UTF16_LE;
  }
  if (utf32_err == 0) {
    out |= encoding_type::UTF32_LE;
  }
  return out;
}
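// For exposition only: the UTF-16 check in detect_encodings folds surrogate
// pairing into one comparison per 32-unit block. Here H marks units in
// 0xd800 .. 0xdbff (first half of a pair) and L marks 0xdc00 .. 0xdfff
// (second half); shifting H left by one predicts where the second halves
// must sit, and `ends_with_high` carries a pair across the block boundary.
// A scalar model of the per-block predicate:
//
//   bool block_pairs_ok(uint32_t H, uint32_t L, uint32_t &carry) {
//     const bool ok = (((H << 1) | carry) == L); // each H followed by an L,
//                                                // each L preceded by an H
//     carry = (H & 0x80000000) != 0; // the pair may finish in the next block
//     return ok;
//   }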
|
|
#endif // SIMDUTF_FEATURE_DETECT_ENCODING
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
|
|
simdutf_warn_unused bool
|
|
implementation::validate_utf8(const char *buf, size_t len) const noexcept {
|
|
return westmere::utf8_validation::generic_validate_utf8(buf, len);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
|
|
|
|
#if SIMDUTF_FEATURE_UTF8
|
|
simdutf_warn_unused result implementation::validate_utf8_with_errors(
|
|
const char *buf, size_t len) const noexcept {
|
|
return westmere::utf8_validation::generic_validate_utf8_with_errors(buf, len);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8
|
|
|
|
#if SIMDUTF_FEATURE_ASCII
|
|
simdutf_warn_unused bool
|
|
implementation::validate_ascii(const char *buf, size_t len) const noexcept {
|
|
return westmere::ascii_validation::generic_validate_ascii(buf, len);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_ASCII
|
|
|
|
#if SIMDUTF_FEATURE_ASCII
|
|
simdutf_warn_unused result implementation::validate_ascii_with_errors(
|
|
const char *buf, size_t len) const noexcept {
|
|
return westmere::ascii_validation::generic_validate_ascii_with_errors(buf,
|
|
len);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_ASCII
|
|
|
|
#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
|
|
simdutf_warn_unused bool
|
|
implementation::validate_utf16le(const char16_t *buf,
|
|
size_t len) const noexcept {
|
|
if (simdutf_unlikely(len == 0)) {
|
|
// empty input is valid UTF-16. protect the implementation from
|
|
// handling nullptr
|
|
return true;
|
|
}
|
|
const auto res =
|
|
westmere::utf16::validate_utf16_with_errors<endianness::LITTLE>(buf, len);
|
|
if (res.is_err()) {
|
|
return false;
|
|
}
|
|
|
|
if (res.count == len)
|
|
return true;
|
|
|
|
return scalar::utf16::validate<endianness::LITTLE>(buf + res.count,
|
|
len - res.count);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
|
|
|
|
#if SIMDUTF_FEATURE_UTF16
|
|
simdutf_warn_unused bool
|
|
implementation::validate_utf16be(const char16_t *buf,
|
|
size_t len) const noexcept {
|
|
if (simdutf_unlikely(len == 0)) {
|
|
// empty input is valid UTF-16. protect the implementation from
|
|
// handling nullptr
|
|
return true;
|
|
}
|
|
const auto res =
|
|
westmere::utf16::validate_utf16_with_errors<endianness::BIG>(buf, len);
|
|
if (res.is_err()) {
|
|
return false;
|
|
}
|
|
|
|
if (res.count == len)
|
|
return true;
|
|
|
|
return scalar::utf16::validate<endianness::BIG>(buf + res.count,
|
|
len - res.count);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::validate_utf16le_with_errors(
|
|
const char16_t *buf, size_t len) const noexcept {
|
|
const result res =
|
|
westmere::utf16::validate_utf16_with_errors<endianness::LITTLE>(buf, len);
|
|
if (res.count != len) {
|
|
const result scalar_res =
|
|
scalar::utf16::validate_with_errors<endianness::LITTLE>(
|
|
buf + res.count, len - res.count);
|
|
return result(scalar_res.error, res.count + scalar_res.count);
|
|
} else {
|
|
return res;
|
|
}
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::validate_utf16be_with_errors(
|
|
const char16_t *buf, size_t len) const noexcept {
|
|
const result res =
|
|
westmere::utf16::validate_utf16_with_errors<endianness::BIG>(buf, len);
|
|
if (res.count != len) {
|
|
result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(
|
|
buf + res.count, len - res.count);
|
|
return result(scalar_res.error, res.count + scalar_res.count);
|
|
} else {
|
|
return res;
|
|
}
|
|
}
|
|
|
|
void implementation::to_well_formed_utf16le(const char16_t *input, size_t len,
|
|
char16_t *output) const noexcept {
|
|
return utf16fix_sse<endianness::LITTLE>(input, len, output);
|
|
}
|
|
|
|
void implementation::to_well_formed_utf16be(const char16_t *input, size_t len,
|
|
char16_t *output) const noexcept {
|
|
return utf16fix_sse<endianness::BIG>(input, len, output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF16
|
|
|
|
#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
|
|
simdutf_warn_unused bool
|
|
implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
|
|
return utf32::validate(buf, len);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
|
|
|
|
#if SIMDUTF_FEATURE_UTF32
|
|
simdutf_warn_unused result implementation::validate_utf32_with_errors(
|
|
const char32_t *buf, size_t len) const noexcept {
|
|
return utf32::validate_with_errors(buf, len);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF32
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
|
|
simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
|
|
const char *buf, size_t len, char *utf8_output) const noexcept {
|
|
|
|
std::pair<const char *, char *> ret =
|
|
sse_convert_latin1_to_utf8(buf, len, utf8_output);
|
|
size_t converted_chars = ret.second - utf8_output;
|
|
|
|
if (ret.first != buf + len) {
|
|
const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert(
|
|
ret.first, len - (ret.first - buf), ret.second);
|
|
converted_chars += scalar_converted_chars;
|
|
}
|
|
|
|
return converted_chars;
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
|
|
|
|
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
|
|
simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
|
|
const char *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
std::pair<const char *, char16_t *> ret =
|
|
sse_convert_latin1_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
|
|
if (ret.first == nullptr) {
|
|
return 0;
|
|
}
|
|
size_t converted_chars = ret.second - utf16_output;
|
|
if (ret.first != buf + len) {
|
|
const size_t scalar_converted_chars =
|
|
scalar::latin1_to_utf16::convert<endianness::LITTLE>(
|
|
ret.first, len - (ret.first - buf), ret.second);
|
|
if (scalar_converted_chars == 0) {
|
|
return 0;
|
|
}
|
|
converted_chars += scalar_converted_chars;
|
|
}
|
|
return converted_chars;
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
|
|
const char *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
std::pair<const char *, char16_t *> ret =
|
|
sse_convert_latin1_to_utf16<endianness::BIG>(buf, len, utf16_output);
|
|
if (ret.first == nullptr) {
|
|
return 0;
|
|
}
|
|
size_t converted_chars = ret.second - utf16_output;
|
|
if (ret.first != buf + len) {
|
|
const size_t scalar_converted_chars =
|
|
scalar::latin1_to_utf16::convert<endianness::BIG>(
|
|
ret.first, len - (ret.first - buf), ret.second);
|
|
if (scalar_converted_chars == 0) {
|
|
return 0;
|
|
}
|
|
converted_chars += scalar_converted_chars;
|
|
}
|
|
return converted_chars;
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
|
|
|
|
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
|
|
simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
|
|
const char *buf, size_t len, char32_t *utf32_output) const noexcept {
|
|
std::pair<const char *, char32_t *> ret =
|
|
sse_convert_latin1_to_utf32(buf, len, utf32_output);
|
|
if (ret.first == nullptr) {
|
|
return 0;
|
|
}
|
|
size_t converted_chars = ret.second - utf32_output;
|
|
if (ret.first != buf + len) {
|
|
const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert(
|
|
ret.first, len - (ret.first - buf), ret.second);
|
|
if (scalar_converted_chars == 0) {
|
|
return 0;
|
|
}
|
|
converted_chars += scalar_converted_chars;
|
|
}
|
|
return converted_chars;
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
|
|
simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
|
|
const char *buf, size_t len, char *latin1_output) const noexcept {
|
|
utf8_to_latin1::validating_transcoder converter;
|
|
return converter.convert(buf, len, latin1_output);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
|
|
const char *buf, size_t len, char *latin1_output) const noexcept {
|
|
utf8_to_latin1::validating_transcoder converter;
|
|
return converter.convert_with_errors(buf, len, latin1_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
|
|
const char *buf, size_t len, char *latin1_output) const noexcept {
|
|
return westmere::utf8_to_latin1::convert_valid(buf, len, latin1_output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
|
|
simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
|
|
const char *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
utf8_to_utf16::validating_transcoder converter;
|
|
return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
|
|
const char *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
utf8_to_utf16::validating_transcoder converter;
|
|
return converter.convert<endianness::BIG>(buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
|
|
const char *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
utf8_to_utf16::validating_transcoder converter;
|
|
return converter.convert_with_errors<endianness::LITTLE>(buf, len,
|
|
utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
|
|
const char *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
utf8_to_utf16::validating_transcoder converter;
|
|
return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
|
|
const char *input, size_t size, char16_t *utf16_output) const noexcept {
|
|
return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size,
|
|
utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
|
|
const char *input, size_t size, char16_t *utf16_output) const noexcept {
|
|
return utf8_to_utf16::convert_valid<endianness::BIG>(input, size,
|
|
utf16_output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
|
|
simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
|
|
const char *buf, size_t len, char32_t *utf32_output) const noexcept {
|
|
utf8_to_utf32::validating_transcoder converter;
|
|
return converter.convert(buf, len, utf32_output);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
|
|
const char *buf, size_t len, char32_t *utf32_output) const noexcept {
|
|
utf8_to_utf32::validating_transcoder converter;
|
|
return converter.convert_with_errors(buf, len, utf32_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
|
|
const char *input, size_t size, char32_t *utf32_output) const noexcept {
|
|
return utf8_to_utf32::convert_valid(input, size, utf32_output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<const char16_t *, char *> ret =
      sse_convert_utf16_to_latin1<endianness::LITTLE>(buf, len, latin1_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - latin1_output;

  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf16_to_latin1::convert<endianness::LITTLE>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<const char16_t *, char *> ret =
      sse_convert_utf16_to_latin1<endianness::BIG>(buf, len, latin1_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - latin1_output;

  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf16_to_latin1::convert<endianness::BIG>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result
implementation::convert_utf16le_to_latin1_with_errors(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<result, char *> ret =
      sse_convert_utf16_to_latin1_with_errors<endianness::LITTLE>(
          buf, len, latin1_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly: on error, the internal scalar fallback has
    // already computed the correct ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res =
        scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      latin1_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused result
implementation::convert_utf16be_to_latin1_with_errors(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<result, char *> ret =
      sse_convert_utf16_to_latin1_with_errors<endianness::BIG>(buf, len,
                                                               latin1_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly: on error, the internal scalar fallback has
    // already computed the correct ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res =
        scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      latin1_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
  // optimization opportunity: we could provide an optimized function.
  return convert_utf16be_to_latin1(buf, len, latin1_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
  // optimization opportunity: we could provide an optimized function.
  return convert_utf16le_to_latin1(buf, len, latin1_output);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
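
// The converters above all follow the same two-phase pattern: the SSE kernel
// returns a pair (first unprocessed input pointer, next output pointer); a
// nullptr input pointer signals invalid input, and any unconsumed tail is
// finished by the scalar routine. Schematically (a sketch, not a public
// API -- simd_kernel and scalar_convert stand in for the sse_convert_* and
// scalar::* routines used above):
//
//   auto ret = simd_kernel(buf, len, out);          // {in_ptr, out_ptr}
//   if (ret.first == nullptr) return 0;             // invalid input
//   size_t written = ret.second - out;
//   if (ret.first != buf + len)                     // tail remains
//     written += scalar_convert(ret.first, len - (ret.first - buf),
//                               ret.second);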

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  std::pair<const char16_t *, char *> ret =
      sse_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf8_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf16_to_utf8::convert<endianness::LITTLE>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  std::pair<const char16_t *, char *> ret =
      sse_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf8_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf16_to_utf8::convert<endianness::BIG>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written, even when the conversion is finished
  std::pair<result, char *> ret =
      westmere::sse_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(
          buf, len, utf8_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly: on error, the internal scalar fallback has
    // already computed the correct ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res =
        scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf8_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written, even when the conversion is finished
  std::pair<result, char *> ret =
      westmere::sse_convert_utf16_to_utf8_with_errors<endianness::BIG>(
          buf, len, utf8_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly: on error, the internal scalar fallback has
    // already computed the correct ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res =
        scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf8_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  return convert_utf16le_to_utf8(buf, len, utf8_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  return convert_utf16be_to_utf8(buf, len, utf8_output);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
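
// For the *_with_errors variants, simdutf::result carries a dual meaning:
// while an error is being reported, count is the position of the error in
// the *input* (in code units); on success, the code above rewrites count to
// the number of *output* code units written. Callers should therefore check
// res.error before interpreting res.count. A hedged usage sketch against the
// public API:
//
//   simdutf::result res =
//       simdutf::convert_utf16le_to_utf8_with_errors(src, len, dst);
//   if (res.error != simdutf::error_code::SUCCESS) {
//     // res.count == index of the offending UTF-16 code unit
//   } else {
//     // res.count == number of UTF-8 bytes written
//   }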

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<const char32_t *, char *> ret =
      sse_convert_utf32_to_latin1(buf, len, latin1_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - latin1_output;
  if (ret.first < buf + len) {
    const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert(
        ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written, even when the conversion is finished
  std::pair<result, char *> ret =
      westmere::sse_convert_utf32_to_latin1_with_errors(buf, len,
                                                        latin1_output);
  if (ret.first.count != len) {
    result scalar_res = scalar::utf32_to_latin1::convert_with_errors(
        buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      latin1_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  // optimization opportunity: we could provide an optimized function.
  return convert_utf32_to_latin1(buf, len, latin1_output);
}
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  std::pair<const char32_t *, char *> ret =
      sse_convert_utf32_to_utf8(buf, len, utf8_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf8_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
        ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written, even when the conversion is finished
  std::pair<result, char *> ret =
      westmere::sse_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
  if (ret.first.count != len) {
    result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
        buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf8_output; // Set count to the number of 8-bit code units written
  return ret.first;
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  std::pair<const char16_t *, char32_t *> ret =
      sse_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf32_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf16_to_utf32::convert<endianness::LITTLE>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  std::pair<const char16_t *, char32_t *> ret =
      sse_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf32_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf16_to_utf32::convert<endianness::BIG>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written, even when the conversion is finished
  std::pair<result, char32_t *> ret =
      westmere::sse_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(
          buf, len, utf32_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly: on error, the internal scalar fallback has
    // already computed the correct ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res =
        scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf32_output; // Set count to the number of 32-bit code units written
  return ret.first;
}

simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written, even when the conversion is finished
  std::pair<result, char32_t *> ret =
      westmere::sse_convert_utf16_to_utf32_with_errors<endianness::BIG>(
          buf, len, utf32_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly: on error, the internal scalar fallback has
    // already computed the correct ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res =
        scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf32_output; // Set count to the number of 32-bit code units written
  return ret.first;
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  return convert_utf32_to_utf8(buf, len, utf8_output);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  std::pair<const char32_t *, char16_t *> ret =
      sse_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf16_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf32_to_utf16::convert<endianness::LITTLE>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  std::pair<const char32_t *, char16_t *> ret =
      sse_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf16_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf32_to_utf16::convert<endianness::BIG>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written, even when the conversion is finished
  std::pair<result, char16_t *> ret =
      westmere::sse_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(
          buf, len, utf16_output);
  if (ret.first.count != len) {
    result scalar_res =
        scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf16_output; // Set count to the number of 16-bit code units written
  return ret.first;
}

simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written, even when the conversion is finished
  std::pair<result, char16_t *> ret =
      westmere::sse_convert_utf32_to_utf16_with_errors<endianness::BIG>(
          buf, len, utf16_output);
  if (ret.first.count != len) {
    result scalar_res =
        scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf16_output; // Set count to the number of 16-bit code units written
  return ret.first;
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  return convert_utf32_to_utf16le(buf, len, utf16_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  return convert_utf32_to_utf16be(buf, len, utf16_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  return convert_utf16le_to_utf32(buf, len, utf32_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  return convert_utf16be_to_utf32(buf, len, utf32_output);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16
void implementation::change_endianness_utf16(const char16_t *input,
                                             size_t length,
                                             char16_t *output) const noexcept {
  utf16::change_endianness_utf16(input, length, output);
}

simdutf_warn_unused size_t implementation::count_utf16le(
    const char16_t *input, size_t length) const noexcept {
  return utf16::count_code_points<endianness::LITTLE>(input, length);
}

simdutf_warn_unused size_t implementation::count_utf16be(
    const char16_t *input, size_t length) const noexcept {
  return utf16::count_code_points<endianness::BIG>(input, length);
}
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8
simdutf_warn_unused size_t
implementation::count_utf8(const char *input, size_t length) const noexcept {
  return utf8::count_code_points_bytemask(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
    const char *buf, size_t len) const noexcept {
  return count_utf8(buf, len);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
    const char16_t *input, size_t length) const noexcept {
  return utf16::utf8_length_from_utf16_bytemask<endianness::LITTLE>(input,
                                                                    length);
}

simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
    const char16_t *input, size_t length) const noexcept {
  return utf16::utf8_length_from_utf16_bytemask<endianness::BIG>(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
    const char *input, size_t len) const noexcept {
  const uint8_t *str = reinterpret_cast<const uint8_t *>(input);
  size_t answer = len / sizeof(__m128i) * sizeof(__m128i);
  size_t i = 0;
  if (answer >= 2048) { // long strings optimization
    __m128i two_64bits = _mm_setzero_si128();
    while (i + sizeof(__m128i) <= len) {
      __m128i runner = _mm_setzero_si128();
      size_t iterations = (len - i) / sizeof(__m128i);
      if (iterations > 255) {
        iterations = 255;
      }
      size_t max_i = i + iterations * sizeof(__m128i) - sizeof(__m128i);
      for (; i + 4 * sizeof(__m128i) <= max_i; i += 4 * sizeof(__m128i)) {
        __m128i input1 = _mm_loadu_si128((const __m128i *)(str + i));
        __m128i input2 =
            _mm_loadu_si128((const __m128i *)(str + i + sizeof(__m128i)));
        __m128i input3 =
            _mm_loadu_si128((const __m128i *)(str + i + 2 * sizeof(__m128i)));
        __m128i input4 =
            _mm_loadu_si128((const __m128i *)(str + i + 3 * sizeof(__m128i)));
        __m128i input12 =
            _mm_add_epi8(_mm_cmpgt_epi8(_mm_setzero_si128(), input1),
                         _mm_cmpgt_epi8(_mm_setzero_si128(), input2));
        __m128i input34 =
            _mm_add_epi8(_mm_cmpgt_epi8(_mm_setzero_si128(), input3),
                         _mm_cmpgt_epi8(_mm_setzero_si128(), input4));
        __m128i input1234 = _mm_add_epi8(input12, input34);
        runner = _mm_sub_epi8(runner, input1234);
      }
      for (; i <= max_i; i += sizeof(__m128i)) {
        __m128i more_input = _mm_loadu_si128((const __m128i *)(str + i));
        runner = _mm_sub_epi8(runner,
                              _mm_cmpgt_epi8(_mm_setzero_si128(), more_input));
      }
      two_64bits =
          _mm_add_epi64(two_64bits, _mm_sad_epu8(runner, _mm_setzero_si128()));
    }
    answer +=
        _mm_extract_epi64(two_64bits, 0) + _mm_extract_epi64(two_64bits, 1);
  } else if (answer > 0) { // short string optimization
    for (; i + 2 * sizeof(__m128i) <= len; i += 2 * sizeof(__m128i)) {
      __m128i latin = _mm_loadu_si128((const __m128i *)(input + i));
      uint16_t non_ascii = (uint16_t)_mm_movemask_epi8(latin);
      answer += count_ones(non_ascii);
      latin = _mm_loadu_si128((const __m128i *)(input + i) + 1);
      non_ascii = (uint16_t)_mm_movemask_epi8(latin);
      answer += count_ones(non_ascii);
    }
    for (; i + sizeof(__m128i) <= len; i += sizeof(__m128i)) {
      __m128i latin = _mm_loadu_si128((const __m128i *)(input + i));
      uint16_t non_ascii = (uint16_t)_mm_movemask_epi8(latin);
      answer += count_ones(non_ascii);
    }
  }
  return answer + scalar::latin1::utf8_length_from_latin1(
                      reinterpret_cast<const char *>(str + i), len - i);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
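
// The UTF-8 length of a Latin-1 string is len plus one extra byte for every
// input byte >= 0x80 (each such byte becomes a two-byte UTF-8 sequence).
// The vector code above counts the set high bits; e.g. for "café" stored as
// Latin-1 (4 bytes, one of them 0xE9 >= 0x80) the answer is 4 + 1 = 5.
// A scalar sketch of the same computation:
//
//   size_t utf8_len = len;
//   for (size_t k = 0; k < len; k++) {
//     utf8_len += (uint8_t(input[k]) >> 7); // +1 per non-ASCII byte
//   }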

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
    const char16_t *input, size_t length) const noexcept {
  return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
}

simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
    const char16_t *input, size_t length) const noexcept {
  return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
    const char *input, size_t length) const noexcept {
  return utf8::utf16_length_from_utf8_bytemask(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
    const char32_t *input, size_t length) const noexcept {
  return utf32::utf8_length_from_utf32(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
    const char32_t *input, size_t length) const noexcept {
  const __m128i v_00000000 = _mm_setzero_si128();
  const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
  size_t pos = 0;
  size_t count = 0;
  for (; pos + 4 <= length; pos += 4) {
    __m128i in = _mm_loadu_si128((__m128i *)(input + pos));
    const __m128i surrogate_bytemask =
        _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000);
    const uint16_t surrogate_bitmask =
        static_cast<uint16_t>(_mm_movemask_epi8(surrogate_bytemask));
    size_t surrogate_count = (16 - count_ones(surrogate_bitmask)) / 4;
    count += 4 + surrogate_count;
  }
  return count +
         scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
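
// Math note for the loop above: a UTF-32 code point needs one UTF-16 code
// unit if it fits in 16 bits and two (a surrogate pair) otherwise. The
// comparison marks 16-bits-or-less lanes with 0xFFFFFFFF, so each such lane
// contributes 4 set bits to the 16-bit movemask. Hence
// (16 - popcount(mask)) / 4 is the number of lanes needing pairs, and each
// block of 4 code points yields 4 + surrogate_count code units. For example,
// a block containing one code point above 0xFFFF gives popcount 12 and
// 4 + (16 - 12) / 4 = 5 code units.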

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
    const char *input, size_t length) const noexcept {
  return utf8::count_code_points(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_BASE64
simdutf_warn_unused result implementation::base64_to_binary(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return base64::compress_decode_base64<true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<true, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return base64::compress_decode_base64<false, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return base64::compress_decode_base64<true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<true, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return base64::compress_decode_base64<false, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

simdutf_warn_unused result implementation::base64_to_binary(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return base64::compress_decode_base64<true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<true, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return base64::compress_decode_base64<false, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return base64::compress_decode_base64<true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<true, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return base64::compress_decode_base64<false, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

size_t implementation::binary_to_base64(const char *input, size_t length,
                                        char *output,
                                        base64_options options) const noexcept {
  if (options & base64_url) {
    return encode_base64<true>(output, input, length, options);
  } else {
    return encode_base64<false>(output, input, length, options);
  }
}
#endif // SIMDUTF_FEATURE_BASE64
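
// Design note: the base64_url and accept_garbage choices are lifted into
// template parameters so that compress_decode_base64 is compiled once per
// (alphabet, garbage policy) combination, keeping the per-byte loop free of
// runtime branches; the runtime cost is a single option test per call. A
// hedged usage sketch against the public simdutf API:
//
//   std::vector<char> bin(simdutf::maximal_binary_length_from_base64(b64, n));
//   simdutf::result r = simdutf::base64_to_binary(b64, n, bin.data());
//   if (r.error == simdutf::error_code::SUCCESS) { bin.resize(r.count); }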

} // namespace westmere
} // namespace simdutf

/* begin file src/simdutf/westmere/end.h */
#if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
// nothing needed.
#else
SIMDUTF_UNTARGET_REGION
#endif

#undef SIMDUTF_SIMD_HAS_BYTEMASK
/* end file src/simdutf/westmere/end.h */
/* end file src/westmere/implementation.cpp */
#endif
#if SIMDUTF_IMPLEMENTATION_LSX
/* begin file src/lsx/implementation.cpp */
/* begin file src/simdutf/lsx/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "lsx"
// #define SIMDUTF_IMPLEMENTATION lsx
#define SIMDUTF_SIMD_HAS_UNSIGNED_CMP 1
/* end file src/simdutf/lsx/begin.h */
namespace simdutf {
namespace lsx {
namespace {
#ifndef SIMDUTF_LSX_H
  #error "lsx.h must be included"
#endif
using namespace simd;

#if SIMDUTF_FEATURE_UTF8
// convert vmskltz/vmskgez/vmsknz to
// simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes index
const uint8_t lsx_1_2_utf8_bytes_mask[] = {
    0,   1,   4,   5,   16,  17,  20,  21,  64,  65,  68,  69,  80,  81,  84,
    85,  2,   3,   6,   7,   18,  19,  22,  23,  66,  67,  70,  71,  82,  83,
    86,  87,  8,   9,   12,  13,  24,  25,  28,  29,  72,  73,  76,  77,  88,
    89,  92,  93,  10,  11,  14,  15,  26,  27,  30,  31,  74,  75,  78,  79,
    90,  91,  94,  95,  32,  33,  36,  37,  48,  49,  52,  53,  96,  97,  100,
    101, 112, 113, 116, 117, 34,  35,  38,  39,  50,  51,  54,  55,  98,  99,
    102, 103, 114, 115, 118, 119, 40,  41,  44,  45,  56,  57,  60,  61,  104,
    105, 108, 109, 120, 121, 124, 125, 42,  43,  46,  47,  58,  59,  62,  63,
    106, 107, 110, 111, 122, 123, 126, 127, 128, 129, 132, 133, 144, 145, 148,
    149, 192, 193, 196, 197, 208, 209, 212, 213, 130, 131, 134, 135, 146, 147,
    150, 151, 194, 195, 198, 199, 210, 211, 214, 215, 136, 137, 140, 141, 152,
    153, 156, 157, 200, 201, 204, 205, 216, 217, 220, 221, 138, 139, 142, 143,
    154, 155, 158, 159, 202, 203, 206, 207, 218, 219, 222, 223, 160, 161, 164,
    165, 176, 177, 180, 181, 224, 225, 228, 229, 240, 241, 244, 245, 162, 163,
    166, 167, 178, 179, 182, 183, 226, 227, 230, 231, 242, 243, 246, 247, 168,
    169, 172, 173, 184, 185, 188, 189, 232, 233, 236, 237, 248, 249, 252, 253,
    170, 171, 174, 175, 186, 187, 190, 191, 234, 235, 238, 239, 250, 251, 254,
    255};
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32
simdutf_really_inline __m128i lsx_swap_bytes(__m128i vec) {
  return __lsx_vshuf4i_b(vec, 0b10110001);
}
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32
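
// lsx_swap_bytes swaps adjacent byte pairs: the __lsx_vshuf4i_b immediate
// 0b10110001 encodes the within-each-4-byte-group order (1, 0, 3, 2), which
// exchanges bytes 0<->1 and 2<->3, i.e. a 16-bit byteswap across the vector.
// For example, lane bytes {0x12, 0x34, 0xAB, 0xCD, ...} become
// {0x34, 0x12, 0xCD, 0xAB, ...}.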

#if SIMDUTF_FEATURE_ASCII || SIMDUTF_FEATURE_DETECT_ENCODING ||                \
    SIMDUTF_FEATURE_UTF8
simdutf_really_inline bool is_ascii(const simd8x64<uint8_t> &input) {
  return input.is_ascii();
}
#endif // SIMDUTF_FEATURE_ASCII || SIMDUTF_FEATURE_DETECT_ENCODING ||
       // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
simdutf_really_inline simd8<bool>
must_be_2_3_continuation(const simd8<uint8_t> prev2,
                         const simd8<uint8_t> prev3) {
  simd8<bool> is_third_byte = prev2 >= uint8_t(0b11100000u);
  simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u);
  return is_third_byte ^ is_fourth_byte;
}
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
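
// Note on must_be_2_3_continuation: prev2 >= 0xE0 marks positions two bytes
// after a 3- or 4-byte lead, and prev3 >= 0xF0 marks positions three bytes
// after a 4-byte lead; either condition means a continuation byte is
// mandatory here. In valid UTF-8 the two conditions cannot both hold for
// the same position (that would imply overlapping sequences), so XOR agrees
// with OR on valid input, and malformed overlaps are rejected by the
// separate sequence checks elsewhere in the validator.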

#if SIMDUTF_FEATURE_UTF8 && (SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32)
// common functions for utf8 conversions
simdutf_really_inline __m128i convert_utf8_3_byte_to_utf16(__m128i in) {
  // Low half contains  10bbbbbb|10cccccc
  // High half contains 1110aaaa|1110aaaa
  const v16u8 sh = {2, 1, 5, 4, 8, 7, 11, 10, 0, 0, 3, 3, 6, 6, 9, 9};
  const v8u16 v0fff = {0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff};

  __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, (__m128i)sh);
  // 1110aaaa => aaaa0000
  __m128i perm_high = __lsx_vslli_b(__lsx_vbsrl_v(perm, 8), 4);
  // 10bbbbbb 10cccccc => 0010bbbb bbcccccc
  __m128i composed = __lsx_vbitsel_v(__lsx_vsrli_h(perm, 2), /* perm >> 2 */
                                     perm, __lsx_vrepli_h(0x3f) /* 0x003f */);
  // 0010bbbb bbcccccc => aaaabbbb bbcccccc
  composed = __lsx_vbitsel_v(perm_high, composed, (__m128i)v0fff);

  return composed;
}

simdutf_really_inline __m128i convert_utf8_2_byte_to_utf16(__m128i in) {
  // 10bbbbbb 110aaaaa => 00bbbbbb 000aaaaa
  __m128i composed = __lsx_vand_v(in, __lsx_vldi(0x3f));
  // 00bbbbbb 000aaaaa => 00000aaa aabbbbbb
  composed = __lsx_vbitsel_v(
      __lsx_vsrli_h(__lsx_vslli_h(composed, 8), 2), /* (aaaaa << 8) >> 2 */
      __lsx_vsrli_h(composed, 8),                   /* bbbbbb >> 8 */
      __lsx_vrepli_h(0x3f));                        /* 0x003f */
  return composed;
}

simdutf_really_inline __m128i
convert_utf8_1_to_2_byte_to_utf16(__m128i in, size_t shufutf8_idx) {
  // Converts 6 1-2 byte UTF-8 characters to 6 UTF-16 characters.
  // This is a relatively easy scenario: we process SIX (6) input code units.
  // The max length in bytes of six code units spanning between 1 and 2 bytes
  // each is 12 bytes.
  __m128i sh =
      __lsx_vld(reinterpret_cast<const uint8_t *>(
                    simdutf::tables::utf8_to_utf16::shufutf8[shufutf8_idx]),
                0);
  // Shuffle
  // 1 byte: 00000000 0bbbbbbb
  // 2 byte: 110aaaaa 10bbbbbb
  __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, sh);
  // 1 byte: 00000000 0bbbbbbb
  // 2 byte: 00000000 00bbbbbb
  __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_h(0x7f)); // 6 or 7 bits
  // 1 byte: 00000000 00000000
  // 2 byte: 00000aaa aa000000
  const __m128i v1f00 = lsx_splat_u16(0x1f00);
  __m128i composed = __lsx_vsrli_h(__lsx_vand_v(perm, v1f00), 2); // 5 bits
  // Combine with a shift right accumulate
  // 1 byte: 00000000 0bbbbbbb
  // 2 byte: 00000aaa aabbbbbb
  composed = __lsx_vadd_h(ascii, composed);
  return composed;
}
#endif // SIMDUTF_FEATURE_UTF8 && (SIMDUTF_FEATURE_UTF16 ||
       // SIMDUTF_FEATURE_UTF32)

#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
/* begin file src/lsx/lsx_validate_utf16.cpp */
template <endianness big_endian>
simd8<uint8_t> utf16_gather_high_bytes(const simd16<uint16_t> in0,
                                       const simd16<uint16_t> in1) {
  if (big_endian) {
    const auto mask = simd16<uint16_t>(0x00ff);
    const auto t0 = in0 & mask;
    const auto t1 = in1 & mask;

    return simd16<uint16_t>::pack(t0, t1);
  } else {
    return simd16<uint16_t>::pack_shifted_right<8>(in0, in1);
  }
}
/* end file src/lsx/lsx_validate_utf16.cpp */
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
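
// utf16_gather_high_bytes packs the high byte of each 16-bit code unit into
// a single vector of bytes (for big-endian input the high byte sits in the
// low-order half of each loaded lane, hence the 0x00ff mask). Surrogate
// detection then only needs byte comparisons: a high byte in [0xD8, 0xDC)
// marks a high surrogate and one in [0xDC, 0xE0) a low surrogate.
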
#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
/* begin file src/lsx/lsx_validate_utf32le.cpp */
const char32_t *lsx_validate_utf32le(const char32_t *input, size_t size) {
  const char32_t *end = input + size;

  __m128i offset = lsx_splat_u32(0xffff2000);
  __m128i standardoffsetmax = lsx_splat_u32(0xfffff7ff);
  __m128i standardmax = lsx_splat_u32(0x10ffff);
  __m128i currentmax = lsx_splat_u32(0);
  __m128i currentoffsetmax = lsx_splat_u32(0);

  while (input + 4 < end) {
    __m128i in = __lsx_vld(reinterpret_cast<const uint32_t *>(input), 0);
    currentmax = __lsx_vmax_wu(in, currentmax);
    // 0xD8__ + 0x2000 = 0xF8__ => 0xF8__ > 0xF7FF
    currentoffsetmax =
        __lsx_vmax_wu(__lsx_vadd_w(in, offset), currentoffsetmax);

    input += 4;
  }

  __m128i is_zero =
      __lsx_vxor_v(__lsx_vmax_wu(currentmax, standardmax), standardmax);
  if (__lsx_bnz_v(is_zero)) {
    return nullptr;
  }

  is_zero = __lsx_vxor_v(__lsx_vmax_wu(currentoffsetmax, standardoffsetmax),
                         standardoffsetmax);
  if (__lsx_bnz_v(is_zero)) {
    return nullptr;
  }

  return input;
}

const result lsx_validate_utf32le_with_errors(const char32_t *input,
                                              size_t size) {
  const char32_t *start = input;
  const char32_t *end = input + size;

  __m128i offset = lsx_splat_u32(0xffff2000);
  __m128i standardoffsetmax = lsx_splat_u32(0xfffff7ff);
  __m128i standardmax = lsx_splat_u32(0x10ffff);
  __m128i currentmax = lsx_splat_u32(0);
  __m128i currentoffsetmax = lsx_splat_u32(0);

  while (input + 4 < end) {
    __m128i in = __lsx_vld(reinterpret_cast<const uint32_t *>(input), 0);
    currentmax = __lsx_vmax_wu(in, currentmax);
    currentoffsetmax =
        __lsx_vmax_wu(__lsx_vadd_w(in, offset), currentoffsetmax);

    __m128i is_zero =
        __lsx_vxor_v(__lsx_vmax_wu(currentmax, standardmax), standardmax);
    if (__lsx_bnz_v(is_zero)) {
      return result(error_code::TOO_LARGE, input - start);
    }

    is_zero = __lsx_vxor_v(__lsx_vmax_wu(currentoffsetmax, standardoffsetmax),
                           standardoffsetmax);
    if (__lsx_bnz_v(is_zero)) {
      return result(error_code::SURROGATE, input - start);
    }

    input += 4;
  }

  return result(error_code::SUCCESS, input - start);
}
/* end file src/lsx/lsx_validate_utf32le.cpp */
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
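
// Arithmetic behind the validation above, using 32-bit wraparound: adding
// 0xffff2000 maps the surrogate range U+D800..U+DFFF onto
// 0xfffff800..0xffffffff, every value of which exceeds the threshold
// 0xfffff7ff, while all other values at or below 0x10FFFF stay below it
// (the separate currentmax check rejects values above 0x10FFFF). For
// example:
//
//   0x0000D800 + 0xffff2000 = 0xfffff800 > 0xfffff7ff  -> SURROGATE
//   0x0000E000 + 0xffff2000 = 0x00000000 (wraps)       -> ok
//   0x0010FFFF + 0xffff2000 = 0x00101fff (wraps)       -> ok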

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
/* begin file src/lsx/lsx_convert_latin1_to_utf8.cpp */
/*
Returns a pair: the first unprocessed byte from buf and utf8_output.
A scalar routine should carry on the conversion of the tail.
*/

std::pair<const char *, char *>
lsx_convert_latin1_to_utf8(const char *latin1_input, size_t len,
                           char *utf8_out) {
  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
  const char *end = latin1_input + len;

  __m128i zero = __lsx_vldi(0);
  // We always write 16 bytes, of which more than the first 8 bytes
  // are valid. A safety margin of 8 is more than sufficient.
  while (end - latin1_input >= 16) {
    __m128i in8 = __lsx_vld(reinterpret_cast<const uint8_t *>(latin1_input), 0);
    uint32_t ascii = __lsx_vpickve2gr_hu(__lsx_vmskgez_b(in8), 0);
    if (ascii == 0xffff) { // ASCII fast path!!!!
      __lsx_vst(in8, utf8_output, 0);
      utf8_output += 16;
      latin1_input += 16;
      continue;
    }
    // We just fall back on the UTF-16 code. This could be
    // optimized/simplified further.
    __m128i in16 = __lsx_vilvl_b(zero, in8);
    // 1. prepare 2-byte values
    // input 8-bit word : [aabb|bbbb] x 8
    // expected output  : [1100|00aa|10bb|bbbb] x 8
    // t0 = [0000|00aa|bbbb|bb00]
    __m128i t0 = __lsx_vslli_h(in16, 2);
    // t1 = [0000|00aa|0000|0000]
    __m128i t1 = __lsx_vand_v(t0, lsx_splat_u16(0x300));
    // t2 = [0000|00aa|00bb|bbbb]
    __m128i t2 = __lsx_vbitsel_v(t1, in16, __lsx_vrepli_h(0x3f));
    // t3 = [1100|00aa|10bb|bbbb]
    __m128i t3 = __lsx_vor_v(t2, __lsx_vreplgr2vr_h(uint16_t(0xc080)));
    // merge ASCII and 2-byte codewords
    __m128i one_byte_bytemask = __lsx_vsle_hu(in16, __lsx_vrepli_h(0x7F));
    __m128i utf8_unpacked = __lsx_vbitsel_v(t3, in16, one_byte_bytemask);

    const uint8_t *row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
        [lsx_1_2_utf8_bytes_mask[(ascii & 0xff)]][0];
    __m128i shuffle = __lsx_vld(row + 1, 0);
    __m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle);

    // store bytes
    __lsx_vst(utf8_packed, utf8_output, 0);
    // adjust pointers
    latin1_input += 8;
    utf8_output += row[0];

  } // while

  return std::make_pair(latin1_input, reinterpret_cast<char *>(utf8_output));
}
/* end file src/lsx/lsx_convert_latin1_to_utf8.cpp */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
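
// The Latin-1 to UTF-8 kernel above expands each non-ASCII byte (bits
// aabbbbbb) into the two-byte sequence 110000aa 10bbbbbb, then uses the
// pack_1_2_utf8_bytes shuffle table to squeeze out the unused high byte of
// each ASCII lane. Scalar equivalent for one byte c:
//
//   if (uint8_t(c) < 0x80) {
//     *out++ = c;
//   } else {
//     *out++ = char(0xC0 | (uint8_t(c) >> 6)); // 0xC2 or 0xC3
//     *out++ = char(0x80 | (c & 0x3F));
//   }
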
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
/* begin file src/lsx/lsx_convert_latin1_to_utf16.cpp */
std::pair<const char *, char16_t *>
lsx_convert_latin1_to_utf16le(const char *buf, size_t len,
                              char16_t *utf16_output) {
  const char *end = buf + len;

  __m128i zero = __lsx_vldi(0);
  while (end - buf >= 16) {
    __m128i in8 = __lsx_vld(reinterpret_cast<const uint8_t *>(buf), 0);

    __m128i inlow = __lsx_vilvl_b(zero, in8);
    __m128i inhigh = __lsx_vilvh_b(zero, in8);
    __lsx_vst(inlow, reinterpret_cast<uint16_t *>(utf16_output), 0);
    __lsx_vst(inhigh, reinterpret_cast<uint16_t *>(utf16_output), 16);

    utf16_output += 16;
    buf += 16;
  }

  return std::make_pair(buf, utf16_output);
}

std::pair<const char *, char16_t *>
lsx_convert_latin1_to_utf16be(const char *buf, size_t len,
                              char16_t *utf16_output) {
  const char *end = buf + len;
  __m128i zero = __lsx_vldi(0);
  while (end - buf >= 16) {
    __m128i in8 = __lsx_vld(reinterpret_cast<const uint8_t *>(buf), 0);

    __m128i inlow = __lsx_vilvl_b(in8, zero);
    __m128i inhigh = __lsx_vilvh_b(in8, zero);
    __lsx_vst(inlow, reinterpret_cast<uint16_t *>(utf16_output), 0);
    __lsx_vst(inhigh, reinterpret_cast<uint16_t *>(utf16_output), 16);
    utf16_output += 16;
    buf += 16;
  }

  return std::make_pair(buf, utf16_output);
}
/* end file src/lsx/lsx_convert_latin1_to_utf16.cpp */
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
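
// Widening Latin-1 to UTF-16 is a zero-extension, done here with byte
// interleaves: __lsx_vilvl_b/__lsx_vilvh_b merge the input with a zero
// vector. Note the operand order flips between the LE and BE variants, so
// the zero byte lands after the data byte for little endian and before it
// for big endian -- the interleave performs the byte swap for free.
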
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
/* begin file src/lsx/lsx_convert_latin1_to_utf32.cpp */
std::pair<const char *, char32_t *>
lsx_convert_latin1_to_utf32(const char *buf, size_t len,
                            char32_t *utf32_output) {
  const char *end = buf + len;

  while (end - buf >= 16) {
    __m128i in8 = __lsx_vld(reinterpret_cast<const uint8_t *>(buf), 0);

    __m128i zero = __lsx_vldi(0);
    __m128i in16low = __lsx_vilvl_b(zero, in8);
    __m128i in16high = __lsx_vilvh_b(zero, in8);
    __m128i in32_0 = __lsx_vilvl_h(zero, in16low);
    __m128i in32_1 = __lsx_vilvh_h(zero, in16low);
    __m128i in32_2 = __lsx_vilvl_h(zero, in16high);
    __m128i in32_3 = __lsx_vilvh_h(zero, in16high);

    __lsx_vst(in32_0, reinterpret_cast<uint32_t *>(utf32_output), 0);
    __lsx_vst(in32_1, reinterpret_cast<uint32_t *>(utf32_output + 4), 0);
    __lsx_vst(in32_2, reinterpret_cast<uint32_t *>(utf32_output + 8), 0);
    __lsx_vst(in32_3, reinterpret_cast<uint32_t *>(utf32_output + 12), 0);

    utf32_output += 16;
    buf += 16;
  }

  return std::make_pair(buf, utf32_output);
}
/* end file src/lsx/lsx_convert_latin1_to_utf32.cpp */
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
/* begin file src/lsx/lsx_convert_utf8_to_utf16.cpp */
// Convert up to 16 bytes from utf8 to utf16 using a mask indicating the
// end of the code points. Only the least significant 12 bits of the mask
// are accessed.
// It returns how many bytes were consumed (up to 16, usually 12).
template <endianness big_endian>
size_t convert_masked_utf8_to_utf16(const char *input,
                                    uint64_t utf8_end_of_code_point_mask,
                                    char16_t *&utf16_output) {
  // we use an approach where we try to process up to 12 input bytes.
  // Why 12 input bytes and not 16? Because we are concerned with the size of
  // the lookup tables. Also 12 is nicely divisible by two and three.
  //
  __m128i in = __lsx_vld(reinterpret_cast<const uint8_t *>(input), 0);
  const uint16_t input_utf8_end_of_code_point_mask =
      utf8_end_of_code_point_mask & 0xfff;
  //
  // Optimization note: our main path below is load-latency dependent. Thus it
  // is maybe beneficial to have fast paths that depend on branch prediction
  // but have less latency. This results in more instructions but,
  // potentially, also higher speeds.

  // We first try a few fast paths.
  // The obvious first test is ASCII, which actually consumes the full 16.
  if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xFFFF) {
    // We process in chunks of 16 bytes.
    // The routine in simd.h is reused.
    simd8<int8_t> temp{in};
    temp.store_ascii_as_utf16<big_endian>(utf16_output);
    utf16_output += 16; // We wrote 16 16-bit characters.
    return 16;          // We consumed 16 bytes.
  }

  uint64_t buffer[2];
  // 3 byte sequences are the next most common, as seen in CJK, which has long
  // sequences of these.
  if (input_utf8_end_of_code_point_mask == 0x924) {
    // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte
    // UTF-16 code units.
    __m128i composed = convert_utf8_3_byte_to_utf16(in);
    // Byte swap if necessary
    if (!match_system(big_endian)) {
      composed = lsx_swap_bytes(composed);
    }

    __lsx_vst(composed, reinterpret_cast<uint16_t *>(utf16_output), 0);
    utf16_output += 4; // We wrote 4 16-bit characters.
    return 12;         // We consumed 12 bytes.
  }

  // 2 byte sequences occur in short bursts in languages like Greek and
  // Russian.
  if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xAAAA) {
    // We want to take 6 2-byte UTF-8 code units and turn them into 6 2-byte
    // UTF-16 code units.
    __m128i composed = convert_utf8_2_byte_to_utf16(in);
    // Byte swap if necessary
    if (!match_system(big_endian)) {
      composed = lsx_swap_bytes(composed);
    }

    __lsx_vst(composed, reinterpret_cast<uint16_t *>(utf16_output), 0);
    utf16_output += 6; // We wrote 6 16-bit characters.
    return 12;         // We consumed 12 bytes.
  }

  /// We do not have a fast path available, or the fast path is unimportant,
  /// so we fall back.
  const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
      [input_utf8_end_of_code_point_mask][0];

  const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
      [input_utf8_end_of_code_point_mask][1];
  const __m128i zero = __lsx_vldi(0);
  if (idx < 64) {
    // SIX (6) input code units
    // Convert to UTF-16
    __m128i composed = convert_utf8_1_to_2_byte_to_utf16(in, idx);
    // Byte swap if necessary
    if (!match_system(big_endian)) {
      composed = lsx_swap_bytes(composed);
    }
    // Store
    __lsx_vst(composed, reinterpret_cast<uint16_t *>(utf16_output), 0);
    utf16_output += 6; // We wrote 6 16-bit characters.
    return consumed;
  } else if (idx < 145) {
    // FOUR (4) input code units
    // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the
    // narrowing.
    __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
                               simdutf::tables::utf8_to_utf16::shufutf8[idx]),
                           0);
    // XXX: depending on the system scalar instructions might be faster.
    // 1 byte: 00000000 00000000 0ccccccc
    // 2 byte: 00000000 110bbbbb 10cccccc
    // 3 byte: 1110aaaa 10bbbbbb 10cccccc
    sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
    __m128i perm = __lsx_vshuf_b(zero, in, sh);
    // 1 byte: 00000000 0ccccccc
    // 2 byte: xx0bbbbb x0cccccc
    // 3 byte: xxbbbbbb x0cccccc
    __m128i lowperm = __lsx_vpickev_h(perm, perm);
    // 1 byte: 00000000 00000000
    // 2 byte: 00000000 00000000
    // 3 byte: 00000000 1110aaaa
    __m128i highperm = __lsx_vpickod_h(perm, perm);
    // 3 byte: aaaa0000 00000000
    highperm = __lsx_vslli_h(highperm, 12);
    // ASCII
    // 1 byte: 00000000 0ccccccc
    // 2+byte: 00000000 00cccccc
    __m128i ascii = __lsx_vand_v(lowperm, __lsx_vrepli_h(0x7f));
    // 1 byte: 00000000 00000000
    // 2 byte: xx0bbbbb 00000000
    // 3 byte: xxbbbbbb 00000000
    __m128i middlebyte = __lsx_vand_v(lowperm, lsx_splat_u16(0xFF00));
    // 1 byte: 00000000 0ccccccc
    // 2 byte: 0010bbbb bbcccccc
    // 3 byte: 0010bbbb bbcccccc
    __m128i composed = __lsx_vor_v(__lsx_vsrli_h(middlebyte, 2), ascii);

    __m128i v0fff = __lsx_vreplgr2vr_h(uint16_t(0xfff));
    // aaaabbbb bbcccccc
    composed = __lsx_vbitsel_v(highperm, composed, v0fff);

    if (!match_system(big_endian)) {
      composed = lsx_swap_bytes(composed);
    }

    __lsx_vst(composed, reinterpret_cast<uint16_t *>(utf16_output), 0);
    utf16_output += 4; // We wrote 4 16-bit code units.
    return consumed;
  } else if (idx < 209) {
    // THREE (3) input code units
    if (input_utf8_end_of_code_point_mask == 0x888) {
      // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte
      // UTF-16 pairs. Generating surrogate pairs is a little tricky though,
      // but it is easier when we can assume they are all pairs. This version
      // does not use the LUT, but 4 byte sequences are less common and the
      // overhead of the extra memory access is less important than the early
      // branch overhead in shorter sequences.

      __m128i expected_mask =
          (__m128i)v16u8{0xf8, 0xc0, 0xc0, 0xc0, 0xf8, 0xc0, 0xc0, 0xc0,
                         0xf8, 0xc0, 0xc0, 0xc0, 0x0,  0x0,  0x0,  0x0};
      __m128i expected =
          (__m128i)v16u8{0xf0, 0x80, 0x80, 0x80, 0xf0, 0x80, 0x80, 0x80,
                         0xf0, 0x80, 0x80, 0x80, 0x0,  0x0,  0x0,  0x0};
      __m128i check = __lsx_vseq_b(__lsx_vand_v(in, expected_mask), expected);
      if (__lsx_bz_b(check))
        return 12;
      // Swap byte pairs
      // 10dddddd 10cccccc|10bbbbbb 11110aaa
      // 10cccccc 10dddddd|11110aaa 10bbbbbb
      __m128i swap = lsx_swap_bytes(in);
      // Shift left 2 bits
      // cccccc00 dddddd00 xxxxxxxx bbbbbb00
      __m128i shift = __lsx_vslli_b(swap, 2);
      // Create a magic number containing the low 2 bits of the trail
      // surrogate and all the corrections needed to create the pair:
      // UTF-8 4b prefix  = -0x0000|0xF000
      // surrogate offset = -0x0000|0x0040 (0x10000 << 6)
      // surrogate high   = +0x0000|0xD800
      // surrogate low    = +0xDC00|0x0000
      // -------------------------------
      //                  = +0xDC00|0xE7C0
      __m128i magic = __lsx_vreplgr2vr_w(uint32_t(0xDC00E7C0));
      // Generate unadjusted trail surrogate minus lowest 2 bits
      // xxxxxxxx xxxxxxxx|11110aaa bbbbbb00
      __m128i trail = __lsx_vbitsel_v(shift, swap, lsx_splat_u32(0x0000ff00));
      // Insert low 2 bits of trail surrogate to magic number for later
      // 11011100 00000000 11100111 110000cc
      __m128i magic_with_low_2 = __lsx_vor_v(__lsx_vsrli_w(shift, 30), magic);

      // Generate lead surrogate
      // xxxxcccc ccdddddd|xxxxxxxx xxxxxxxx
      // 000000cc ccdddddd|xxxxxxxx xxxxxxxx
      __m128i lead = __lsx_vbitsel_v(
          __lsx_vsrli_h(__lsx_vand_v(shift, __lsx_vldi(0x3F)), 4), swap,
          __lsx_vrepli_h(0x3f /* 0x003f */));

      // Blend pairs
      // 000000cc ccdddddd|11110aaa bbbbbb00
      __m128i blend = __lsx_vbitsel_v(lead, trail, lsx_splat_u32(0x0000FFFF));

      // Add magic number to finish the result
      // 110111CC CCDDDDDD|110110AA BBBBBBCC
      __m128i composed = __lsx_vadd_h(blend, magic_with_low_2);
      // Byte swap if necessary
      if (!match_system(big_endian)) {
        composed = lsx_swap_bytes(composed);
      }
      __lsx_vst(composed, reinterpret_cast<uint16_t *>(buffer), 0);
      std::memcpy(utf16_output, buffer, 12);
      utf16_output += 6; // We wrote 3 surrogate pairs (6 16-bit code units).
      return 12;         // We consumed 12 bytes.
    }
    // 3 1-4 byte sequences
    __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
                               simdutf::tables::utf8_to_utf16::shufutf8[idx]),
                           0);
    // 1 byte: 00000000 00000000 00000000 0ddddddd
    // 2 byte: 00000000 00000000 110ccccc 10dddddd
    // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
    // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
    sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
    __m128i perm = __lsx_vshuf_b(zero, in, sh);
    // added to fix issue https://github.com/simdutf/simdutf/issues/514
    // We only want to write 2 * 16-bit code units when that is actually what
    // we have. Unfortunately, we cannot trust the input. So it is possible to
    // get 0xff as an input byte and it should not result in a surrogate pair.
    // We need to check for that.
    uint32_t permbuffer[4];
    __lsx_vst(perm, permbuffer, 0);
    // Mask the low and middle bytes
    // 00000000 00000000 00000000 0ddddddd
    __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7f));
    // Because the surrogates need more work, the high surrogate is computed
    // first.
    __m128i middlehigh = __lsx_vslli_w(perm, 2);
    // 00000000 00000000 00cccccc 00000000
    __m128i middlebyte = __lsx_vand_v(perm, lsx_splat_u32(0x00003F00));
    // Start assembling the sequence. Since the 4th byte is in the same
    // position as it would be in a surrogate and there is no dependency,
    // shift left instead of right.
    // 3 byte: 00000000 10bbbbxx xxxxxxxx xxxxxxxx
    // 4 byte: 11110aaa bbbbbbxx xxxxxxxx xxxxxxxx
    __m128i ab = __lsx_vbitsel_v(middlehigh, perm, lsx_splat_u32(0xFF000000));
    // Top 16 bits contains the high ten bits of the surrogate pair before
    // correction.
    // 3 byte: 00000000 10bbbbcc|cccc0000 00000000
    // 4 byte: 11110aaa bbbbbbcc|cccc0000 00000000 - high 10 bits correct w/o
    //         correction
    __m128i v_fffc0000 = __lsx_vreplgr2vr_w(uint32_t(0xFFFC0000));
    __m128i abc = __lsx_vbitsel_v(__lsx_vslli_w(middlebyte, 4), ab, v_fffc0000);
    // Combine the low 6 or 7 bits by a shift right accumulate
    // 3 byte: 00000000 00000010|bbbbcccc ccdddddd - low 16 bits correct
    // 4 byte: 00000011 110aaabb|bbbbcccc ccdddddd - low 10 bits correct w/o
    //         correction
    __m128i composed = __lsx_vor_v(ascii, __lsx_vsrli_w(abc, 6));
    // After this is for surrogates
    // Blend the low and high surrogates
    // 4 byte: 11110aaa bbbbbbcc|bbbbcccc ccdddddd
    __m128i mixed = __lsx_vbitsel_v(abc, composed, lsx_splat_u32(0x0000FFFF));
    // Clear the upper 6 bits of the low surrogate. Don't clear the upper bits
    // yet as 0x10000 was not subtracted from the codepoint yet.
    // 4 byte: 11110aaa bbbbbbcc|000000cc ccdddddd
    __m128i v_ffff03ff = __lsx_vreplgr2vr_w(uint32_t(0xFFFF03FF));
    __m128i masked_pair = __lsx_vand_v(mixed, v_ffff03ff);
    // Correct the remaining UTF-8 prefix, surrogate offset, and add the
    // surrogate prefixes in one magic 16-bit addition. Similar magic number
    // but without the continuation byte adjust and halfword swapped:
    // UTF-8 4b prefix  = -0xF000|0x0000
    // surrogate offset = -0x0040|0x0000 (0x10000 << 6)
    // surrogate high   = +0xD800|0x0000
    // surrogate low    = +0x0000|0xDC00
    // -----------------------------------
    //                  = +0xE7C0|0xDC00
    __m128i magic = __lsx_vreplgr2vr_w(uint32_t(0xE7C0DC00));
    // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD - surrogate pair complete
    __m128i surrogates = __lsx_vadd_w(masked_pair, magic);
    // If the high bit is 1 (s32 less than zero), this needs a surrogate pair
    __m128i is_pair = __lsx_vslt_w(perm, zero);
    // Select either the 4 byte surrogate pair or the 2 byte solo codepoint
    // 3 byte: 0xxxxxxx xxxxxxxx|bbbbcccc ccdddddd
    // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD
    __m128i selected = __lsx_vbitsel_v(composed, surrogates, is_pair);
    // Byte swap if necessary
    if (!match_system(big_endian)) {
      selected = lsx_swap_bytes(selected);
    }
    // Attempting to shuffle and store would be complex, just scalarize.
    uint32_t buffer_tmp[4];
    __lsx_vst(selected, buffer_tmp, 0);
    // We used to test the top bit of the surrogate mask; removed due to
    // issue 514.
    // const uint32_t SURROGATE_MASK = match_system(big_endian) ? 0x80000000 :
    // 0x00800000;
    for (size_t i = 0; i < 3; i++) {
      // Surrogate
      // Used to be if (buffer[i] & SURROGATE_MASK) {
      // See discussion above.
      // patch for issue https://github.com/simdutf/simdutf/issues/514
      if ((permbuffer[i] & 0xf8000000) == 0xf0000000) {
        utf16_output[0] = uint16_t(buffer_tmp[i] >> 16);
        utf16_output[1] = uint16_t(buffer_tmp[i] & 0xFFFF);
        utf16_output += 2;
      } else {
        utf16_output[0] = uint16_t(buffer_tmp[i] & 0xFFFF);
        utf16_output++;
      }
    }
    return consumed;
  } else {
    // here we know that there is an error but we do not handle errors
    return 12;
  }
}
/* end file src/lsx/lsx_convert_utf8_to_utf16.cpp */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
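
// Protocol note: convert_masked_utf8_to_utf16 is the inner step of the
// streaming UTF-8 decoder. The caller classifies a block of input and passes
// utf8_end_of_code_point_mask, where bit i is set when input[i] is the last
// byte of a code point; the low 12 bits select a shuffle from utf8bigindex.
// The function returns how many input bytes it consumed so the caller can
// advance. A sketch of the driving loop (mask_for is a hypothetical stand-in
// for the caller's classification step):
//
//   size_t pos = 0;
//   while (pos + 16 <= len) {
//     pos += convert_masked_utf8_to_utf16<endianness::LITTLE>(
//         input + pos, mask_for(input + pos), out);
//   }
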
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
|
|
/* begin file src/lsx/lsx_convert_utf8_to_utf32.cpp */
|
|
// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
|
|
// end of the code points. Only the least significant 12 bits of the mask
|
|
// are accessed.
|
|
// It returns how many bytes were consumed (up to 12).
|
|
size_t convert_masked_utf8_to_utf32(const char *input,
|
|
uint64_t utf8_end_of_code_point_mask,
|
|
char32_t *&utf32_out) {
|
|
// we use an approach where we try to process up to 12 input bytes.
|
|
// Why 12 input bytes and not 16? Because we are concerned with the size of
|
|
// the lookup tables. Also 12 is nicely divisible by two and three.
|
|
//
|
|
uint32_t *&utf32_output = reinterpret_cast<uint32_t *&>(utf32_out);
|
|
__m128i in = __lsx_vld(reinterpret_cast<const uint8_t *>(input), 0);
|
|
const uint16_t input_utf8_end_of_code_point_mask =
|
|
utf8_end_of_code_point_mask & 0xFFF;
|
|
//
|
|
// Optimization note: our main path below is load-latency dependent. Thus it
|
|
// may be beneficial to have fast paths that depend on branch prediction but
|
|
// have less latency. This results in more instructions but, potentially, also
|
|
// higher speeds.
|
|
//
|
|
// We first try a few fast paths.
|
|
if ((utf8_end_of_code_point_mask & 0xffff) == 0xffff) {
|
|
// We process in chunks of 16 bytes.
|
|
// use the fast implementation in simd.h
|
|
// Ideally the compiler can keep the tables in registers.
|
|
simd8<int8_t> temp{in};
|
|
temp.store_ascii_as_utf32_tbl(utf32_out);
|
|
utf32_output += 16; // We wrote 16 32-bit characters.
|
|
return 16; // We consumed 16 bytes.
|
|
}
|
|
__m128i zero = __lsx_vldi(0);
|
|
if (input_utf8_end_of_code_point_mask == 0x924) {
|
|
// We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte
|
|
// UTF-32 code units. Convert to UTF-16
|
|
__m128i composed_utf16 = convert_utf8_3_byte_to_utf16(in);
|
|
__m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16);
|
|
|
|
__lsx_vst(utf32_low, reinterpret_cast<uint32_t *>(utf32_output), 0);
|
|
utf32_output += 4; // We wrote 4 32-bit characters.
|
|
return 12; // We consumed 12 bytes.
|
|
}
|
|
// 2 byte sequences occur in short bursts in languages like Greek and Russian.
|
|
if (input_utf8_end_of_code_point_mask == 0xaaa) {
|
|
// We want to take 6 2-byte UTF-8 code units and turn them into 6 4-byte
|
|
// UTF-32 code units. Convert to UTF-16
|
|
__m128i composed_utf16 = convert_utf8_2_byte_to_utf16(in);
|
|
|
|
__m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16);
|
|
__m128i utf32_high = __lsx_vilvh_h(zero, composed_utf16);
|
|
|
|
__lsx_vst(utf32_low, reinterpret_cast<uint32_t *>(utf32_output), 0);
|
|
__lsx_vst(utf32_high, reinterpret_cast<uint32_t *>(utf32_output), 16);
|
|
utf32_output += 6;
|
|
return 12; // We consumed 12 bytes.
|
|
}
|
|
/// Either no fast path or an unimportant fast path.
|
|
|
|
const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
|
|
[input_utf8_end_of_code_point_mask][0];
|
|
const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
|
|
[input_utf8_end_of_code_point_mask][1];
|
|
|
|
if (idx < 64) {
|
|
// SIX (6) input code units
|
|
// Convert to UTF-16
|
|
__m128i composed_utf16 = convert_utf8_1_to_2_byte_to_utf16(in, idx);
|
|
__m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16);
|
|
__m128i utf32_high = __lsx_vilvh_h(zero, composed_utf16);
|
|
|
|
__lsx_vst(utf32_low, reinterpret_cast<uint32_t *>(utf32_output), 0);
|
|
__lsx_vst(utf32_high, reinterpret_cast<uint32_t *>(utf32_output), 16);
|
|
utf32_output += 6;
|
|
return consumed;
|
|
} else if (idx < 145) {
|
|
// FOUR (4) input code units
|
|
// UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing.
|
|
__m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
|
|
simdutf::tables::utf8_to_utf16::shufutf8[idx]),
|
|
0);
|
|
// Shuffle
|
|
// 1 byte: 00000000 00000000 0ccccccc
|
|
// 2 byte: 00000000 110bbbbb 10cccccc
|
|
// 3 byte: 1110aaaa 10bbbbbb 10cccccc
|
|
sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
|
|
__m128i perm = __lsx_vshuf_b(zero, in, sh);
|
|
// Split
|
|
// 00000000 00000000 0ccccccc
|
|
__m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7F)); // 6 or 7 bits
|
|
// Note: unmasked
|
|
// xxxxxxxx aaaaxxxx xxxxxxxx
|
|
__m128i high =
|
|
__lsx_vsrli_w(__lsx_vand_v(perm, __lsx_vldi(0xf)), 4); // 4 bits
|
|
// Mask out the middle byte; the top bits will be corrected later in the
// bitwise select.
// 00000000 10bbbbbb 00000000
__m128i middle =
|
|
__lsx_vand_v(perm, lsx_splat_u32(0x0000FF00)); // 5 or 6 bits
|
|
// Combine low and middle with shift right accumulate
|
|
// 00000000 00xxbbbb bbcccccc
|
|
__m128i lowmid = __lsx_vor_v(ascii, __lsx_vsrli_w(middle, 2));
|
|
// Insert top 4 bits from high byte with bitwise select
|
|
// 00000000 aaaabbbb bbcccccc
|
|
__m128i composed = __lsx_vbitsel_v(lowmid, high, lsx_splat_u32(0x0000F000));
|
|
__lsx_vst(composed, utf32_output, 0);
|
|
utf32_output += 4; // We wrote 4 32-bit characters.
|
|
return consumed;
|
|
} else if (idx < 209) {
|
|
// THREE (3) input code units
|
|
if (input_utf8_end_of_code_point_mask == 0x888) {
|
|
// We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte
|
|
// UTF-32 code units. This uses the same method as the fixed 3 byte
|
|
// version, reversing and shift left insert. However, there is no need for
|
|
// a shuffle mask now, just rev16 and rev32.
|
|
//
|
|
// This version does not use the LUT, but 4 byte sequences are less common
|
|
// and the overhead of the extra memory access is less important than the
|
|
// early branch overhead in shorter sequences, so it comes last.
|
|
|
|
// Swap pairs of bytes
|
|
// 10dddddd|10cccccc|10bbbbbb|11110aaa
|
|
// 10cccccc 10dddddd|11110aaa 10bbbbbb
|
|
__m128i swap = lsx_swap_bytes(in);
|
|
// Shift left and insert
|
|
// xxxxcccc ccdddddd|xxxxxxxa aabbbbbb
|
|
__m128i merge1 = __lsx_vbitsel_v(__lsx_vsrli_h(swap, 2), swap,
|
|
__lsx_vrepli_h(0x3f /*0x003F*/));
|
|
// Shift insert again
|
|
// xxxxxxxx xxxaaabb bbbbcccc ccdddddd
|
|
__m128i merge2 =
|
|
__lsx_vbitsel_v(__lsx_vslli_w(merge1, 12), /* merge1 << 12 */
|
|
__lsx_vsrli_w(merge1, 16), /* merge1 >> 16 */
|
|
lsx_splat_u32(0x00000FFF));
|
|
// Clear the garbage
|
|
// 00000000 000aaabb bbbbcccc ccdddddd
|
|
__m128i composed = __lsx_vand_v(merge2, lsx_splat_u32(0x1FFFFF));
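// Added check: e.g. the 4-byte sequence F0 9F 98 80 composes to 0x0001F600,
// i.e. code point U+1F600.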
|
|
// Store
|
|
__lsx_vst(composed, utf32_output, 0);
|
|
utf32_output += 3; // We wrote 3 32-bit characters.
|
|
return 12; // We consumed 12 bytes.
|
|
}
|
|
// Unlike UTF-16, a fast codepath brings much less benefit here,
// since surrogates are no longer involved.
|
|
__m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
|
|
simdutf::tables::utf8_to_utf16::shufutf8[idx]),
|
|
0);
|
|
// 1 byte: 00000000 00000000 00000000 0ddddddd
|
|
// 2 byte: 00000000 00000000 110ccccc 10dddddd
|
|
// 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
|
|
// 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
|
|
sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
|
|
__m128i perm = __lsx_vshuf_b(zero, in, sh);
|
|
|
|
// Ascii
|
|
__m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7F));
|
|
__m128i middle = __lsx_vand_v(perm, lsx_splat_u32(0x00003f00));
|
|
// 00000000 00000000 0000cccc ccdddddd
|
|
__m128i cd = __lsx_vor_v(__lsx_vsrli_w(middle, 2), ascii);
|
|
|
|
__m128i correction = __lsx_vand_v(perm, lsx_splat_u32(0x00400000));
|
|
__m128i corrected = __lsx_vadd_b(perm, __lsx_vsrli_w(correction, 1));
|
|
// Insert twice
|
|
// 00000000 000aaabb bbbbxxxx xxxxxxxx
|
|
__m128i corrected_srli2 =
|
|
__lsx_vsrli_w(__lsx_vand_v(corrected, __lsx_vrepli_b(0x7)), 2);
|
|
__m128i ab =
|
|
__lsx_vbitsel_v(corrected_srli2, corrected, __lsx_vrepli_h(0x3f));
|
|
ab = __lsx_vsrli_w(ab, 4);
|
|
// 00000000 000aaabb bbbbcccc ccdddddd
|
|
__m128i composed = __lsx_vbitsel_v(ab, cd, lsx_splat_u32(0x00000FFF));
|
|
// Store
|
|
__lsx_vst(composed, utf32_output, 0);
|
|
utf32_output += 3; // We wrote 3 32-bit characters.
|
|
return consumed;
|
|
} else {
|
|
// here we know that there is an error but we do not handle errors
|
|
return 12;
|
|
}
|
|
}
|
|
/* end file src/lsx/lsx_convert_utf8_to_utf32.cpp */
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
|
|
/* begin file src/lsx/lsx_convert_utf8_to_latin1.cpp */
|
|
size_t convert_masked_utf8_to_latin1(const char *input,
|
|
uint64_t utf8_end_of_code_point_mask,
|
|
char *&latin1_output) {
|
|
// we use an approach where we try to process up to 12 input bytes.
|
|
// Why 12 input bytes and not 16? Because we are concerned with the size of
|
|
// the lookup tables. Also 12 is nicely divisible by two and three.
|
|
//
|
|
__m128i in = __lsx_vld(reinterpret_cast<const uint8_t *>(input), 0);
|
|
|
|
const uint16_t input_utf8_end_of_code_point_mask =
|
|
utf8_end_of_code_point_mask & 0xfff;
|
|
// Optimization note: our main path below is load-latency dependent. Thus it
|
|
// may be beneficial to have fast paths that depend on branch prediction but
|
|
// have less latency. This results in more instructions but, potentially, also
|
|
// higher speeds.
|
|
|
|
// We first try a few fast paths.
|
|
// The obvious first test is ASCII, which actually consumes the full 16.
|
|
if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xFFFF) {
|
|
// We process in chunks of 16 bytes
|
|
__lsx_vst(in, reinterpret_cast<uint8_t *>(latin1_output), 0);
|
|
latin1_output += 16; // We wrote 16 8-bit characters.
|
|
return 16; // We consumed 16 bytes.
|
|
}
|
|
/// We do not have a fast path available, or the fast path is unimportant, so
/// we fall back.
|
|
const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
|
|
[input_utf8_end_of_code_point_mask][0];
|
|
|
|
const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
|
|
[input_utf8_end_of_code_point_mask][1];
|
|
// this indicates an invalid input:
|
|
if (idx >= 64) {
|
|
return consumed;
|
|
}
|
|
// Here we should have (idx < 64); if not, there is a bug in the validation
// or elsewhere. This is a relatively easy scenario: we process SIX (6)
// input code units, converting 6 one- or two-byte UTF-8 characters to 6
// Latin-1 bytes. The maximum length in bytes of six code units spanning
// between 1 and 2 bytes each is 12 bytes.
|
|
__m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
|
|
simdutf::tables::utf8_to_utf16::shufutf8[idx]),
|
|
0);
|
|
// Shuffle
|
|
// 1 byte: 00000000 0bbbbbbb
|
|
// 2 byte: 110aaaaa 10bbbbbb
|
|
sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
|
|
__m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, sh);
|
|
// ascii mask
|
|
// 1 byte: 11111111 11111111
|
|
// 2 byte: 00000000 00000000
|
|
__m128i ascii_mask = __lsx_vslt_bu(perm, __lsx_vldi(0x80));
|
|
// utf8 mask
|
|
// 1 byte: 00000000 00000000
|
|
// 2 byte: 00111111 00111111
|
|
__m128i utf8_mask = __lsx_vand_v(__lsx_vsle_bu(__lsx_vldi(0x80), perm),
|
|
__lsx_vldi(0b00111111));
|
|
// mask
|
|
// 1 byte: 11111111 11111111
|
|
// 2 byte: 00111111 00111111
|
|
__m128i mask = __lsx_vor_v(utf8_mask, ascii_mask);
|
|
|
|
__m128i composed = __lsx_vbitsel_v(__lsx_vsrli_h(perm, 2), perm, mask);
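// Added worked example: for the 2-byte sequence C3 A9 (U+00E9) the lane
// holds 0xC3A9; the select of (0xC3A9 >> 2) with mask 0x3F3F gives 0x03E9,
// and the pack below keeps the low byte 0xE9, the Latin-1 value.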
|
|
// writing 8 bytes even though we only care about the first 6 bytes.
|
|
__m128i latin1_packed = __lsx_vpickev_b(__lsx_vldi(0), composed);
|
|
|
|
uint64_t buffer[2];
|
|
// __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
|
|
__lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(buffer), 0);
|
|
std::memcpy(latin1_output, buffer, 6);
|
|
latin1_output += 6; // We wrote 6 bytes.
|
|
return consumed;
|
|
}
|
|
/* end file src/lsx/lsx_convert_utf8_to_latin1.cpp */
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
|
|
|
|
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
|
|
/* begin file src/lsx/lsx_convert_utf16_to_latin1.cpp */
|
|
template <endianness big_endian>
|
|
std::pair<const char16_t *, char *>
|
|
lsx_convert_utf16_to_latin1(const char16_t *buf, size_t len,
|
|
char *latin1_output) {
|
|
const char16_t *end = buf + len;
|
|
while (end - buf >= 16) {
|
|
__m128i in = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 0);
|
|
__m128i in1 = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 16);
|
|
if (!match_system(big_endian)) {
|
|
in = lsx_swap_bytes(in);
|
|
in1 = lsx_swap_bytes(in1);
|
|
}
|
|
if (__lsx_bz_v(__lsx_vpickod_b(in1, in))) {
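// Added note: __lsx_vpickod_b gathered the odd-indexed (high) bytes of the
// 16-bit code units; all zero means every unit is at most 0xFF and fits in
// Latin-1.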
|
|
// 1. pack the bytes
|
|
__m128i latin1_packed = __lsx_vpickev_b(in1, in);
|
|
// 2. store (16 bytes)
|
|
__lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
|
|
// 3. adjust pointers
|
|
buf += 16;
|
|
latin1_output += 16;
|
|
} else {
|
|
return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output));
|
|
}
|
|
} // while
|
|
return std::make_pair(buf, latin1_output);
|
|
}
|
|
|
|
template <endianness big_endian>
|
|
std::pair<result, char *>
|
|
lsx_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
|
|
char *latin1_output) {
|
|
const char16_t *start = buf;
|
|
const char16_t *end = buf + len;
|
|
while (end - buf >= 16) {
|
|
__m128i in = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 0);
|
|
__m128i in1 = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 16);
|
|
if (!match_system(big_endian)) {
|
|
in = lsx_swap_bytes(in);
|
|
in1 = lsx_swap_bytes(in1);
|
|
}
|
|
if (__lsx_bz_v(__lsx_vpickod_b(in1, in))) {
|
|
// 1. pack the bytes
|
|
__m128i latin1_packed = __lsx_vpickev_b(in1, in);
|
|
// 2. store (16 bytes)
|
|
__lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
|
|
// 3. adjust pointers
|
|
buf += 16;
|
|
latin1_output += 16;
|
|
} else {
|
|
// Let us do a scalar fallback.
|
|
for (int k = 0; k < 16; k++) {
|
|
uint16_t word =
|
|
!match_system(big_endian) ? scalar::u16_swap_bytes(buf[k]) : buf[k];
|
|
if (word <= 0xff) {
|
|
*latin1_output++ = char(word);
|
|
} else {
|
|
return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
|
|
latin1_output);
|
|
}
|
|
}
|
|
}
|
|
} // while
|
|
return std::make_pair(result(error_code::SUCCESS, buf - start),
|
|
latin1_output);
|
|
}
|
|
/* end file src/lsx/lsx_convert_utf16_to_latin1.cpp */
|
|
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
|
|
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF8
|
|
/* begin file src/lsx/lsx_convert_utf16_to_utf8.cpp */
|
|
/*
|
|
The vectorized algorithm works on a single 128-bit register, i.e., it
loads eight 16-bit code units.
|
|
|
|
We consider three cases:
|
|
1. an input register contains no surrogates and each value
|
|
is in range 0x0000 .. 0x07ff.
|
|
2. an input register contains no surrogates and each value
is in range 0x0000 .. 0xffff.
|
|
3. an input register contains surrogates --- i.e. codepoints
|
|
can have 16 or 32 bits.
|
|
|
|
Ad 1.
|
|
|
|
When values are less than 0x0800, it means that a 16-bit code unit
|
|
can be converted into: 1) single UTF8 byte (when it's an ASCII
|
|
char) or 2) two UTF8 bytes.
|
|
|
|
For this case we do only some shuffle to obtain these 2-byte
|
|
codes and finally compress the whole 128-bit register with a single
|
|
shuffle.
|
|
|
|
We need a 256-entry lookup table to get a compression pattern
|
|
and the number of output bytes in the compressed vector register.
|
|
Each entry occupies 17 bytes.
|
|
|
|
Ad 2.
|
|
|
|
When values fit in 16-bit code units, but are above 0x07ff, then
|
|
a single word may produce one, two or three UTF8 bytes.
|
|
|
|
We prepare data for all these three cases in two registers.
|
|
The first register contains lower two UTF8 bytes (used in all
|
|
cases), while the second one contains just the third byte for
|
|
the three-UTF8-bytes case.
|
|
|
|
Finally these two registers are interleaved forming eight-element
|
|
array of 32-bit values. The array spans two 128-bit registers.
|
|
The bytes from the registers are compressed using two shuffles.
|
|
|
|
We need a 256-entry lookup table to get a compression pattern
|
|
and the number of output bytes in the compressed vector register.
|
|
Each entry occupies 17 bytes.
|
|
|
|
|
|
To summarize:
|
|
- We need two 256-entry tables that have 8704 bytes in total.
|
|
*/
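/*
Added illustrative sketch (not in the original source): the scalar
equivalent of the three layouts handled below, for a 16-bit unit w
outside the surrogate range:

  if (w < 0x80)       emit w;                           // one byte
  else if (w < 0x800) emit 0xC0 | (w >> 6),
                           0x80 | (w & 0x3F);           // two bytes
  else                emit 0xE0 | (w >> 12),
                           0x80 | ((w >> 6) & 0x3F),
                           0x80 | (w & 0x3F);           // three bytes
*/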
|
|
/*
|
|
Returns a pair: the first unprocessed byte from buf and utf8_output.
A scalar routine should carry on the conversion of the tail.
*/
|
|
template <endianness big_endian>
|
|
std::pair<const char16_t *, char *>
|
|
lsx_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) {
|
|
uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
|
|
const char16_t *end = buf + len;
|
|
|
|
const size_t safety_margin =
|
|
12; // to avoid overruns, see issue
|
|
// https://github.com/simdutf/simdutf/issues/92
|
|
|
|
__m128i v_07ff = __lsx_vreplgr2vr_h(uint16_t(0x7ff));
|
|
while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
|
|
__m128i in = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 0);
|
|
if (!match_system(big_endian)) {
|
|
in = lsx_swap_bytes(in);
|
|
}
|
|
if (__lsx_bz_v(
|
|
__lsx_vslt_hu(__lsx_vrepli_h(0x7F), in))) { // ASCII fast path!!!!
|
|
// It is common enough that we have sequences of 16 consecutive ASCII
|
|
// characters.
|
|
__m128i nextin = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 16);
|
|
if (!match_system(big_endian)) {
|
|
nextin = lsx_swap_bytes(nextin);
|
|
}
|
|
if (__lsx_bz_v(__lsx_vslt_hu(__lsx_vrepli_h(0x7F), nextin))) {
|
|
// 1. pack the bytes
|
|
// obviously suboptimal.
|
|
__m128i utf8_packed = __lsx_vpickev_b(nextin, in);
|
|
// 2. store (16 bytes)
|
|
__lsx_vst(utf8_packed, utf8_output, 0);
|
|
// 3. adjust pointers
|
|
buf += 16;
|
|
utf8_output += 16;
|
|
continue; // we are done for this round!
|
|
} else {
|
|
// 1. pack the bytes
|
|
// obviously suboptimal.
|
|
__m128i utf8_packed = __lsx_vpickev_b(in, in);
|
|
// 2. store (8 bytes)
|
|
__lsx_vst(utf8_packed, utf8_output, 0);
|
|
// 3. adjust pointers
|
|
buf += 8;
|
|
utf8_output += 8;
|
|
in = nextin;
|
|
}
|
|
}
|
|
|
|
__m128i zero = __lsx_vldi(0);
|
|
if (__lsx_bz_v(__lsx_vslt_hu(v_07ff, in))) {
|
|
// 1. prepare 2-byte values
|
|
// input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
|
|
// expected output : [110a|aaaa|10bb|bbbb] x 8
|
|
// t0 = [000a|aaaa|bbbb|bb00]
|
|
__m128i t0 = __lsx_vslli_h(in, 2);
|
|
// t1 = [000a|aaaa|0000|0000]
|
|
__m128i t1 = __lsx_vand_v(t0, lsx_splat_u16(0x1f00));
|
|
// t2 = [0000|0000|00bb|bbbb]
|
|
__m128i t2 = __lsx_vand_v(in, __lsx_vrepli_h(0x3f));
|
|
// t3 = [000a|aaaa|00bb|bbbb]
|
|
__m128i t3 = __lsx_vor_v(t1, t2);
|
|
// t4 = [110a|aaaa|10bb|bbbb]
|
|
__m128i v_c080 = __lsx_vreplgr2vr_h(uint16_t(0xc080));
|
|
__m128i t4 = __lsx_vor_v(t3, v_c080);
|
|
// 2. merge ASCII and 2-byte codewords
|
|
__m128i one_byte_bytemask =
|
|
__lsx_vsle_hu(in, __lsx_vrepli_h(0x7F /*0x007F*/));
|
|
__m128i utf8_unpacked = __lsx_vbitsel_v(t4, in, one_byte_bytemask);
|
|
// 3. prepare bitmask for 8-bit lookup
|
|
uint32_t m2 = __lsx_vpickve2gr_bu(__lsx_vmskltz_h(one_byte_bytemask), 0);
|
|
// 4. pack the bytes
|
|
const uint8_t *row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
|
|
[lsx_1_2_utf8_bytes_mask[m2]][0];
|
|
__m128i shuffle = __lsx_vld(row, 1);
|
|
__m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle);
|
|
// 5. store bytes
|
|
__lsx_vst(utf8_packed, utf8_output, 0);
|
|
// 6. adjust pointers
|
|
buf += 8;
|
|
utf8_output += row[0];
|
|
continue;
|
|
}
|
|
__m128i surrogates_bytemask = __lsx_vseq_h(
|
|
__lsx_vand_v(in, lsx_splat_u16(0xf800)), lsx_splat_u16(0xd800));
|
|
// It might seem like checking for surrogates_bytemask == 0xc000 could help.
|
|
// However, it is likely an uncommon occurrence.
|
|
if (__lsx_bz_v(surrogates_bytemask)) {
|
|
// case: code units from register produce either 1, 2 or 3 UTF-8 bytes
|
|
/* In this branch we handle three cases:
|
|
1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
|
|
single UTF-8 byte
|
|
2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] -
|
|
two UTF-8 bytes
|
|
3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
|
|
three UTF-8 bytes
|
|
|
|
We expand the input word (16-bit) into two code units (32-bit), thus
|
|
we have room for four bytes. However, we need five distinct bit
|
|
layouts. Note that the last byte in cases #2 and #3 is the same.
|
|
|
|
We precompute byte 1 for case #1 and the common byte for cases #2 & #3
|
|
in register t2.
|
|
|
|
We precompute byte 1 for case #3 and -- **conditionally** --
|
|
precompute either byte 1 for case #2 or byte 2 for case #3. Note that
|
|
they differ by exactly one bit.
|
|
|
|
Finally from these two code units we build proper UTF-8 sequence,
|
|
taking into account the case (i.e., the number of bytes to write).
|
|
*/
|
|
/**
|
|
* Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
|
|
* t2 => [0ccc|cccc] [10cc|cccc]
|
|
* s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
|
|
*/
|
|
// [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
|
|
__m128i t0 = __lsx_vpickev_b(in, in);
|
|
t0 = __lsx_vilvl_b(t0, t0);
|
|
|
|
// [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
|
|
__m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F));
|
|
__m128i t1 = __lsx_vand_v(t0, v_3f7f);
|
|
// [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
|
|
__m128i t2 = __lsx_vor_v(t1, lsx_splat_u16(0x8000));
|
|
|
|
// s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
|
|
__m128i s0 = __lsx_vsrli_h(in, 12);
|
|
// s1: [aaaa|bbbb|bbcc|cccc] => [aabb|bbbb|cccc|cc00]
|
|
__m128i s1 = __lsx_vslli_h(in, 2);
|
|
// s1: [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000]
|
|
s1 = __lsx_vand_v(s1, lsx_splat_u16(0x3f00));
|
|
|
|
// [00bb|bbbb|0000|aaaa]
|
|
__m128i s2 = __lsx_vor_v(s0, s1);
|
|
// s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
|
|
__m128i v_c0e0 = __lsx_vreplgr2vr_h(uint16_t(0xC0E0));
|
|
__m128i s3 = __lsx_vor_v(s2, v_c0e0);
|
|
__m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(in, v_07ff);
|
|
__m128i m0 =
|
|
__lsx_vandn_v(one_or_two_bytes_bytemask, lsx_splat_u16(0x4000));
|
|
__m128i s4 = __lsx_vxor_v(s3, m0);
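// Added worked example: for in = 0x20AC (EURO SIGN) we get t2 = 0xAC2C and
// s4 = 0x82E2; the interleave and shuffles below emit the bytes E2 82 AC,
// the UTF-8 encoding of U+20AC.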
|
|
|
|
// 4. expand code units 16-bit => 32-bit
|
|
__m128i out0 = __lsx_vilvl_h(s4, t2);
|
|
__m128i out1 = __lsx_vilvh_h(s4, t2);
|
|
|
|
// 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
|
|
__m128i one_byte_bytemask = __lsx_vsle_hu(in, __lsx_vrepli_h(0x7F));
|
|
|
|
__m128i one_or_two_bytes_bytemask_low =
|
|
__lsx_vilvl_h(one_or_two_bytes_bytemask, zero);
|
|
__m128i one_or_two_bytes_bytemask_high =
|
|
__lsx_vilvh_h(one_or_two_bytes_bytemask, zero);
|
|
|
|
__m128i one_byte_bytemask_low =
|
|
__lsx_vilvl_h(one_byte_bytemask, one_byte_bytemask);
|
|
__m128i one_byte_bytemask_high =
|
|
__lsx_vilvh_h(one_byte_bytemask, one_byte_bytemask);
|
|
|
|
const uint32_t mask0 = __lsx_vpickve2gr_bu(
|
|
__lsx_vmskltz_h(__lsx_vor_v(one_or_two_bytes_bytemask_low,
|
|
one_byte_bytemask_low)),
|
|
0);
|
|
const uint32_t mask1 = __lsx_vpickve2gr_bu(
|
|
__lsx_vmskltz_h(__lsx_vor_v(one_or_two_bytes_bytemask_high,
|
|
one_byte_bytemask_high)),
|
|
0);
|
|
|
|
const uint8_t *row0 =
|
|
&simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
|
|
__m128i shuffle0 = __lsx_vld(row0, 1);
|
|
__m128i utf8_0 = __lsx_vshuf_b(zero, out0, shuffle0);
|
|
|
|
const uint8_t *row1 =
|
|
&simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
|
|
__m128i shuffle1 = __lsx_vld(row1, 1);
|
|
__m128i utf8_1 = __lsx_vshuf_b(zero, out1, shuffle1);
|
|
|
|
__lsx_vst(utf8_0, utf8_output, 0);
|
|
utf8_output += row0[0];
|
|
__lsx_vst(utf8_1, utf8_output, 0);
|
|
utf8_output += row1[0];
|
|
|
|
buf += 8;
|
|
// surrogate pair(s) in a register
|
|
} else {
|
|
// Let us do a scalar fallback.
|
|
// It may seem wasteful to use scalar code, but being efficient with SIMD
|
|
// in the presence of surrogate pairs may require non-trivial tables.
|
|
size_t forward = 15;
|
|
size_t k = 0;
|
|
if (size_t(end - buf) < forward + 1) {
|
|
forward = size_t(end - buf - 1);
|
|
}
|
|
for (; k < forward; k++) {
|
|
uint16_t word =
|
|
!match_system(big_endian) ? scalar::u16_swap_bytes(buf[k]) : buf[k];
|
|
if ((word & 0xFF80) == 0) {
|
|
*utf8_output++ = char(word);
|
|
} else if ((word & 0xF800) == 0) {
|
|
*utf8_output++ = char((word >> 6) | 0b11000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else if ((word & 0xF800) != 0xD800) {
|
|
*utf8_output++ = char((word >> 12) | 0b11100000);
|
|
*utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else {
|
|
// must be a surrogate pair
|
|
uint16_t diff = uint16_t(word - 0xD800);
|
|
uint16_t next_word = !match_system(big_endian)
|
|
? scalar::u16_swap_bytes(buf[k + 1])
|
|
: buf[k + 1];
|
|
k++;
|
|
uint16_t diff2 = uint16_t(next_word - 0xDC00);
|
|
if ((diff | diff2) > 0x3FF) {
|
|
return std::make_pair(nullptr,
|
|
reinterpret_cast<char *>(utf8_output));
|
|
}
|
|
uint32_t value = (diff << 10) + diff2 + 0x10000;
|
|
*utf8_output++ = char((value >> 18) | 0b11110000);
|
|
*utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((value & 0b111111) | 0b10000000);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
} // while
|
|
return std::make_pair(buf, reinterpret_cast<char *>(utf8_output));
|
|
}
|
|
|
|
/*
|
|
Returns a pair: a result struct and utf8_output.
|
|
If there is an error, the count field of the result is the position of the
|
|
error. Otherwise, it is the position of the first unprocessed byte in buf
|
|
(even if finished). A scalar routine should carry on the conversion of the
|
|
tail if needed.
|
|
*/
|
|
template <endianness big_endian>
|
|
std::pair<result, char *>
|
|
lsx_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len,
|
|
char *utf8_out) {
|
|
uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
|
|
const char16_t *start = buf;
|
|
const char16_t *end = buf + len;
|
|
|
|
const size_t safety_margin =
|
|
12; // to avoid overruns, see issue
|
|
// https://github.com/simdutf/simdutf/issues/92
|
|
while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
|
|
__m128i in = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 0);
|
|
if (!match_system(big_endian)) {
|
|
in = lsx_swap_bytes(in);
|
|
}
|
|
if (__lsx_bz_v(
|
|
__lsx_vslt_hu(__lsx_vrepli_h(0x7F), in))) { // ASCII fast path!!!!
|
|
// It is common enough that we have sequences of 16 consecutive ASCII
|
|
// characters.
|
|
__m128i nextin = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 16);
|
|
if (!match_system(big_endian)) {
|
|
nextin = lsx_swap_bytes(nextin);
|
|
}
|
|
if (__lsx_bz_v(__lsx_vslt_hu(__lsx_vrepli_h(0x7F), nextin))) {
|
|
// 1. pack the bytes
|
|
// obviously suboptimal.
|
|
__m128i utf8_packed = __lsx_vpickev_b(nextin, in);
|
|
// 2. store (16 bytes)
|
|
__lsx_vst(utf8_packed, utf8_output, 0);
|
|
// 3. adjust pointers
|
|
buf += 16;
|
|
utf8_output += 16;
|
|
continue; // we are done for this round!
|
|
} else {
|
|
// 1. pack the bytes
|
|
// obviously suboptimal.
|
|
__m128i utf8_packed = __lsx_vpickev_b(in, in);
|
|
// 2. store (8 bytes)
|
|
__lsx_vst(utf8_packed, utf8_output, 0);
|
|
// 3. adjust pointers
|
|
buf += 8;
|
|
utf8_output += 8;
|
|
in = nextin;
|
|
}
|
|
}
|
|
|
|
__m128i v_07ff = __lsx_vreplgr2vr_h(uint16_t(0x7ff));
|
|
__m128i zero = __lsx_vldi(0);
|
|
if (__lsx_bz_v(__lsx_vslt_hu(v_07ff, in))) {
|
|
// 1. prepare 2-byte values
|
|
// input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
|
|
// expected output : [110a|aaaa|10bb|bbbb] x 8
|
|
// t0 = [000a|aaaa|bbbb|bb00]
|
|
__m128i t0 = __lsx_vslli_h(in, 2);
|
|
// t1 = [000a|aaaa|0000|0000]
|
|
__m128i t1 = __lsx_vand_v(t0, lsx_splat_u16(0x1f00));
|
|
// t2 = [0000|0000|00bb|bbbb]
|
|
__m128i t2 = __lsx_vand_v(in, __lsx_vrepli_h(0x3f));
|
|
// t3 = [000a|aaaa|00bb|bbbb]
|
|
__m128i t3 = __lsx_vor_v(t1, t2);
|
|
// t4 = [110a|aaaa|10bb|bbbb]
|
|
__m128i v_c080 = __lsx_vreplgr2vr_h(uint16_t(0xc080));
|
|
__m128i t4 = __lsx_vor_v(t3, v_c080);
|
|
// 2. merge ASCII and 2-byte codewords
|
|
__m128i one_byte_bytemask =
|
|
__lsx_vsle_hu(in, __lsx_vrepli_h(0x7F /*0x007F*/));
|
|
__m128i utf8_unpacked = __lsx_vbitsel_v(t4, in, one_byte_bytemask);
|
|
// 3. prepare bitmask for 8-bit lookup
|
|
uint32_t m2 = __lsx_vpickve2gr_bu(__lsx_vmskltz_h(one_byte_bytemask), 0);
|
|
// 4. pack the bytes
|
|
const uint8_t *row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
|
|
[lsx_1_2_utf8_bytes_mask[m2]][0];
|
|
__m128i shuffle = __lsx_vld(row, 1);
|
|
__m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle);
|
|
// 5. store bytes
|
|
__lsx_vst(utf8_packed, utf8_output, 0);
|
|
// 6. adjust pointers
|
|
buf += 8;
|
|
utf8_output += row[0];
|
|
continue;
|
|
}
|
|
__m128i surrogates_bytemask = __lsx_vseq_h(
|
|
__lsx_vand_v(in, lsx_splat_u16(0xf800)), lsx_splat_u16(0xd800));
|
|
// It might seem like checking for surrogates_bytemask == 0xc000 could help.
|
|
// However, it is likely an uncommon occurrence.
|
|
if (__lsx_bz_v(surrogates_bytemask)) {
|
|
// case: code units from register produce either 1, 2 or 3 UTF-8 bytes
|
|
/* In this branch we handle three cases:
|
|
1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
|
|
single UTF-8 byte
|
|
2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] -
|
|
two UTF-8 bytes
|
|
3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
|
|
three UTF-8 bytes
|
|
|
|
We expand the input word (16-bit) into two code units (32-bit), thus
|
|
we have room for four bytes. However, we need five distinct bit
|
|
layouts. Note that the last byte in cases #2 and #3 is the same.
|
|
|
|
We precompute byte 1 for case #1 and the common byte for cases #2 & #3
|
|
in register t2.
|
|
|
|
We precompute byte 1 for case #3 and -- **conditionally** --
|
|
precompute either byte 1 for case #2 or byte 2 for case #3. Note that
|
|
they differ by exactly one bit.
|
|
|
|
Finally from these two code units we build proper UTF-8 sequence,
|
|
taking into account the case (i.e., the number of bytes to write).
|
|
*/
|
|
/**
|
|
* Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
|
|
* t2 => [0ccc|cccc] [10cc|cccc]
|
|
* s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
|
|
*/
|
|
// [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
|
|
__m128i t0 = __lsx_vpickev_b(in, in);
|
|
t0 = __lsx_vilvl_b(t0, t0);
|
|
|
|
// [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
|
|
__m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F));
|
|
__m128i t1 = __lsx_vand_v(t0, v_3f7f);
|
|
// [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
|
|
__m128i t2 = __lsx_vor_v(t1, lsx_splat_u16(0x8000));
|
|
|
|
// s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
|
|
__m128i s0 = __lsx_vsrli_h(in, 12);
|
|
// s1: [aaaa|bbbb|bbcc|cccc] => [aabb|bbbb|cccc|cc00]
|
|
__m128i s1 = __lsx_vslli_h(in, 2);
|
|
// s1: [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000]
|
|
s1 = __lsx_vand_v(s1, lsx_splat_u16(0x3f00));
|
|
|
|
// [00bb|bbbb|0000|aaaa]
|
|
__m128i s2 = __lsx_vor_v(s0, s1);
|
|
// s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
|
|
__m128i v_c0e0 = __lsx_vreplgr2vr_h(uint16_t(0xC0E0));
|
|
__m128i s3 = __lsx_vor_v(s2, v_c0e0);
|
|
__m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(in, v_07ff);
|
|
__m128i m0 =
|
|
__lsx_vandn_v(one_or_two_bytes_bytemask, lsx_splat_u16(0x4000));
|
|
__m128i s4 = __lsx_vxor_v(s3, m0);
|
|
|
|
// 4. expand code units 16-bit => 32-bit
|
|
__m128i out0 = __lsx_vilvl_h(s4, t2);
|
|
__m128i out1 = __lsx_vilvh_h(s4, t2);
|
|
|
|
// 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
|
|
__m128i one_byte_bytemask = __lsx_vsle_hu(in, __lsx_vrepli_h(0x7F));
|
|
|
|
__m128i one_or_two_bytes_bytemask_low =
|
|
__lsx_vilvl_h(one_or_two_bytes_bytemask, zero);
|
|
__m128i one_or_two_bytes_bytemask_high =
|
|
__lsx_vilvh_h(one_or_two_bytes_bytemask, zero);
|
|
|
|
__m128i one_byte_bytemask_low =
|
|
__lsx_vilvl_h(one_byte_bytemask, one_byte_bytemask);
|
|
__m128i one_byte_bytemask_high =
|
|
__lsx_vilvh_h(one_byte_bytemask, one_byte_bytemask);
|
|
|
|
const uint32_t mask0 = __lsx_vpickve2gr_bu(
|
|
__lsx_vmskltz_h(__lsx_vor_v(one_or_two_bytes_bytemask_low,
|
|
one_byte_bytemask_low)),
|
|
0);
|
|
const uint32_t mask1 = __lsx_vpickve2gr_bu(
|
|
__lsx_vmskltz_h(__lsx_vor_v(one_or_two_bytes_bytemask_high,
|
|
one_byte_bytemask_high)),
|
|
0);
|
|
|
|
const uint8_t *row0 =
|
|
&simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
|
|
__m128i shuffle0 = __lsx_vld(row0, 1);
|
|
__m128i utf8_0 = __lsx_vshuf_b(zero, out0, shuffle0);
|
|
|
|
const uint8_t *row1 =
|
|
&simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
|
|
__m128i shuffle1 = __lsx_vld(row1, 1);
|
|
__m128i utf8_1 = __lsx_vshuf_b(zero, out1, shuffle1);
|
|
|
|
__lsx_vst(utf8_0, utf8_output, 0);
|
|
utf8_output += row0[0];
|
|
__lsx_vst(utf8_1, utf8_output, 0);
|
|
utf8_output += row1[0];
|
|
|
|
buf += 8;
|
|
// surrogate pair(s) in a register
|
|
} else {
|
|
// Let us do a scalar fallback.
|
|
// It may seem wasteful to use scalar code, but being efficient with SIMD
|
|
// in the presence of surrogate pairs may require non-trivial tables.
|
|
size_t forward = 15;
|
|
size_t k = 0;
|
|
if (size_t(end - buf) < forward + 1) {
|
|
forward = size_t(end - buf - 1);
|
|
}
|
|
for (; k < forward; k++) {
|
|
uint16_t word =
|
|
!match_system(big_endian) ? scalar::u16_swap_bytes(buf[k]) : buf[k];
|
|
if ((word & 0xFF80) == 0) {
|
|
*utf8_output++ = char(word);
|
|
} else if ((word & 0xF800) == 0) {
|
|
*utf8_output++ = char((word >> 6) | 0b11000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else if ((word & 0xF800) != 0xD800) {
|
|
*utf8_output++ = char((word >> 12) | 0b11100000);
|
|
*utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else {
|
|
// must be a surrogate pair
|
|
uint16_t diff = uint16_t(word - 0xD800);
|
|
uint16_t next_word = !match_system(big_endian)
|
|
? scalar::u16_swap_bytes(buf[k + 1])
|
|
: buf[k + 1];
|
|
k++;
|
|
uint16_t diff2 = uint16_t(next_word - 0xDC00);
|
|
if ((diff | diff2) > 0x3FF) {
|
|
return std::make_pair(
|
|
result(error_code::SURROGATE, buf - start + k - 1),
|
|
reinterpret_cast<char *>(utf8_output));
|
|
}
|
|
uint32_t value = (diff << 10) + diff2 + 0x10000;
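// Added check: e.g. 0xD83D 0xDE00 gives diff = 0x3D, diff2 = 0x200,
// value = (0x3D << 10) + 0x200 + 0x10000 = 0x1F600 (U+1F600).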
|
|
*utf8_output++ = char((value >> 18) | 0b11110000);
|
|
*utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((value & 0b111111) | 0b10000000);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
} // while
|
|
|
|
return std::make_pair(result(error_code::SUCCESS, buf - start),
|
|
reinterpret_cast<char *>(utf8_output));
|
|
}
|
|
/* end file src/lsx/lsx_convert_utf16_to_utf8.cpp */
|
|
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF8
|
|
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
|
|
/* begin file src/lsx/lsx_convert_utf16_to_utf32.cpp */
|
|
template <endianness big_endian>
|
|
std::pair<const char16_t *, char32_t *>
|
|
lsx_convert_utf16_to_utf32(const char16_t *buf, size_t len,
|
|
char32_t *utf32_out) {
|
|
uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
|
|
const char16_t *end = buf + len;
|
|
|
|
__m128i zero = __lsx_vldi(0);
|
|
__m128i v_f800 = lsx_splat_u16(0xf800);
|
|
__m128i v_d800 = lsx_splat_u16(0xd800);
|
|
|
|
while (end - buf >= 8) {
|
|
__m128i in = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 0);
|
|
if (!match_system(big_endian)) {
|
|
in = lsx_swap_bytes(in);
|
|
}
|
|
|
|
__m128i surrogates_bytemask =
|
|
__lsx_vseq_h(__lsx_vand_v(in, v_f800), v_d800);
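// Added note: (word & 0xF800) == 0xD800 matches the whole surrogate range
// 0xD800..0xDFFF, i.e. both high and low surrogates.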
|
|
// It might seem like checking for surrogates_bytemask == 0xc000 could help.
|
|
// However, it is likely an uncommon occurrence.
|
|
if (__lsx_bz_v(surrogates_bytemask)) {
|
|
// case: no surrogate pairs, extend all 16-bit code units to 32-bit code
|
|
// units
|
|
__lsx_vst(__lsx_vilvl_h(zero, in), utf32_output, 0);
|
|
__lsx_vst(__lsx_vilvh_h(zero, in), utf32_output, 16);
|
|
utf32_output += 8;
|
|
buf += 8;
|
|
// surrogate pair(s) in a register
|
|
} else {
|
|
// Let us do a scalar fallback.
|
|
// It may seem wasteful to use scalar code, but being efficient with SIMD
|
|
// in the presence of surrogate pairs may require non-trivial tables.
|
|
size_t forward = 15;
|
|
size_t k = 0;
|
|
if (size_t(end - buf) < forward + 1) {
|
|
forward = size_t(end - buf - 1);
|
|
}
|
|
for (; k < forward; k++) {
|
|
uint16_t word =
|
|
!match_system(big_endian) ? scalar::u16_swap_bytes(buf[k]) : buf[k];
|
|
if ((word & 0xF800) != 0xD800) {
|
|
*utf32_output++ = char32_t(word);
|
|
} else {
|
|
// must be a surrogate pair
|
|
uint16_t diff = uint16_t(word - 0xD800);
|
|
uint16_t next_word = !match_system(big_endian)
|
|
? scalar::u16_swap_bytes(buf[k + 1])
|
|
: buf[k + 1];
|
|
k++;
|
|
uint16_t diff2 = uint16_t(next_word - 0xDC00);
|
|
if ((diff | diff2) > 0x3FF) {
|
|
return std::make_pair(nullptr,
|
|
reinterpret_cast<char32_t *>(utf32_output));
|
|
}
|
|
uint32_t value = (diff << 10) + diff2 + 0x10000;
|
|
*utf32_output++ = char32_t(value);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
} // while
|
|
return std::make_pair(buf, reinterpret_cast<char32_t *>(utf32_output));
|
|
}
|
|
|
|
/*
|
|
Returns a pair: a result struct and utf32_output.
If there is an error, the count field of the result is the position of the
error. Otherwise, it is the position of the first unprocessed byte in buf
(even if finished). A scalar routine should carry on the conversion of the
tail if needed.
|
|
*/
|
|
template <endianness big_endian>
|
|
std::pair<result, char32_t *>
|
|
lsx_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len,
|
|
char32_t *utf32_out) {
|
|
uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
|
|
const char16_t *start = buf;
|
|
const char16_t *end = buf + len;
|
|
|
|
__m128i zero = __lsx_vldi(0);
|
|
__m128i v_f800 = lsx_splat_u16(0xf800);
|
|
__m128i v_d800 = lsx_splat_u16(0xd800);
|
|
|
|
while (end - buf >= 8) {
|
|
__m128i in = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 0);
|
|
if (!match_system(big_endian)) {
|
|
in = lsx_swap_bytes(in);
|
|
}
|
|
|
|
__m128i surrogates_bytemask =
|
|
__lsx_vseq_h(__lsx_vand_v(in, v_f800), v_d800);
|
|
if (__lsx_bz_v(surrogates_bytemask)) {
|
|
// case: no surrogate pairs, extend all 16-bit code units to 32-bit code
|
|
// units
|
|
__lsx_vst(__lsx_vilvl_h(zero, in), utf32_output, 0);
|
|
__lsx_vst(__lsx_vilvh_h(zero, in), utf32_output, 16);
|
|
utf32_output += 8;
|
|
buf += 8;
|
|
// surrogate pair(s) in a register
|
|
} else {
|
|
// Let us do a scalar fallback.
|
|
// It may seem wasteful to use scalar code, but being efficient with SIMD
|
|
// in the presence of surrogate pairs may require non-trivial tables.
|
|
size_t forward = 15;
|
|
size_t k = 0;
|
|
if (size_t(end - buf) < forward + 1) {
|
|
forward = size_t(end - buf - 1);
|
|
}
|
|
for (; k < forward; k++) {
|
|
uint16_t word =
|
|
!match_system(big_endian) ? scalar::u16_swap_bytes(buf[k]) : buf[k];
|
|
if ((word & 0xF800) != 0xD800) {
|
|
*utf32_output++ = char32_t(word);
|
|
} else {
|
|
// must be a surrogate pair
|
|
uint16_t diff = uint16_t(word - 0xD800);
|
|
uint16_t next_word = !match_system(big_endian)
|
|
? scalar::u16_swap_bytes(buf[k + 1])
|
|
: buf[k + 1];
|
|
k++;
|
|
uint16_t diff2 = uint16_t(next_word - 0xDC00);
|
|
if ((diff | diff2) > 0x3FF) {
|
|
return std::make_pair(
|
|
result(error_code::SURROGATE, buf - start + k - 1),
|
|
reinterpret_cast<char32_t *>(utf32_output));
|
|
}
|
|
uint32_t value = (diff << 10) + diff2 + 0x10000;
|
|
*utf32_output++ = char32_t(value);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
} // while
|
|
return std::make_pair(result(error_code::SUCCESS, buf - start),
|
|
reinterpret_cast<char32_t *>(utf32_output));
|
|
}
|
|
/* end file src/lsx/lsx_convert_utf16_to_utf32.cpp */
|
|
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
|
|
|
|
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
|
|
/* begin file src/lsx/lsx_convert_utf32_to_latin1.cpp */
|
|
std::pair<const char32_t *, char *>
|
|
lsx_convert_utf32_to_latin1(const char32_t *buf, size_t len,
|
|
char *latin1_output) {
|
|
const char32_t *end = buf + len;
|
|
const v16u8 shuf_mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
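// Added note: in the __lsx_vshuf_b(in2, in1, shuf_mask) call below, indices
// 0..15 select from in1 and 16..31 from in2, so {0,4,8,12,16,20,24,28}
// gathers the low byte of each of the eight 32-bit code units.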
|
|
__m128i v_ff = __lsx_vrepli_w(0xFF);
|
|
|
|
while (end - buf >= 16) {
|
|
__m128i in1 = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 0);
|
|
__m128i in2 = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 16);
|
|
|
|
__m128i in12 = __lsx_vor_v(in1, in2);
|
|
if (__lsx_bz_v(__lsx_vslt_wu(v_ff, in12))) {
|
|
// 1. pack the bytes
|
|
__m128i latin1_packed = __lsx_vshuf_b(in2, in1, (__m128i)shuf_mask);
|
|
// 2. store 16 bytes; only the first 8 are meaningful
|
|
__lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
|
|
// 3. adjust pointers
|
|
buf += 8;
|
|
latin1_output += 8;
|
|
} else {
|
|
return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output));
|
|
}
|
|
} // while
|
|
return std::make_pair(buf, latin1_output);
|
|
}
|
|
|
|
std::pair<result, char *>
|
|
lsx_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
|
|
char *latin1_output) {
|
|
const char32_t *start = buf;
|
|
const char32_t *end = buf + len;
|
|
|
|
const v16u8 shuf_mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
|
|
__m128i v_ff = __lsx_vrepli_w(0xFF);
|
|
|
|
while (end - buf >= 16) {
|
|
__m128i in1 = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 0);
|
|
__m128i in2 = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 16);
|
|
|
|
__m128i in12 = __lsx_vor_v(in1, in2);
|
|
|
|
if (__lsx_bz_v(__lsx_vslt_wu(v_ff, in12))) {
|
|
// 1. pack the bytes
|
|
__m128i latin1_packed = __lsx_vshuf_b(in2, in1, (__m128i)shuf_mask);
|
|
// 2. store 16 bytes; only the first 8 are meaningful
|
|
__lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
|
|
// 3. adjust pointers
|
|
buf += 8;
|
|
latin1_output += 8;
|
|
} else {
|
|
// Let us do a scalar fallback.
|
|
for (int k = 0; k < 8; k++) {
|
|
uint32_t word = buf[k];
|
|
if (word <= 0xff) {
|
|
*latin1_output++ = char(word);
|
|
} else {
|
|
return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
|
|
latin1_output);
|
|
}
|
|
}
|
|
}
|
|
} // while
|
|
return std::make_pair(result(error_code::SUCCESS, buf - start),
|
|
latin1_output);
|
|
}
|
|
/* end file src/lsx/lsx_convert_utf32_to_latin1.cpp */
|
|
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
|
|
/* begin file src/lsx/lsx_convert_utf32_to_utf8.cpp */
|
|
std::pair<const char32_t *, char *>
|
|
lsx_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) {
|
|
uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
|
|
const char32_t *end = buf + len;
|
|
|
|
__m128i v_c080 = lsx_splat_u16(0xc080);
|
|
__m128i v_07ff = lsx_splat_u16(0x07ff);
|
|
__m128i v_dfff = lsx_splat_u16(0xdfff);
|
|
__m128i v_d800 = lsx_splat_u16(0xd800);
|
|
__m128i forbidden_bytemask = __lsx_vldi(0x0);
|
|
|
|
const size_t safety_margin =
|
|
12; // to avoid overruns, see issue
|
|
// https://github.com/simdutf/simdutf/issues/92
|
|
|
|
while (end - buf > std::ptrdiff_t(16 + safety_margin)) {
|
|
__m128i in = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 0);
|
|
__m128i nextin = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 16);
|
|
|
|
// Check that no bits are set above the 16th bit of any 32-bit code unit
|
|
if (__lsx_bz_v(__lsx_vpickod_h(in, nextin))) {
|
|
// Pack UTF-32 to UTF-16 safely (without surrogate pairs)
|
|
// Apply UTF-16 => UTF-8 routine (lsx_convert_utf16_to_utf8.cpp)
|
|
__m128i utf16_packed = __lsx_vpickev_h(nextin, in);
|
|
|
|
if (__lsx_bz_v(__lsx_vslt_hu(__lsx_vrepli_h(0x7F),
|
|
utf16_packed))) { // ASCII fast path!!!!
|
|
// 1. pack the bytes
|
|
// obviously suboptimal.
|
|
__m128i utf8_packed = __lsx_vpickev_b(utf16_packed, utf16_packed);
|
|
// 2. store (8 bytes)
|
|
__lsx_vst(utf8_packed, utf8_output, 0);
|
|
// 3. adjust pointers
|
|
buf += 8;
|
|
utf8_output += 8;
|
|
continue; // we are done for this round!
|
|
}
|
|
__m128i zero = __lsx_vldi(0);
|
|
if (__lsx_bz_v(__lsx_vslt_hu(v_07ff, utf16_packed))) {
|
|
// 1. prepare 2-byte values
|
|
// input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
|
|
// expected output : [110a|aaaa|10bb|bbbb] x 8
|
|
|
|
// t0 = [000a|aaaa|bbbb|bb00]
|
|
const __m128i t0 = __lsx_vslli_h(utf16_packed, 2);
|
|
// t1 = [000a|aaaa|0000|0000]
|
|
const __m128i t1 = __lsx_vand_v(t0, lsx_splat_u16(0x1f00));
|
|
// t2 = [0000|0000|00bb|bbbb]
|
|
const __m128i t2 = __lsx_vand_v(utf16_packed, __lsx_vrepli_h(0x3f));
|
|
// t3 = [000a|aaaa|00bb|bbbb]
|
|
const __m128i t3 = __lsx_vor_v(t1, t2);
|
|
// t4 = [110a|aaaa|10bb|bbbb]
|
|
const __m128i t4 = __lsx_vor_v(t3, v_c080);
|
|
// 2. merge ASCII and 2-byte codewords
|
|
__m128i one_byte_bytemask =
|
|
__lsx_vsle_hu(utf16_packed, __lsx_vrepli_h(0x7F /*0x007F*/));
|
|
__m128i utf8_unpacked =
|
|
__lsx_vbitsel_v(t4, utf16_packed, one_byte_bytemask);
|
|
// 3. prepare bitmask for 8-bit lookup
|
|
uint32_t m2 =
|
|
__lsx_vpickve2gr_bu(__lsx_vmskltz_h(one_byte_bytemask), 0);
|
|
// 4. pack the bytes
|
|
const uint8_t *row =
|
|
&simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
|
|
[lsx_1_2_utf8_bytes_mask[m2]][0];
|
|
__m128i shuffle = __lsx_vld(row, 1);
|
|
__m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle);
|
|
// 5. store bytes
|
|
__lsx_vst(utf8_packed, utf8_output, 0);
|
|
|
|
// 6. adjust pointers
|
|
buf += 8;
|
|
utf8_output += row[0];
|
|
continue;
|
|
} else {
|
|
// case: code units from register produce either 1, 2 or 3 UTF-8 bytes
|
|
forbidden_bytemask = __lsx_vor_v(
|
|
__lsx_vand_v(
|
|
__lsx_vsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff
|
|
__lsx_vsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800
|
|
forbidden_bytemask);
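// Added note: values in 0xd800..0xdfff would need a surrogate pair in
// UTF-16 and cannot come from valid UTF-32, so they are recorded as
// forbidden and reported after the loop.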
|
|
/* In this branch we handle three cases:
|
|
1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single
|
|
UTF-8 byte
|
|
2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two
|
|
UTF-8 bytes
|
|
3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three
|
|
UTF-8 bytes
|
|
|
|
We expand the input word (16-bit) into two code units (32-bit), thus
|
|
we have room for four bytes. However, we need five distinct bit
|
|
layouts. Note that the last byte in cases #2 and #3 is the same.
|
|
|
|
We precompute byte 1 for case #1 and the common byte for cases #2 & #3
|
|
in register t2.
|
|
|
|
We precompute byte 1 for case #3 and -- **conditionally** -- precompute
|
|
either byte 1 for case #2 or byte 2 for case #3. Note that they
|
|
differ by exactly one bit.
|
|
|
|
Finally from these two code units we build proper UTF-8 sequence, taking
|
|
into account the case (i.e., the number of bytes to write).
|
|
*/
|
|
/**
|
|
* Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
|
|
* t2 => [0ccc|cccc] [10cc|cccc]
|
|
* s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
|
|
*/
|
|
// [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
|
|
__m128i t0 = __lsx_vpickev_b(utf16_packed, utf16_packed);
|
|
t0 = __lsx_vilvl_b(t0, t0);
|
|
// [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
|
|
__m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F));
|
|
__m128i t1 = __lsx_vand_v(t0, v_3f7f);
|
|
// [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
|
|
__m128i t2 = __lsx_vor_v(t1, lsx_splat_u16(0x8000));
|
|
|
|
// s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
|
|
__m128i s0 = __lsx_vsrli_h(utf16_packed, 12);
|
|
// s1: [aaaa|bbbb|bbcc|cccc] => [aabb|bbbb|cccc|cc00]
|
|
__m128i s1 = __lsx_vslli_h(utf16_packed, 2);
|
|
// [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000]
|
|
s1 = __lsx_vand_v(s1, lsx_splat_u16(0x3F00));
|
|
// [00bb|bbbb|0000|aaaa]
|
|
__m128i s2 = __lsx_vor_v(s0, s1);
|
|
// s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
|
|
__m128i v_c0e0 = __lsx_vreplgr2vr_h(uint16_t(0xC0E0));
|
|
__m128i s3 = __lsx_vor_v(s2, v_c0e0);
|
|
__m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(utf16_packed, v_07ff);
|
|
__m128i m0 =
|
|
__lsx_vandn_v(one_or_two_bytes_bytemask, lsx_splat_u16(0x4000));
|
|
__m128i s4 = __lsx_vxor_v(s3, m0);
|
|
|
|
// 4. expand code units 16-bit => 32-bit
|
|
__m128i out0 = __lsx_vilvl_h(s4, t2);
|
|
__m128i out1 = __lsx_vilvh_h(s4, t2);
|
|
|
|
// 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
|
|
__m128i one_byte_bytemask =
|
|
__lsx_vsle_hu(utf16_packed, __lsx_vrepli_h(0x7F));
|
|
|
|
__m128i one_or_two_bytes_bytemask_u16_to_u32_low =
|
|
__lsx_vilvl_h(one_or_two_bytes_bytemask, zero);
|
|
__m128i one_or_two_bytes_bytemask_u16_to_u32_high =
|
|
__lsx_vilvh_h(one_or_two_bytes_bytemask, zero);
|
|
|
|
__m128i one_byte_bytemask_u16_to_u32_low =
|
|
__lsx_vilvl_h(one_byte_bytemask, one_byte_bytemask);
|
|
__m128i one_byte_bytemask_u16_to_u32_high =
|
|
__lsx_vilvh_h(one_byte_bytemask, one_byte_bytemask);
|
|
|
|
const uint32_t mask0 =
|
|
__lsx_vpickve2gr_bu(__lsx_vmskltz_h(__lsx_vor_v(
|
|
one_or_two_bytes_bytemask_u16_to_u32_low,
|
|
one_byte_bytemask_u16_to_u32_low)),
|
|
0);
|
|
const uint32_t mask1 =
|
|
__lsx_vpickve2gr_bu(__lsx_vmskltz_h(__lsx_vor_v(
|
|
one_or_two_bytes_bytemask_u16_to_u32_high,
|
|
one_byte_bytemask_u16_to_u32_high)),
|
|
0);
|
|
|
|
const uint8_t *row0 =
|
|
&simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
|
|
__m128i shuffle0 = __lsx_vld(row0, 1);
|
|
__m128i utf8_0 = __lsx_vshuf_b(zero, out0, shuffle0);
|
|
|
|
const uint8_t *row1 =
|
|
&simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
|
|
__m128i shuffle1 = __lsx_vld(row1, 1);
|
|
__m128i utf8_1 = __lsx_vshuf_b(zero, out1, shuffle1);
|
|
|
|
__lsx_vst(utf8_0, utf8_output, 0);
|
|
utf8_output += row0[0];
|
|
__lsx_vst(utf8_1, utf8_output, 0);
|
|
utf8_output += row1[0];
|
|
|
|
buf += 8;
|
|
}
|
|
// At least one 32-bit word will produce a surrogate pair in UTF-16, i.e.,
// will produce four UTF-8 bytes.
|
|
} else {
|
|
// Let us do a scalar fallback.
|
|
// It may seem wasteful to use scalar code, but being efficient with SIMD
|
|
// in the presence of surrogate pairs may require non-trivial tables.
|
|
size_t forward = 15;
|
|
size_t k = 0;
|
|
if (size_t(end - buf) < forward + 1) {
|
|
forward = size_t(end - buf - 1);
|
|
}
|
|
for (; k < forward; k++) {
|
|
uint32_t word = buf[k];
|
|
if ((word & 0xFFFFFF80) == 0) {
|
|
*utf8_output++ = char(word);
|
|
} else if ((word & 0xFFFFF800) == 0) {
|
|
*utf8_output++ = char((word >> 6) | 0b11000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else if ((word & 0xFFFF0000) == 0) {
|
|
if (word >= 0xD800 && word <= 0xDFFF) {
|
|
return std::make_pair(nullptr,
|
|
reinterpret_cast<char *>(utf8_output));
|
|
}
|
|
*utf8_output++ = char((word >> 12) | 0b11100000);
|
|
*utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else {
|
|
if (word > 0x10FFFF) {
|
|
return std::make_pair(nullptr,
|
|
reinterpret_cast<char *>(utf8_output));
|
|
}
|
|
*utf8_output++ = char((word >> 18) | 0b11110000);
|
|
*utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
} // while
|
|
|
|
// check for invalid input
|
|
if (__lsx_bnz_v(forbidden_bytemask)) {
|
|
return std::make_pair(nullptr, reinterpret_cast<char *>(utf8_output));
|
|
}
|
|
|
|
return std::make_pair(buf, reinterpret_cast<char *>(utf8_output));
|
|
}
|
|
|
|
std::pair<result, char *>
|
|
lsx_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
|
|
char *utf8_out) {
|
|
uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
|
|
const char32_t *start = buf;
|
|
const char32_t *end = buf + len;
|
|
|
|
__m128i v_c080 = lsx_splat_u16(0xc080);
|
|
__m128i v_07ff = lsx_splat_u16(0x07ff);
|
|
__m128i v_dfff = lsx_splat_u16(0xdfff);
|
|
__m128i v_d800 = lsx_splat_u16(0xd800);
|
|
__m128i forbidden_bytemask = __lsx_vldi(0x0);
|
|
const size_t safety_margin =
|
|
12; // to avoid overruns, see issue
|
|
// https://github.com/simdutf/simdutf/issues/92
|
|
|
|
while (end - buf > std::ptrdiff_t(16 + safety_margin)) {
|
|
__m128i in = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 0);
|
|
__m128i nextin = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 16);
|
|
|
|
// Check that no bits are set above the 16th bit of any 32-bit code unit
|
|
if (__lsx_bz_v(__lsx_vpickod_h(in, nextin))) {
|
|
// Pack UTF-32 to UTF-16 safely (without surrogate pairs)
|
|
// Apply UTF-16 => UTF-8 routine (lsx_convert_utf16_to_utf8.cpp)
|
|
__m128i utf16_packed = __lsx_vpickev_h(nextin, in);
|
|
|
|
if (__lsx_bz_v(__lsx_vslt_hu(__lsx_vrepli_h(0x7F),
|
|
utf16_packed))) { // ASCII fast path!!!!
|
|
// 1. pack the bytes
|
|
// obviously suboptimal.
|
|
__m128i utf8_packed = __lsx_vpickev_b(utf16_packed, utf16_packed);
|
|
// 2. store (8 bytes)
|
|
__lsx_vst(utf8_packed, utf8_output, 0);
|
|
// 3. adjust pointers
|
|
buf += 8;
|
|
utf8_output += 8;
|
|
continue; // we are done for this round!
|
|
}
|
|
__m128i zero = __lsx_vldi(0);
|
|
if (__lsx_bz_v(__lsx_vslt_hu(v_07ff, utf16_packed))) {
|
|
// 1. prepare 2-byte values
|
|
// input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
|
|
// expected output : [110a|aaaa|10bb|bbbb] x 8
|
|
|
|
// t0 = [000a|aaaa|bbbb|bb00]
|
|
const __m128i t0 = __lsx_vslli_h(utf16_packed, 2);
|
|
// t1 = [000a|aaaa|0000|0000]
|
|
const __m128i t1 = __lsx_vand_v(t0, lsx_splat_u16(0x1f00));
|
|
// t2 = [0000|0000|00bb|bbbb]
|
|
const __m128i t2 = __lsx_vand_v(utf16_packed, __lsx_vrepli_h(0x3f));
|
|
// t3 = [000a|aaaa|00bb|bbbb]
|
|
const __m128i t3 = __lsx_vor_v(t1, t2);
|
|
// t4 = [110a|aaaa|10bb|bbbb]
|
|
const __m128i t4 = __lsx_vor_v(t3, v_c080);
|
|
// 2. merge ASCII and 2-byte codewords
|
|
__m128i one_byte_bytemask =
|
|
__lsx_vsle_hu(utf16_packed, __lsx_vrepli_h(0x7F /*0x007F*/));
|
|
__m128i utf8_unpacked =
|
|
__lsx_vbitsel_v(t4, utf16_packed, one_byte_bytemask);
|
|
// 3. prepare bitmask for 8-bit lookup
|
|
uint32_t m2 =
|
|
__lsx_vpickve2gr_bu(__lsx_vmskltz_h(one_byte_bytemask), 0);
|
|
// 4. pack the bytes
|
|
const uint8_t *row =
|
|
&simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
|
|
[lsx_1_2_utf8_bytes_mask[m2]][0];
|
|
__m128i shuffle = __lsx_vld(row, 1);
|
|
__m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle);
|
|
// 5. store bytes
|
|
__lsx_vst(utf8_packed, utf8_output, 0);
|
|
|
|
// 6. adjust pointers
|
|
buf += 8;
|
|
utf8_output += row[0];
|
|
continue;
|
|
} else {
|
|
// case: code units from register produce either 1, 2 or 3 UTF-8 bytes
|
|
forbidden_bytemask = __lsx_vor_v(
|
|
__lsx_vand_v(
|
|
__lsx_vsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff
|
|
__lsx_vsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800
|
|
forbidden_bytemask);
|
|
if (__lsx_bnz_v(forbidden_bytemask)) {
|
|
return std::make_pair(result(error_code::SURROGATE, buf - start),
|
|
reinterpret_cast<char *>(utf8_output));
|
|
}
|
|
/* In this branch we handle three cases:
|
|
1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single
|
|
UTF-8 byte
|
|
2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two
|
|
UTF-8 bytes
|
|
3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three
|
|
UTF-8 bytes
|
|
|
|
We expand the input word (16-bit) into two code units (32-bit), thus
|
|
we have room for four bytes. However, we need five distinct bit
|
|
layouts. Note that the last byte in cases #2 and #3 is the same.
|
|
|
|
We precompute byte 1 for case #1 and the common byte for cases #2 & #3
|
|
in register t2.
|
|
|
|
We precompute byte 1 for case #3 and -- **conditionally** -- precompute
|
|
either byte 1 for case #2 or byte 2 for case #3. Note that they
|
|
differ by exactly one bit.
|
|
|
|
Finally from these two code units we build proper UTF-8 sequence, taking
|
|
into account the case (i.e., the number of bytes to write).
|
|
*/
|
|
/**
|
|
* Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
|
|
* t2 => [0ccc|cccc] [10cc|cccc]
|
|
* s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
|
|
*/
|
|
// [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
|
|
__m128i t0 = __lsx_vpickev_b(utf16_packed, utf16_packed);
|
|
t0 = __lsx_vilvl_b(t0, t0);
|
|
// [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
|
|
__m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F));
|
|
__m128i t1 = __lsx_vand_v(t0, v_3f7f);
|
|
// [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
|
|
__m128i t2 = __lsx_vor_v(t1, lsx_splat_u16(0x8000));
|
|
|
|
// s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
|
|
__m128i s0 = __lsx_vsrli_h(utf16_packed, 12);
|
|
// s1: [aaaa|bbbb|bbcc|cccc] => [aabb|bbbb|cccc|cc00]
|
|
__m128i s1 = __lsx_vslli_h(utf16_packed, 2);
|
|
// [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000]
|
|
s1 = __lsx_vand_v(s1, lsx_splat_u16(0x3F00));
|
|
// [00bb|bbbb|0000|aaaa]
|
|
__m128i s2 = __lsx_vor_v(s0, s1);
|
|
// s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
|
|
__m128i v_c0e0 = __lsx_vreplgr2vr_h(uint16_t(0xC0E0));
|
|
__m128i s3 = __lsx_vor_v(s2, v_c0e0);
|
|
__m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(utf16_packed, v_07ff);
|
|
__m128i m0 =
|
|
__lsx_vandn_v(one_or_two_bytes_bytemask, lsx_splat_u16(0x4000));
|
|
__m128i s4 = __lsx_vxor_v(s3, m0);
|
|
|
|
// 4. expand code units 16-bit => 32-bit
|
|
__m128i out0 = __lsx_vilvl_h(s4, t2);
|
|
__m128i out1 = __lsx_vilvh_h(s4, t2);
|
|
|
|
// 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
|
|
__m128i one_byte_bytemask =
|
|
__lsx_vsle_hu(utf16_packed, __lsx_vrepli_h(0x7F));
|
|
|
|
__m128i one_or_two_bytes_bytemask_u16_to_u32_low =
|
|
__lsx_vilvl_h(one_or_two_bytes_bytemask, zero);
|
|
__m128i one_or_two_bytes_bytemask_u16_to_u32_high =
|
|
__lsx_vilvh_h(one_or_two_bytes_bytemask, zero);
|
|
|
|
__m128i one_byte_bytemask_u16_to_u32_low =
|
|
__lsx_vilvl_h(one_byte_bytemask, one_byte_bytemask);
|
|
__m128i one_byte_bytemask_u16_to_u32_high =
|
|
__lsx_vilvh_h(one_byte_bytemask, one_byte_bytemask);
|
|
|
|
const uint32_t mask0 =
|
|
__lsx_vpickve2gr_bu(__lsx_vmskltz_h(__lsx_vor_v(
|
|
one_or_two_bytes_bytemask_u16_to_u32_low,
|
|
one_byte_bytemask_u16_to_u32_low)),
|
|
0);
|
|
const uint32_t mask1 =
|
|
__lsx_vpickve2gr_bu(__lsx_vmskltz_h(__lsx_vor_v(
|
|
one_or_two_bytes_bytemask_u16_to_u32_high,
|
|
one_byte_bytemask_u16_to_u32_high)),
|
|
0);
|
|
|
|
const uint8_t *row0 =
|
|
&simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
|
|
__m128i shuffle0 = __lsx_vld(row0, 1);
|
|
__m128i utf8_0 = __lsx_vshuf_b(zero, out0, shuffle0);
|
|
|
|
const uint8_t *row1 =
|
|
&simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
|
|
__m128i shuffle1 = __lsx_vld(row1, 1);
|
|
__m128i utf8_1 = __lsx_vshuf_b(zero, out1, shuffle1);
|
|
|
|
__lsx_vst(utf8_0, utf8_output, 0);
|
|
utf8_output += row0[0];
|
|
__lsx_vst(utf8_1, utf8_output, 0);
|
|
utf8_output += row1[0];
|
|
|
|
buf += 8;
|
|
}
|
|
// At least one 32-bit word will produce a surrogate pair in UTF-16 <=>
|
|
// will produce four UTF-8 bytes.
|
|
} else {
|
|
// Let us do a scalar fallback.
|
|
// It may seem wasteful to use scalar code, but being efficient with SIMD
|
|
// in the presence of surrogate pairs may require non-trivial tables.
|
|
size_t forward = 15;
|
|
size_t k = 0;
|
|
if (size_t(end - buf) < forward + 1) {
|
|
forward = size_t(end - buf - 1);
|
|
}
|
|
for (; k < forward; k++) {
|
|
uint32_t word = buf[k];
|
|
if ((word & 0xFFFFFF80) == 0) {
|
|
*utf8_output++ = char(word);
|
|
} else if ((word & 0xFFFFF800) == 0) {
|
|
*utf8_output++ = char((word >> 6) | 0b11000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else if ((word & 0xFFFF0000) == 0) {
|
|
if (word >= 0xD800 && word <= 0xDFFF) {
|
|
return std::make_pair(
|
|
result(error_code::SURROGATE, buf - start + k),
|
|
reinterpret_cast<char *>(utf8_output));
|
|
}
|
|
*utf8_output++ = char((word >> 12) | 0b11100000);
|
|
*utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else {
|
|
if (word > 0x10FFFF) {
|
|
return std::make_pair(
|
|
result(error_code::TOO_LARGE, buf - start + k),
|
|
reinterpret_cast<char *>(utf8_output));
|
|
}
|
|
*utf8_output++ = char((word >> 18) | 0b11110000);
|
|
*utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
} // while
|
|
|
|
return std::make_pair(result(error_code::SUCCESS, buf - start),
|
|
reinterpret_cast<char *>(utf8_output));
|
|
}
|
|
/* end file src/lsx/lsx_convert_utf32_to_utf8.cpp */
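// [Illustrative sketch; not part of upstream simdutf.] A scalar rendering of
// the three bit layouts handled in the branch above. For example, U+20AC
// ([0010|0000|1010|1100]) falls into case #3 and yields 0xE2 0x82 0xAC:
//
//   // assumes cp < 0x10000 and cp is not a surrogate
//   inline int encode_bmp_utf8(uint32_t cp, uint8_t *out) {
//     if (cp < 0x80) { // case #1: [0ccc|cccc]
//       out[0] = uint8_t(cp);
//       return 1;
//     } else if (cp < 0x800) { // case #2: [110b|bbbb] [10cc|cccc]
//       out[0] = uint8_t(0xC0 | (cp >> 6));
//       out[1] = uint8_t(0x80 | (cp & 0x3F));
//       return 2;
//     } // case #3: [1110|aaaa] [10bb|bbbb] [10cc|cccc]
//     out[0] = uint8_t(0xE0 | (cp >> 12));
//     out[1] = uint8_t(0x80 | ((cp >> 6) & 0x3F));
//     out[2] = uint8_t(0x80 | (cp & 0x3F));
//     return 3;
//   }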
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
/* begin file src/lsx/lsx_convert_utf32_to_utf16.cpp */
template <endianness big_endian>
std::pair<const char32_t *, char16_t *>
lsx_convert_utf32_to_utf16(const char32_t *buf, size_t len,
                           char16_t *utf16_out) {
  uint16_t *utf16_output = reinterpret_cast<uint16_t *>(utf16_out);
  const char32_t *end = buf + len;

  __m128i forbidden_bytemask = __lsx_vrepli_h(0);
  __m128i v_d800 = lsx_splat_u16(0xd800);
  __m128i v_dfff = lsx_splat_u16(0xdfff);
  while (end - buf >= 8) {
    __m128i in0 = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 0);
    __m128i in1 = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 16);

    // Check that no bits are set above the 16th bit
    if (__lsx_bz_v(__lsx_vpickod_h(in1, in0))) {
      __m128i utf16_packed = __lsx_vpickev_h(in1, in0);
      forbidden_bytemask = __lsx_vor_v(
          __lsx_vand_v(
              __lsx_vsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff
              __lsx_vsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800
          forbidden_bytemask);

      if (!match_system(big_endian)) {
        utf16_packed = lsx_swap_bytes(utf16_packed);
      }
      __lsx_vst(utf16_packed, utf16_output, 0);
      utf16_output += 8;
      buf += 8;
    } else {
      size_t forward = 3;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint32_t word = buf[k];
        if ((word & 0xFFFF0000) == 0) {
          // will not generate a surrogate pair
          if (word >= 0xD800 && word <= 0xDFFF) {
            return std::make_pair(nullptr,
                                  reinterpret_cast<char16_t *>(utf16_output));
          }
          *utf16_output++ = !match_system(big_endian)
                                ? char16_t(word >> 8 | word << 8)
                                : char16_t(word);
        } else {
          // will generate a surrogate pair
          if (word > 0x10FFFF) {
            return std::make_pair(nullptr,
                                  reinterpret_cast<char16_t *>(utf16_output));
          }
          word -= 0x10000;
          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
          if (!match_system(big_endian)) {
            high_surrogate =
                uint16_t(high_surrogate >> 8 | high_surrogate << 8);
            low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
          }
          *utf16_output++ = char16_t(high_surrogate);
          *utf16_output++ = char16_t(low_surrogate);
        }
      }
      buf += k;
    }
  }

  // check for invalid input
  if (__lsx_bnz_v(forbidden_bytemask)) {
    return std::make_pair(nullptr, reinterpret_cast<char16_t *>(utf16_output));
  }
  return std::make_pair(buf, reinterpret_cast<char16_t *>(utf16_output));
}

template <endianness big_endian>
std::pair<result, char16_t *>
lsx_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len,
                                       char16_t *utf16_out) {
  uint16_t *utf16_output = reinterpret_cast<uint16_t *>(utf16_out);
  const char32_t *start = buf;
  const char32_t *end = buf + len;

  __m128i forbidden_bytemask = __lsx_vrepli_h(0);
  __m128i v_d800 = lsx_splat_u16(0xd800);
  __m128i v_dfff = lsx_splat_u16(0xdfff);

  while (end - buf >= 8) {
    __m128i in0 = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 0);
    __m128i in1 = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 16);
    // Check that no bits are set above the 16th bit
    if (__lsx_bz_v(__lsx_vpickod_h(in1, in0))) {
      __m128i utf16_packed = __lsx_vpickev_h(in1, in0);

      forbidden_bytemask = __lsx_vor_v(
          __lsx_vand_v(
              __lsx_vsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff
              __lsx_vsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800
          forbidden_bytemask);
      if (__lsx_bnz_v(forbidden_bytemask)) {
        return std::make_pair(result(error_code::SURROGATE, buf - start),
                              reinterpret_cast<char16_t *>(utf16_output));
      }

      if (!match_system(big_endian)) {
        utf16_packed = lsx_swap_bytes(utf16_packed);
      }

      __lsx_vst(utf16_packed, utf16_output, 0);
      utf16_output += 8;
      buf += 8;
    } else {
      size_t forward = 3;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint32_t word = buf[k];
        if ((word & 0xFFFF0000) == 0) {
          // will not generate a surrogate pair
          if (word >= 0xD800 && word <= 0xDFFF) {
            return std::make_pair(
                result(error_code::SURROGATE, buf - start + k),
                reinterpret_cast<char16_t *>(utf16_output));
          }
          *utf16_output++ = !match_system(big_endian)
                                ? char16_t(word >> 8 | word << 8)
                                : char16_t(word);
        } else {
          // will generate a surrogate pair
          if (word > 0x10FFFF) {
            return std::make_pair(
                result(error_code::TOO_LARGE, buf - start + k),
                reinterpret_cast<char16_t *>(utf16_output));
          }
          word -= 0x10000;
          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
          if (!match_system(big_endian)) {
            high_surrogate =
                uint16_t(high_surrogate >> 8 | high_surrogate << 8);
            low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
          }
          *utf16_output++ = char16_t(high_surrogate);
          *utf16_output++ = char16_t(low_surrogate);
        }
      }
      buf += k;
    }
  }

  return std::make_pair(result(error_code::SUCCESS, buf - start),
                        reinterpret_cast<char16_t *>(utf16_output));
}
/* end file src/lsx/lsx_convert_utf32_to_utf16.cpp */
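// [Illustrative sketch; not part of upstream simdutf.] The scalar fallback
// above builds surrogate pairs with the standard UTF-16 arithmetic; for
// example, U+1F600 maps to the pair 0xD83D 0xDE00:
//
//   // assumes 0x10000 <= cp <= 0x10FFFF
//   inline void to_surrogate_pair(uint32_t cp, uint16_t *hi, uint16_t *lo) {
//     cp -= 0x10000;                         // 0x1F600 -> 0x0F600
//     *hi = uint16_t(0xD800 + (cp >> 10));   // 0xD800 + 0x3D  = 0xD83D
//     *lo = uint16_t(0xDC00 + (cp & 0x3FF)); // 0xDC00 + 0x200 = 0xDE00
//   }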
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_BASE64
/* begin file src/lsx/lsx_base64.cpp */
/**
 * References and further reading:
 *
 * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
 * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
 * https://arxiv.org/abs/1910.05109
 *
 * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
 * Instructions, ACM Transactions on the Web 12 (3), 2018.
 * https://arxiv.org/abs/1704.00605
 *
 * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
 * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
 * Request for Comments: 4648.
 *
 * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
 * http://www.alfredklomp.com/programming/sse-base64/. (2014).
 *
 * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
 * acceleration. https://github.com/aklomp/base64. (2014).
 *
 * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
 * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
 *
 * Nick Kopp. 2013. Base64 Encoding on a GPU.
 * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
 */

template <bool isbase64url>
size_t encode_base64(char *dst, const char *src, size_t srclen,
                     base64_options options) {
  // credit: Wojciech Muła
  // SSE (lookup: pshufb improved unrolled)
  const uint8_t *input = (const uint8_t *)src;
  static const char *lookup_tbl =
      isbase64url
          ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
          : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
  uint8_t *out = (uint8_t *)dst;

  v16u8 shuf;
  __m128i v_fc0fc00, v_3f03f0, shift_r, shift_l, base64_tbl0, base64_tbl1,
      base64_tbl2, base64_tbl3;
  if (srclen >= 16) {
    shuf = v16u8{1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10};
    v_fc0fc00 = __lsx_vreplgr2vr_w(uint32_t(0x0fc0fc00));
    v_3f03f0 = __lsx_vreplgr2vr_w(uint32_t(0x003f03f0));
    shift_r = __lsx_vreplgr2vr_w(uint32_t(0x0006000a));
    shift_l = __lsx_vreplgr2vr_w(uint32_t(0x00080004));
    base64_tbl0 = __lsx_vld(lookup_tbl, 0);
    base64_tbl1 = __lsx_vld(lookup_tbl, 16);
    base64_tbl2 = __lsx_vld(lookup_tbl, 32);
    base64_tbl3 = __lsx_vld(lookup_tbl, 48);
  }

  size_t i = 0;
  for (; i + 52 <= srclen; i += 48) {
    __m128i in0 =
        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 0);
    __m128i in1 =
        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 1);
    __m128i in2 =
        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 2);
    __m128i in3 =
        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 3);

    in0 = __lsx_vshuf_b(in0, in0, (__m128i)shuf);
    in1 = __lsx_vshuf_b(in1, in1, (__m128i)shuf);
    in2 = __lsx_vshuf_b(in2, in2, (__m128i)shuf);
    in3 = __lsx_vshuf_b(in3, in3, (__m128i)shuf);

    __m128i t0_0 = __lsx_vand_v(in0, v_fc0fc00);
    __m128i t0_1 = __lsx_vand_v(in1, v_fc0fc00);
    __m128i t0_2 = __lsx_vand_v(in2, v_fc0fc00);
    __m128i t0_3 = __lsx_vand_v(in3, v_fc0fc00);

    __m128i t1_0 = __lsx_vsrl_h(t0_0, shift_r);
    __m128i t1_1 = __lsx_vsrl_h(t0_1, shift_r);
    __m128i t1_2 = __lsx_vsrl_h(t0_2, shift_r);
    __m128i t1_3 = __lsx_vsrl_h(t0_3, shift_r);

    __m128i t2_0 = __lsx_vand_v(in0, v_3f03f0);
    __m128i t2_1 = __lsx_vand_v(in1, v_3f03f0);
    __m128i t2_2 = __lsx_vand_v(in2, v_3f03f0);
    __m128i t2_3 = __lsx_vand_v(in3, v_3f03f0);

    __m128i t3_0 = __lsx_vsll_h(t2_0, shift_l);
    __m128i t3_1 = __lsx_vsll_h(t2_1, shift_l);
    __m128i t3_2 = __lsx_vsll_h(t2_2, shift_l);
    __m128i t3_3 = __lsx_vsll_h(t2_3, shift_l);

    __m128i input0 = __lsx_vor_v(t1_0, t3_0);
    __m128i input0_shuf0 = __lsx_vshuf_b(base64_tbl1, base64_tbl0, input0);
    __m128i input0_shuf1 = __lsx_vshuf_b(base64_tbl3, base64_tbl2,
                                         __lsx_vsub_b(input0, __lsx_vldi(32)));
    __m128i input0_mask = __lsx_vslei_bu(input0, 31);
    __m128i input0_result =
        __lsx_vbitsel_v(input0_shuf1, input0_shuf0, input0_mask);
    __lsx_vst(input0_result, reinterpret_cast<__m128i *>(out), 0);
    out += 16;

    __m128i input1 = __lsx_vor_v(t1_1, t3_1);
    __m128i input1_shuf0 = __lsx_vshuf_b(base64_tbl1, base64_tbl0, input1);
    __m128i input1_shuf1 = __lsx_vshuf_b(base64_tbl3, base64_tbl2,
                                         __lsx_vsub_b(input1, __lsx_vldi(32)));
    __m128i input1_mask = __lsx_vslei_bu(input1, 31);
    __m128i input1_result =
        __lsx_vbitsel_v(input1_shuf1, input1_shuf0, input1_mask);
    __lsx_vst(input1_result, reinterpret_cast<__m128i *>(out), 0);
    out += 16;

    __m128i input2 = __lsx_vor_v(t1_2, t3_2);
    __m128i input2_shuf0 = __lsx_vshuf_b(base64_tbl1, base64_tbl0, input2);
    __m128i input2_shuf1 = __lsx_vshuf_b(base64_tbl3, base64_tbl2,
                                         __lsx_vsub_b(input2, __lsx_vldi(32)));
    __m128i input2_mask = __lsx_vslei_bu(input2, 31);
    __m128i input2_result =
        __lsx_vbitsel_v(input2_shuf1, input2_shuf0, input2_mask);
    __lsx_vst(input2_result, reinterpret_cast<__m128i *>(out), 0);
    out += 16;

    __m128i input3 = __lsx_vor_v(t1_3, t3_3);
    __m128i input3_shuf0 = __lsx_vshuf_b(base64_tbl1, base64_tbl0, input3);
    __m128i input3_shuf1 = __lsx_vshuf_b(base64_tbl3, base64_tbl2,
                                         __lsx_vsub_b(input3, __lsx_vldi(32)));
    __m128i input3_mask = __lsx_vslei_bu(input3, 31);
    __m128i input3_result =
        __lsx_vbitsel_v(input3_shuf1, input3_shuf0, input3_mask);
    __lsx_vst(input3_result, reinterpret_cast<__m128i *>(out), 0);
    out += 16;
  }
  for (; i + 16 <= srclen; i += 12) {

    __m128i in = __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 0);

    // bytes from groups A, B and C are needed in separate 32-bit lanes
    // in = [DDDD|CCCC|BBBB|AAAA]
    //
    // an input triplet has layout
    // [????????|ccdddddd|bbbbcccc|aaaaaabb]
    //   byte 3   byte 2   byte 1   byte 0 -- byte 3 comes from the next
    //                                        triplet
    //
    // shuffling changes the order of bytes: 1, 0, 2, 1
    // [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc]
    //      ^^^^ ^^^^^^^^ ^^^^^^^^ ^^^^
    //               processed bits
    in = __lsx_vshuf_b(in, in, (__m128i)shuf);

    // unpacking
    // t0 = [0000cccc|cc000000|aaaaaa00|00000000]
    __m128i t0 = __lsx_vand_v(in, v_fc0fc00);
    // t1 = [00000000|00cccccc|00000000|00aaaaaa]
    //      ((c >> 6), (a >> 10))
    __m128i t1 = __lsx_vsrl_h(t0, shift_r);

    // t2 = [00000000|00dddddd|000000bb|bbbb0000]
    __m128i t2 = __lsx_vand_v(in, v_3f03f0);
    // t3 = [00dddddd|00000000|00bbbbbb|00000000]
    //      ((d << 8), (b << 4))
    __m128i t3 = __lsx_vsll_h(t2, shift_l);

    // res = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] = t1 | t3
    __m128i indices = __lsx_vor_v(t1, t3);

    __m128i indices_shuf0 = __lsx_vshuf_b(base64_tbl1, base64_tbl0, indices);
    __m128i indices_shuf1 = __lsx_vshuf_b(
        base64_tbl3, base64_tbl2, __lsx_vsub_b(indices, __lsx_vldi(32)));
    __m128i indices_mask = __lsx_vslei_bu(indices, 31);
    __m128i indices_result =
        __lsx_vbitsel_v(indices_shuf1, indices_shuf0, indices_mask);

    __lsx_vst(indices_result, reinterpret_cast<__m128i *>(out), 0);
    out += 16;
  }

  return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i,
                                                        srclen - i, options);
}
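// [Illustrative sketch; not part of upstream simdutf.] The vector loop above
// produces 16 output characters from 12 input bytes at a time; a scalar
// reference for one step of the same transformation encodes each 3-byte
// triple as four 6-bit indices into the lookup table:
//
//   inline void encode_triple(const uint8_t *in, const char *tbl, char *out) {
//     uint32_t triple =
//         (uint32_t(in[0]) << 16) | (uint32_t(in[1]) << 8) | in[2];
//     out[0] = tbl[(triple >> 18) & 0x3F];
//     out[1] = tbl[(triple >> 12) & 0x3F];
//     out[2] = tbl[(triple >> 6) & 0x3F];
//     out[3] = tbl[triple & 0x3F];
//   }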

static inline void compress(__m128i data, uint16_t mask, char *output) {
  if (mask == 0) {
    __lsx_vst(data, reinterpret_cast<__m128i *>(output), 0);
    return;
  }
  // this particular implementation was inspired by work done by @animetosho
  // we do it in two steps, first 8 bytes and then second 8 bytes
  uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
  uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
  // next line just loads the 64-bit values thintable_epi8[mask1] and
  // thintable_epi8[mask2] into a 128-bit register, using only
  // two instructions on most compilers.

  v2u64 shufmask = {tables::base64::thintable_epi8[mask1],
                    tables::base64::thintable_epi8[mask2]};

  // we increment by 0x08 the second half of the mask
  v4u32 hi = {0, 0, 0x08080808, 0x08080808};
  __m128i shufmask1 = __lsx_vadd_b((__m128i)shufmask, (__m128i)hi);

  // this is the version "nearly pruned"
  __m128i pruned = __lsx_vshuf_b(data, data, shufmask1);
  // we still need to put the two halves together.
  // we compute the popcount of the first half:
  int pop1 = tables::base64::BitsSetTable256mul2[mask1];
  // then load the corresponding mask: it writes only the first pop1 bytes
  // from the first 8 bytes, and then fills in with the bytes from the
  // second 8 bytes, plus some filler at the end.
  __m128i compactmask =
      __lsx_vld(reinterpret_cast<const __m128i *>(
                    tables::base64::pshufb_combine_table + pop1 * 8),
                0);
  __m128i answer = __lsx_vshuf_b(pruned, pruned, compactmask);

  __lsx_vst(answer, reinterpret_cast<__m128i *>(output), 0);
}
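// [Illustrative sketch; not part of upstream simdutf.] compress() keeps the
// bytes whose mask bit is clear and packs them to the front of the output; a
// scalar statement of the same contract (handy when testing the table-driven
// version above) would be:
//
//   inline void scalar_compress(const uint8_t *data, uint16_t mask,
//                               uint8_t *output) {
//     for (int i = 0; i < 16; i++) {
//       if (!(mask & (uint16_t(1) << i))) { // a set bit discards the byte
//         *output++ = data[i];
//       }
//     }
//   }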

struct block64 {
  __m128i chunks[4];
};

template <bool base64_url>
static inline uint16_t to_base64_mask(__m128i *src, bool *error) {
  const v16u8 ascii_space_tbl = {0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
                                 0x0,  0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0};
  // credit: aqrit
  /*
  '0'(0x30)-'9'(0x39) => delta_values_index = 4
  'A'(0x41)-'Z'(0x5a) => delta_values_index = 4/5/12(4+8)
  'a'(0x61)-'z'(0x7a) => delta_values_index = 6/7/14(6+8)
  '+'(0x2b)           => delta_values_index = 3
  '/'(0x2f)           => delta_values_index = 2+8 = 10
  '-'(0x2d)           => delta_values_index = 2+8 = 10
  '_'(0x5f)           => delta_values_index = 5+8 = 13
  */
  v16u8 delta_asso = {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
                      0x0, 0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF};
  v16i8 delta_values;
  if (base64_url) {
    delta_values =
        v16i8{int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
              int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
              int8_t(0xB9), int8_t(0x00), int8_t(0x11), int8_t(0xC3),
              int8_t(0xBF), int8_t(0xE0), int8_t(0xB9), int8_t(0xB9)};
  } else {
    delta_values =
        v16i8{int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
              int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
              int8_t(0xB9), int8_t(0x00), int8_t(0x10), int8_t(0xC3),
              int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9)};
  }

  v16u8 check_asso;
  if (base64_url) {
    check_asso = v16u8{0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                       0x01, 0x01, 0x03, 0x07, 0x0B, 0x06, 0x0B, 0x12};
  } else {
    check_asso = v16u8{0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                       0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F};
  }

  v16i8 check_values;
  if (base64_url) {
    check_values = v16i8{int8_t(0x0),  int8_t(0x80), int8_t(0x80), int8_t(0x80),
                         int8_t(0xCF), int8_t(0xBF), int8_t(0xD3), int8_t(0xA6),
                         int8_t(0xB5), int8_t(0x86), int8_t(0xD0), int8_t(0x80),
                         int8_t(0xB0), int8_t(0x80), int8_t(0x0),  int8_t(0x0)};
  } else {
    check_values =
        v16i8{int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
              int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6),
              int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80),
              int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80)};
  }

  const __m128i shifted = __lsx_vsrli_b(*src, 3);
  __m128i asso_index = __lsx_vand_v(*src, __lsx_vldi(0xF));
  const __m128i delta_hash =
      __lsx_vavgr_bu(__lsx_vshuf_b((__m128i)delta_asso, (__m128i)delta_asso,
                                   (__m128i)asso_index),
                     shifted);
  const __m128i check_hash =
      __lsx_vavgr_bu(__lsx_vshuf_b((__m128i)check_asso, (__m128i)check_asso,
                                   (__m128i)asso_index),
                     shifted);

  const __m128i out =
      __lsx_vsadd_b(__lsx_vshuf_b((__m128i)delta_values, (__m128i)delta_values,
                                  (__m128i)delta_hash),
                    *src);
  const __m128i chk =
      __lsx_vsadd_b(__lsx_vshuf_b((__m128i)check_values, (__m128i)check_values,
                                  (__m128i)check_hash),
                    *src);
  unsigned int mask = __lsx_vpickve2gr_hu(__lsx_vmskltz_b(chk), 0);
  if (mask) {
    __m128i ascii_space = __lsx_vseq_b(__lsx_vshuf_b((__m128i)ascii_space_tbl,
                                                     (__m128i)ascii_space_tbl,
                                                     (__m128i)asso_index),
                                       *src);
    *error |=
        (mask != __lsx_vpickve2gr_hu(__lsx_vmskltz_b((__m128i)ascii_space), 0));
  }

  *src = out;
  return (uint16_t)mask;
}
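// [Illustrative sketch; not part of upstream simdutf.] For a single byte, the
// hash-based classification above matches a plain 256-entry table lookup; a
// hedged scalar rendering of the returned mask (bit i set when byte i is not
// a base64 digit, whitespace included) might look like:
//
//   // 'to_base64' maps base64 digits to 0..63 and everything else to >= 64,
//   // as in tables::base64::to_base64_value.
//   inline uint16_t scalar_to_base64_mask(const uint8_t *src,
//                                         const uint8_t *to_base64) {
//     uint16_t mask = 0;
//     for (int i = 0; i < 16; i++) {
//       if (to_base64[src[i]] > 63) {
//         mask |= uint16_t(1) << i;
//       }
//     }
//     return mask;
//   }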

template <bool base64_url>
static inline uint64_t to_base64_mask(block64 *b, bool *error) {
  *error = 0;
  uint64_t m0 = to_base64_mask<base64_url>(&b->chunks[0], error);
  uint64_t m1 = to_base64_mask<base64_url>(&b->chunks[1], error);
  uint64_t m2 = to_base64_mask<base64_url>(&b->chunks[2], error);
  uint64_t m3 = to_base64_mask<base64_url>(&b->chunks[3], error);
  return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48);
}

static inline void copy_block(block64 *b, char *output) {
  __lsx_vst(b->chunks[0], reinterpret_cast<__m128i *>(output), 0);
  __lsx_vst(b->chunks[1], reinterpret_cast<__m128i *>(output), 16);
  __lsx_vst(b->chunks[2], reinterpret_cast<__m128i *>(output), 32);
  __lsx_vst(b->chunks[3], reinterpret_cast<__m128i *>(output), 48);
}

static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
  uint64_t nmask = ~mask;
  uint64_t count =
      __lsx_vpickve2gr_d(__lsx_vpcnt_h(__lsx_vreplgr2vr_d(nmask)), 0);
  uint16_t *count_ptr = (uint16_t *)&count;
  compress(b->chunks[0], uint16_t(mask), output);
  compress(b->chunks[1], uint16_t(mask >> 16), output + count_ptr[0]);
  compress(b->chunks[2], uint16_t(mask >> 32),
           output + count_ptr[0] + count_ptr[1]);
  compress(b->chunks[3], uint16_t(mask >> 48),
           output + count_ptr[0] + count_ptr[1] + count_ptr[2]);
  return count_ones(nmask);
}

// The caller of this function is responsible for ensuring that 64 bytes are
// available for reading at src. The data is read into a block64 structure.
static inline void load_block(block64 *b, const char *src) {
  b->chunks[0] = __lsx_vld(reinterpret_cast<const __m128i *>(src), 0);
  b->chunks[1] = __lsx_vld(reinterpret_cast<const __m128i *>(src), 16);
  b->chunks[2] = __lsx_vld(reinterpret_cast<const __m128i *>(src), 32);
  b->chunks[3] = __lsx_vld(reinterpret_cast<const __m128i *>(src), 48);
}

// The caller of this function is responsible for ensuring that 128 bytes are
// available for reading at src. The data is read into a block64 structure.
static inline void load_block(block64 *b, const char16_t *src) {
  __m128i m1 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 0);
  __m128i m2 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 16);
  __m128i m3 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 32);
  __m128i m4 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 48);
  __m128i m5 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 64);
  __m128i m6 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 80);
  __m128i m7 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 96);
  __m128i m8 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 112);
  b->chunks[0] = __lsx_vssrlni_bu_h(m2, m1, 0);
  b->chunks[1] = __lsx_vssrlni_bu_h(m4, m3, 0);
  b->chunks[2] = __lsx_vssrlni_bu_h(m6, m5, 0);
  b->chunks[3] = __lsx_vssrlni_bu_h(m8, m7, 0);
}
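// [Illustrative note; not part of upstream simdutf.] The __lsx_vssrlni_bu_h
// calls above narrow each UTF-16 code unit to a byte with unsigned
// saturation (shift of 0), so any unit above 0xFF collapses to 0xFF, which
// the base64 classification later rejects. A scalar sketch of the narrowing:
//
//   inline uint8_t saturating_narrow(uint16_t v) {
//     return v > 0xFF ? uint8_t(0xFF) : uint8_t(v);
//   }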

static inline void base64_decode(char *out, __m128i str) {
  __m128i t0 = __lsx_vor_v(
      __lsx_vslli_w(str, 26),
      __lsx_vslli_w(__lsx_vand_v(str, lsx_splat_u32(0x0000FF00)), 12));
  __m128i t1 = __lsx_vsrli_w(__lsx_vand_v(str, lsx_splat_u32(0x003F0000)), 2);
  __m128i t2 = __lsx_vor_v(t0, t1);
  __m128i t3 = __lsx_vor_v(t2, __lsx_vsrli_w(str, 16));
  const v16u8 pack_shuffle = {3, 2,  1,  7,  6, 5, 11, 10,
                              9, 15, 14, 13, 0, 0, 0,  0};
  t3 = __lsx_vshuf_b(t3, t3, (__m128i)pack_shuffle);

  // Store the output:
  // we only need 12 bytes.
  __lsx_vstelm_d(t3, out, 0, 0);
  __lsx_vstelm_w(t3, out + 8, 0, 2);
}
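// [Illustrative sketch; not part of upstream simdutf.] Setting aside the
// vector lane ordering above, decoding four 6-bit values back into three
// bytes is the usual bit gather:
//
//   inline void decode_quad(const uint8_t *vals, uint8_t *out) {
//     uint32_t triple = (uint32_t(vals[0]) << 18) |
//                       (uint32_t(vals[1]) << 12) |
//                       (uint32_t(vals[2]) << 6) | vals[3];
//     out[0] = uint8_t(triple >> 16);
//     out[1] = uint8_t(triple >> 8);
//     out[2] = uint8_t(triple);
//   }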
// decode 64 bytes and output 48 bytes
static inline void base64_decode_block(char *out, const char *src) {
  base64_decode(out, __lsx_vld(reinterpret_cast<const __m128i *>(src), 0));
  base64_decode(out + 12,
                __lsx_vld(reinterpret_cast<const __m128i *>(src), 16));
  base64_decode(out + 24,
                __lsx_vld(reinterpret_cast<const __m128i *>(src), 32));
  base64_decode(out + 36,
                __lsx_vld(reinterpret_cast<const __m128i *>(src), 48));
}
static inline void base64_decode_block_safe(char *out, const char *src) {
  base64_decode_block(out, src);
}
static inline void base64_decode_block(char *out, block64 *b) {
  base64_decode(out, b->chunks[0]);
  base64_decode(out + 12, b->chunks[1]);
  base64_decode(out + 24, b->chunks[2]);
  base64_decode(out + 36, b->chunks[3]);
}
static inline void base64_decode_block_safe(char *out, block64 *b) {
  base64_decode_block(out, b);
}

template <bool base64_url, bool ignore_garbage, typename char_type>
full_result
compress_decode_base64(char *dst, const char_type *src, size_t srclen,
                       base64_options options,
                       last_chunk_handling_options last_chunk_options) {
  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
                                        : tables::base64::to_base64_value;
  size_t equallocation =
      srclen; // location of the first padding character if any
  // skip trailing spaces
  while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
         to_base64[uint8_t(src[srclen - 1])] == 64) {
    srclen--;
  }
  size_t equalsigns = 0;
  if (srclen > 0 && src[srclen - 1] == '=') {
    equallocation = srclen - 1;
    srclen--;
    equalsigns = 1;
    // skip trailing spaces
    while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
           to_base64[uint8_t(src[srclen - 1])] == 64) {
      srclen--;
    }
    if (srclen > 0 && src[srclen - 1] == '=') {
      equallocation = srclen - 1;
      srclen--;
      equalsigns = 2;
    }
  }
  if (srclen == 0) {
    if (!ignore_garbage && equalsigns > 0) {
      if (last_chunk_options == last_chunk_handling_options::strict) {
        return {BASE64_INPUT_REMAINDER, 0, 0};
      } else if (last_chunk_options ==
                 last_chunk_handling_options::stop_before_partial) {
        return {SUCCESS, 0, 0};
      }
      return {INVALID_BASE64_CHARACTER, equallocation, 0};
    }
    return {SUCCESS, 0, 0};
  }
  const char_type *const srcinit = src;
  const char *const dstinit = dst;
  const char_type *const srcend = src + srclen;

  constexpr size_t block_size = 10;
  char buffer[block_size * 64];
  char *bufferptr = buffer;
  if (srclen >= 64) {
    const char_type *const srcend64 = src + srclen - 64;
    while (src <= srcend64) {
      block64 b;
      load_block(&b, src);
      src += 64;
      bool error = false;
      uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
      if (badcharmask) {
        if (error && !ignore_garbage) {
          src -= 64;
          while (src < srcend && scalar::base64::is_eight_byte(*src) &&
                 to_base64[uint8_t(*src)] <= 64) {
            src++;
          }
          if (src < srcend) {
            // should never happen
          }
          return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
                  size_t(dst - dstinit)};
        }
      }

      if (badcharmask != 0) {
        // optimization opportunity: check for simple masks like those made of
        // continuous 1s followed by continuous 0s. And masks containing a
        // single bad character.
        bufferptr += compress_block(&b, badcharmask, bufferptr);
      } else {
        // optimization opportunity: if bufferptr == buffer and mask == 0, we
        // can avoid the call to compress_block and decode directly.
        copy_block(&b, bufferptr);
        bufferptr += 64;
      }
      if (bufferptr >= (block_size - 1) * 64 + buffer) {
        for (size_t i = 0; i < (block_size - 1); i++) {
          base64_decode_block(dst, buffer + i * 64);
          dst += 48;
        }
        std::memcpy(buffer, buffer + (block_size - 1) * 64,
                    64); // 64 might be too much
        bufferptr -= (block_size - 1) * 64;
      }
    }
  }
  char *buffer_start = buffer;
  // Optimization note: if this is almost full, then it is worth our
  // time, otherwise, we should just decode directly.
  int last_block = (int)((bufferptr - buffer_start) % 64);
  if (last_block != 0 && srcend - src + last_block >= 64) {
    while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
      uint8_t val = to_base64[uint8_t(*src)];
      *bufferptr = char(val);
      if ((!scalar::base64::is_eight_byte(*src) || val > 64) &&
          !ignore_garbage) {
        return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
                size_t(dst - dstinit)};
      }
      bufferptr += (val <= 63);
      src++;
    }
  }

  for (; buffer_start + 64 <= bufferptr; buffer_start += 64) {
    base64_decode_block(dst, buffer_start);
    dst += 48;
  }
  if ((bufferptr - buffer_start) % 64 != 0) {
    while (buffer_start + 4 < bufferptr) {
      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
                        << 8;
      triple = scalar::u32_swap_bytes(triple);
      std::memcpy(dst, &triple, 4);

      dst += 3;
      buffer_start += 4;
    }
    if (buffer_start + 4 <= bufferptr) {
      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
                        << 8;
      triple = scalar::u32_swap_bytes(triple);
      std::memcpy(dst, &triple, 3);

      dst += 3;
      buffer_start += 4;
    }
    // we may have 1, 2 or 3 bytes left and we need to decode them so let us
    // backtrack
    int leftover = int(bufferptr - buffer_start);
    while (leftover > 0) {
      if (!ignore_garbage) {
        while (to_base64[uint8_t(*(src - 1))] == 64) {
          src--;
        }
      } else {
        while (to_base64[uint8_t(*(src - 1))] >= 64) {
          src--;
        }
      }
      src--;
      leftover--;
    }
  }
  if (src < srcend + equalsigns) {
    full_result r = scalar::base64::base64_tail_decode(
        dst, src, srcend - src, equalsigns, options, last_chunk_options);
    r.input_count += size_t(src - srcinit);
    if (r.error == error_code::INVALID_BASE64_CHARACTER ||
        r.error == error_code::BASE64_EXTRA_BITS) {
      return r;
    } else {
      r.output_count += size_t(dst - dstinit);
    }
    if (last_chunk_options != stop_before_partial &&
        r.error == error_code::SUCCESS && equalsigns > 0 && !ignore_garbage) {
      // additional checks
      if ((r.output_count % 3 == 0) ||
          ((r.output_count % 3) + 1 + equalsigns != 4)) {
        r.error = error_code::INVALID_BASE64_CHARACTER;
        r.input_count = equallocation;
      }
    }
    return r;
  }
  if (equalsigns > 0 && !ignore_garbage) {
    if ((size_t(dst - dstinit) % 3 == 0) ||
        ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
      return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)};
    }
  }
  return {SUCCESS, srclen, size_t(dst - dstinit)};
}
/* end file src/lsx/lsx_base64.cpp */
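// [Illustrative note; not part of upstream simdutf.] The padding checks in
// compress_decode_base64 enforce that, when '=' signs are present,
// (output_count % 3) + 1 + equalsigns == 4 with output_count % 3 != 0.
// For example, "TQ==" decodes to 1 byte (1 + 1 + 2 == 4, valid) and "TWE="
// decodes to 2 bytes (2 + 1 + 1 == 4, valid); anything else is reported as
// INVALID_BASE64_CHARACTER. Equivalently:
//
//   inline bool padding_consistent(size_t output_count, size_t equalsigns) {
//     return (output_count % 3) != 0 &&
//            (output_count % 3) + 1 + equalsigns == 4;
//   }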
#endif // SIMDUTF_FEATURE_BASE64

} // namespace
} // namespace lsx
} // namespace simdutf

/* begin file src/generic/buf_block_reader.h */
namespace simdutf {
namespace lsx {
namespace {

// Walks through a buffer in block-sized increments, loading the last part with
// spaces
template <size_t STEP_SIZE> struct buf_block_reader {
public:
  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
  simdutf_really_inline size_t block_index();
  simdutf_really_inline bool has_full_block() const;
  simdutf_really_inline const uint8_t *full_block() const;
  /**
   * Get the last block, padded with spaces.
   *
   * There will always be a last block, with at least 1 byte, unless len == 0
   * (in which case this function fills the buffer with spaces and returns 0).
   * In particular, if len == STEP_SIZE there will be 0 full blocks and 1
   * remainder block with STEP_SIZE bytes and no spaces for padding.
   *
   * @return the number of effective characters in the last block.
   */
  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
  simdutf_really_inline void advance();

private:
  const uint8_t *buf;
  const size_t len;
  const size_t lenminusstep;
  size_t idx;
};

// Routines to print masks and text for debugging bitmask operations
simdutf_unused static char *format_input_text_64(const uint8_t *text) {
  static char *buf =
      reinterpret_cast<char *>(malloc(sizeof(simd8x64<uint8_t>) + 1));
  for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
    buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
  }
  buf[sizeof(simd8x64<uint8_t>)] = '\0';
  return buf;
}

// Routines to print masks and text for debugging bitmask operations
simdutf_unused static char *format_input_text(const simd8x64<uint8_t> &in) {
  static char *buf =
      reinterpret_cast<char *>(malloc(sizeof(simd8x64<uint8_t>) + 1));
  in.store(reinterpret_cast<uint8_t *>(buf));
  for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
    if (buf[i] < ' ') {
      buf[i] = '_';
    }
  }
  buf[sizeof(simd8x64<uint8_t>)] = '\0';
  return buf;
}

simdutf_unused static char *format_mask(uint64_t mask) {
  static char *buf = reinterpret_cast<char *>(malloc(64 + 1));
  for (size_t i = 0; i < 64; i++) {
    buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
  }
  buf[64] = '\0';
  return buf;
}

template <size_t STEP_SIZE>
simdutf_really_inline
buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len)
    : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE},
      idx{0} {}

template <size_t STEP_SIZE>
simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() {
  return idx;
}

template <size_t STEP_SIZE>
simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
  return idx < lenminusstep;
}

template <size_t STEP_SIZE>
simdutf_really_inline const uint8_t *
buf_block_reader<STEP_SIZE>::full_block() const {
  return &buf[idx];
}

template <size_t STEP_SIZE>
simdutf_really_inline size_t
buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
  if (len == idx) {
    return 0;
  } // memcpy(dst, null, 0) will trigger an error with some sanitizers
  std::memset(dst, 0x20,
              STEP_SIZE); // std::memset STEP_SIZE because it is more efficient
                          // to write out 8 or 16 bytes at once.
  std::memcpy(dst, buf + idx, len - idx);
  return len - idx;
}

template <size_t STEP_SIZE>
simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
  idx += STEP_SIZE;
}

} // unnamed namespace
} // namespace lsx
} // namespace simdutf
/* end file src/generic/buf_block_reader.h */
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
namespace simdutf {
namespace lsx {
namespace {
namespace utf8_validation {

using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1.
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              CARRY | TOO_LARGE,
              // ____0101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____011_ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,

              // ____1___ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____1101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
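// [Illustrative note; not part of upstream simdutf.] Worked example of the
// three lookups above on the overlong pair 0xE0 0x80: prev1 = 0xE0 has high
// nibble 0xE, so byte_1_high contributes TOO_SHORT | OVERLONG_3 | SURROGATE;
// its low nibble 0x0 makes byte_1_low contribute CARRY | OVERLONG_3 |
// OVERLONG_2 | OVERLONG_4; the current byte 0x80 has high nibble 0x8, whose
// byte_2_high entry also contains OVERLONG_3. The AND of the three values
// leaves exactly OVERLONG_3 set, flagging the overlong encoding as an error.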
simdutf_really_inline simd8<uint8_t>
check_multibyte_lengths(const simd8<uint8_t> input,
                        const simd8<uint8_t> prev_input,
                        const simd8<uint8_t> sc) {
  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
  simd8<uint8_t> must23 =
      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
  return must23_80 ^ sc;
}

//
// Return nonzero if there are incomplete multibyte characters at the end of the
// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end.
//
simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
  // If the previous input's last 3 bytes match this, they're too short (they
  // ended at EOF):
  // ... 1111____ 111_____ 11______
  static const uint8_t max_array[32] = {
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 0b11110000u - 1,
      0b11100000u - 1, 0b11000000u - 1};
  const simd8<uint8_t> max_value(
      &max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]);
  return input.gt_bits(max_value);
}
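// [Illustrative note; not part of upstream simdutf.] For example, if a
// 64-byte block ends with the byte 0xF0 (a 4-byte lead with no continuation
// bytes after it), that byte exceeds the final max_array entry
// (0b11110000 - 1), so is_incomplete() returns a nonzero vector and the
// checker will demand continuation bytes at the start of the next block.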

struct utf8_checker {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;
  // The last input we received
  simd8<uint8_t> prev_input_block;
  // Whether the last input we received was incomplete (used for ASCII fast
  // path)
  simd8<uint8_t> prev_incomplete;

  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    simd8<uint8_t> sc = check_special_cases(input, prev1);
    this->error |= check_multibyte_lengths(input, prev_input, sc);
  }

  // The only problem that can happen at EOF is that a multibyte character is
  // too short or a byte value too large in the last bytes: check_special_cases
  // only checks for bytes too large in the first of two bytes.
  simdutf_really_inline void check_eof() {
    // If the previous block had incomplete UTF-8 characters at the end, an
    // ASCII block can't possibly finish them.
    this->error |= this->prev_incomplete;
  }

  simdutf_really_inline void check_next_input(const simd8x64<uint8_t> &input) {
    if (simdutf_likely(is_ascii(input))) {
      this->error |= this->prev_incomplete;
    } else {
      // you might think that a for-loop would work, but under Visual Studio, it
      // is not good enough.
      static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                        (simd8x64<uint8_t>::NUM_CHUNKS == 4),
                    "We support either two or four chunks per 64-byte block.");
      if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
      } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
        this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
      }
      this->prev_incomplete =
          is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]);
      this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1];
    }
  }

  // do not forget to call check_eof!
  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct utf8_checker
} // namespace utf8_validation

using utf8_validation::utf8_checker;

} // unnamed namespace
} // namespace lsx
} // namespace simdutf
/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
/* begin file src/generic/utf8_validation/utf8_validator.h */
namespace simdutf {
namespace lsx {
namespace {
namespace utf8_validation {

/**
 * Validates that the string is actual UTF-8.
 */
template <class checker>
bool generic_validate_utf8(const uint8_t *input, size_t length) {
  checker c{};
  buf_block_reader<64> reader(input, length);
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    c.check_next_input(in);
    reader.advance();
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  c.check_next_input(in);
  reader.advance();
  c.check_eof();
  return !c.errors();
}

bool generic_validate_utf8(const char *input, size_t length) {
  return generic_validate_utf8<utf8_checker>(
      reinterpret_cast<const uint8_t *>(input), length);
}

/**
 * Validates that the string is actual UTF-8 and stops on errors.
 */
template <class checker>
result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) {
  checker c{};
  buf_block_reader<64> reader(input, length);
  size_t count{0};
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    c.check_next_input(in);
    if (c.errors()) {
      if (count != 0) {
        count--;
      } // Sometimes the error is only detected in the next chunk
      result res = scalar::utf8::rewind_and_validate_with_errors(
          reinterpret_cast<const char *>(input),
          reinterpret_cast<const char *>(input + count), length - count);
      res.count += count;
      return res;
    }
    reader.advance();
    count += 64;
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  c.check_next_input(in);
  reader.advance();
  c.check_eof();
  if (c.errors()) {
    if (count != 0) {
      count--;
    } // Sometimes the error is only detected in the next chunk
    result res = scalar::utf8::rewind_and_validate_with_errors(
        reinterpret_cast<const char *>(input),
        reinterpret_cast<const char *>(input) + count, length - count);
    res.count += count;
    return res;
  } else {
    return result(error_code::SUCCESS, length);
  }
}

result generic_validate_utf8_with_errors(const char *input, size_t length) {
  return generic_validate_utf8_with_errors<utf8_checker>(
      reinterpret_cast<const uint8_t *>(input), length);
}

} // namespace utf8_validation
} // unnamed namespace
} // namespace lsx
} // namespace simdutf
/* end file src/generic/utf8_validation/utf8_validator.h */
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
#if SIMDUTF_FEATURE_ASCII
/* begin file src/generic/ascii_validation.h */
namespace simdutf {
namespace lsx {
namespace {
namespace ascii_validation {

bool generic_validate_ascii(const char *input, size_t length) {
  buf_block_reader<64> reader(reinterpret_cast<const uint8_t *>(input), length);
  uint8_t blocks[64]{};
  simd::simd8x64<uint8_t> running_or(blocks);
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    running_or |= in;
    reader.advance();
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  running_or |= in;
  return running_or.is_ascii();
}
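// [Illustrative sketch; not part of upstream simdutf.] The vectorised routine
// above is equivalent to OR-ing every input byte together and checking the
// top bit at the end; a scalar reference version:
//
//   bool scalar_validate_ascii(const char *input, size_t length) {
//     uint8_t running_or = 0;
//     for (size_t i = 0; i < length; i++) {
//       running_or |= uint8_t(input[i]); // any byte >= 0x80 sets bit 7
//     }
//     return (running_or & 0x80) == 0;
//   }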

result generic_validate_ascii_with_errors(const char *input, size_t length) {
  buf_block_reader<64> reader(reinterpret_cast<const uint8_t *>(input), length);
  size_t count{0};
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    if (!in.is_ascii()) {
      result res = scalar::ascii::validate_with_errors(
          reinterpret_cast<const char *>(input + count), length - count);
      return result(res.error, count + res.count);
    }
    reader.advance();

    count += 64;
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  if (!in.is_ascii()) {
    result res = scalar::ascii::validate_with_errors(
        reinterpret_cast<const char *>(input + count), length - count);
    return result(res.error, count + res.count);
  } else {
    return result(error_code::SUCCESS, length);
  }
}

} // namespace ascii_validation
} // unnamed namespace
} // namespace lsx
} // namespace simdutf
/* end file src/generic/ascii_validation.h */
#endif // SIMDUTF_FEATURE_ASCII

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
// transcoding from UTF-8 to Latin 1
/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */
namespace simdutf {
namespace lsx {
namespace {
namespace utf8_to_latin1 {
using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // For UTF-8 to Latin 1, we can allow any ASCII character, and any
  // continuation byte, but the non-ASCII leading bytes must be 0b11000011 or
  // 0b11000010 and nothing else.
  //
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
  constexpr const uint8_t FORBIDDEN = 0xff;

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      FORBIDDEN,
      // 1110____ ________ <three byte lead in byte 1>
      FORBIDDEN,
      // 1111____ ________ <four+ byte lead in byte 1>
      FORBIDDEN);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1.
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              FORBIDDEN,
              // ____0101 ________
              FORBIDDEN,
              // ____011_ ________
              FORBIDDEN, FORBIDDEN,

              // ____1___ ________
              FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN,
              // ____1101 ________
              FORBIDDEN, FORBIDDEN, FORBIDDEN);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
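// [Illustrative note; not part of upstream simdutf.] Only the lead bytes 0xC2
// and 0xC3 survive the tables above: "é" (0xC3 0xA9, U+00E9) produces no
// error bits, while "Ā" (0xC4 0x80, U+0100) lies outside Latin 1; its low
// nibble 0x4 selects FORBIDDEN (0xFF) in byte_1_low, which lets the
// OVERLONG_2 bit shared by byte_1_high and byte_2_high survive the AND, so
// the transcoder reports an error.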

struct validating_transcoder {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;

  validating_transcoder() : error(uint8_t(0)) {}
  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    this->error |= check_special_cases(input, prev1);
  }

  simdutf_really_inline size_t convert(const char *in, size_t size,
                                       char *latin1_output) {
    size_t pos = 0;
    char *start{latin1_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 16 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 16; margin--) {
      leading_byte += (int8_t(in[margin - 1]) >
                       -65); // twos complement of -65 is 1011 1111 ...
    }
    // If the input is long enough, then margin - 1 is the position of the
    // last leading byte counted above.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store((int8_t *)latin1_output);
        latin1_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask =
            input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
                               // this case, we also have ASCII to account for.
        if (utf8_continuation_mask & 1) {
          return 0; // error
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 and 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_latin1(
              in + pos, utf8_end_of_code_point_mask, latin1_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      return 0;
    }
    if (pos < size) {
      size_t howmany =
          scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output);
      if (howmany == 0) {
        return 0;
      }
      latin1_output += howmany;
    }
    return latin1_output - start;
  }

  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
                                                   char *latin1_output) {
    size_t pos = 0;
    char *start{latin1_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then margin - 1 is the position of the
    // eighth-to-last leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store((int8_t *)latin1_output);
        latin1_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        if (errors()) {
          // rewind_and_convert_with_errors will seek a potential error from
          // in+pos onward, with the ability to go back up to pos bytes, and
          // read size-pos bytes forward.
          result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
              pos, in + pos, size - pos, latin1_output);
          res.count += pos;
          return res;
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
|
|
// Yet we process between 6 to 12 inputs bytes, thus we get
|
|
// a speed limit between 1 cycle/byte and 0.5 cycle/byte
|
|
// for this section of the code. Hence, there is a limit
|
|
// to how much we can further increase this latency before
|
|
// it seriously harms performance.
|
|
size_t consumed = convert_masked_utf8_to_latin1(
|
|
in + pos, utf8_end_of_code_point_mask, latin1_output);
|
|
pos += consumed;
|
|
utf8_end_of_code_point_mask >>= consumed;
|
|
}
|
|
// At this point there may remain between 0 and 12 bytes in the
|
|
// 64-byte block. These bytes will be processed again. So we have an
|
|
// 80% efficiency (in the worst case). In practice we expect an
|
|
// 85% to 90% efficiency.
|
|
}
|
|
}
|
|
if (errors()) {
|
|
// rewind_and_convert_with_errors will seek a potential error from in+pos
|
|
// onward, with the ability to go back up to pos bytes, and read size-pos
|
|
// bytes forward.
|
|
result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
|
|
pos, in + pos, size - pos, latin1_output);
|
|
res.count += pos;
|
|
return res;
|
|
}
|
|
if (pos < size) {
|
|
// rewind_and_convert_with_errors will seek a potential error from in+pos
|
|
// onward, with the ability to go back up to pos bytes, and read size-pos
|
|
// bytes forward.
|
|
result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
|
|
pos, in + pos, size - pos, latin1_output);
|
|
if (res.error) { // In case of error, we want the error position
|
|
res.count += pos;
|
|
return res;
|
|
} else { // In case of success, we want the number of word written
|
|
latin1_output += res.count;
|
|
}
|
|
}
|
|
return result(error_code::SUCCESS, latin1_output - start);
|
|
}
|
|
|
|
simdutf_really_inline bool errors() const {
|
|
return this->error.any_bits_set_anywhere();
|
|
}
|
|
|
|
}; // struct utf8_checker
|
|
} // namespace utf8_to_latin1
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */
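// --- Illustrative sketch (not part of simdutf) -----------------------------
// The kernels above classify bytes with a signed comparison: reinterpreted as
// int8_t, UTF-8 continuation bytes (0b10xxxxxx) are the values -128..-65, so
// `int8_t(b) > -65` is true exactly for leading (non-continuation) bytes.
// The hypothetical helper below builds the same end-of-code-point bitmask in
// scalar code for up to 64 bytes; it is a teaching aid, not the library API.
inline uint64_t example_end_of_code_point_mask(const char *in, size_t size) {
  uint64_t continuation = 0;
  for (size_t i = 0; i < size && i < 64; i++) {
    if (int8_t(in[i]) < -64) { // continuation byte: 0b10xxxxxx
      continuation |= uint64_t(1) << i;
    }
  }
  const uint64_t leading = ~continuation;
  // A code point ends on the byte just before the next leading byte, hence
  // the shift by one. Bit i is set iff byte i is the last byte of its
  // code point.
  return leading >> 1;
}
// ----------------------------------------------------------------------------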
/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
namespace simdutf {
namespace lsx {
namespace {
namespace utf8_to_latin1 {
using namespace simd;

simdutf_really_inline size_t convert_valid(const char *in, size_t size,
                                           char *latin1_output) {
  size_t pos = 0;
  char *start{latin1_output};
  // In the worst case, we have the haswell kernel which can cause an overflow
  // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last
  // 16 bytes, and if the data is valid, then it is entirely safe because 16
  // UTF-8 bytes generate much more than 8 bytes. However, you cannot generally
  // assume that you have valid UTF-8 input, so we are going to go back from the
  // end counting 8 leading bytes, to give us a good margin.
  size_t leading_byte = 0;
  size_t margin = size;
  for (; margin > 0 && leading_byte < 8; margin--) {
    leading_byte += (int8_t(in[margin - 1]) >
                     -65); // two's complement of -65 is 1011 1111 ...
  }
  // If the input is long enough, then margin - 1 is the eighth-last
  // leading byte.
  const size_t safety_margin = size - margin + 1; // to avoid overruns!
  while (pos + 64 + safety_margin <= size) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    if (input.is_ascii()) {
      input.store((int8_t *)latin1_output);
      latin1_output += 64;
      pos += 64;
    } else {
      // You might think that a for-loop would work, but under Visual Studio, it
      // is not good enough.
      uint64_t utf8_continuation_mask =
          input.lt(-65 + 1); // -64 is 1100 0000 in two's complement. Note: in
                             // this case, we also have ASCII to account for.
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
      // We process in blocks of up to 12 bytes except possibly
      // for fast paths which may process up to 16 bytes. For the
      // slow path to work, we should have at least 12 input bytes left.
      size_t max_starting_point = (pos + 64) - 12;
      // The next loop is going to run at least five times.
      while (pos < max_starting_point) {
        // Performance note: our ability to compute 'consumed' and
        // then shift and recompute is critical. If there is a
        // latency of, say, 4 cycles on getting 'consumed', then
        // the inner loop might have a total latency of about 6 cycles.
        // Yet we process between 6 and 12 input bytes, thus we get
        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
        // for this section of the code. Hence, there is a limit
        // to how much we can further increase this latency before
        // it seriously harms performance.
        size_t consumed = convert_masked_utf8_to_latin1(
            in + pos, utf8_end_of_code_point_mask, latin1_output);
        pos += consumed;
        utf8_end_of_code_point_mask >>= consumed;
      }
      // At this point there may remain between 0 and 12 bytes in the
      // 64-byte block. These bytes will be processed again. So we have an
      // 80% efficiency (in the worst case). In practice we expect an
      // 85% to 90% efficiency.
    }
  }
  if (pos < size) {
    size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos,
                                                           latin1_output);
    latin1_output += howmany;
  }
  return latin1_output - start;
}

} // namespace utf8_to_latin1
} // namespace
} // namespace lsx
} // namespace simdutf
/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
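// --- Illustrative sketch (not part of simdutf) -----------------------------
// What the vectorized kernel above computes, in scalar form. UTF-8 that fits
// in Latin-1 is ASCII plus two-byte sequences with lead byte 0xC2 or 0xC3.
// The function name is hypothetical, and like convert_valid it assumes the
// input is already valid UTF-8 representable in Latin-1.
inline size_t example_valid_utf8_to_latin1(const char *in, size_t size,
                                           char *out) {
  size_t pos = 0;
  char *start = out;
  while (pos < size) {
    const uint8_t byte = uint8_t(in[pos]);
    if (byte < 0x80) { // ASCII: copied through unchanged
      *out++ = char(byte);
      pos += 1;
    } else { // 0xC2/0xC3 lead: low 2 bits of the lead, then 6 payload bits
             // of the continuation byte
      *out++ = char(uint8_t((byte & 0x03) << 6) | uint8_t(in[pos + 1] & 0x3F));
      pos += 2;
    }
  }
  return size_t(out - start);
}
// ----------------------------------------------------------------------------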
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
// transcoding from UTF-8 to UTF-16
/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
namespace simdutf {
namespace lsx {
namespace {
namespace utf8_to_utf16 {

using namespace simd;

template <endianness endian>
simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
                                         char16_t *utf16_output) noexcept {
  // The implementation is not specific to haswell and should be moved to the
  // generic directory.
  size_t pos = 0;
  char16_t *start{utf16_output};
  const size_t safety_margin = 16; // to avoid overruns!
  while (pos + 64 + safety_margin <= size) {
    // This loop could be unrolled further. For example, we could compute the
    // mask over far more than 64 bytes.
    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
    if (in.is_ascii()) {
      in.store_ascii_as_utf16<endian>(utf16_output);
      utf16_output += 64;
      pos += 64;
    } else {
      // Slow path. We hope that the compiler will recognize that this is a slow
      // path. Anything that is not a continuation byte is a 'leading byte',
      // that is, the start of a new code point.
      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
      // -65 is 0b10111111 in two's complement, so the largest possible
      // continuation byte
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      // The *start* of code points is not so useful, rather, we want the *end*
      // of code points.
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
      // We process in blocks of up to 12 bytes except possibly
      // for fast paths which may process up to 16 bytes. For the
      // slow path to work, we should have at least 12 input bytes left.
      size_t max_starting_point = (pos + 64) - 12;
      // The next loop is going to run at least five times when using solely
      // the slow/regular path, and at least four times if there are fast paths.
      while (pos < max_starting_point) {
        // Performance note: our ability to compute 'consumed' and
        // then shift and recompute is critical. If there is a
        // latency of, say, 4 cycles on getting 'consumed', then
        // the inner loop might have a total latency of about 6 cycles.
        // Yet we process between 6 and 12 input bytes, thus we get
        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
        // for this section of the code. Hence, there is a limit
        // to how much we can further increase this latency before
        // it seriously harms performance.
        //
        // Thus we may allow convert_masked_utf8_to_utf16 to process
        // more bytes at a time under a fast-path mode where 16 bytes
        // are consumed at once (e.g., when encountering ASCII).
        size_t consumed = convert_masked_utf8_to_utf16<endian>(
            input + pos, utf8_end_of_code_point_mask, utf16_output);
        pos += consumed;
        utf8_end_of_code_point_mask >>= consumed;
      }
      // At this point there may remain between 0 and 12 bytes in the
      // 64-byte block. These bytes will be processed again. So we have an
      // 80% efficiency (in the worst case). In practice we expect an
      // 85% to 90% efficiency.
    }
  }
  utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(
      input + pos, size - pos, utf16_output);
  return utf16_output - start;
}

} // namespace utf8_to_utf16
} // unnamed namespace
} // namespace lsx
} // namespace simdutf
/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
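// --- Illustrative sketch (not part of simdutf) -----------------------------
// The ASCII fast path above (store_ascii_as_utf16) only has to zero-extend
// each byte to 16 bits, honoring the requested endianness. A hypothetical
// scalar equivalent:
inline void example_store_ascii_as_utf16(const char *in, size_t size,
                                         char16_t *out,
                                         bool big_endian_output) {
  for (size_t i = 0; i < size; i++) {
    char16_t w = char16_t(uint8_t(in[i])); // zero-extend the ASCII byte
    if (big_endian_output) {
      w = char16_t((w << 8) | (w >> 8)); // byte-swap for UTF-16BE
    }
    out[i] = w;
  }
}
// ----------------------------------------------------------------------------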
/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
namespace simdutf {
namespace lsx {
namespace {
namespace utf8_to_utf16 {
using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1.
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              CARRY | TOO_LARGE,
              // ____0101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____011_ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,

              // ____1___ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____1101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
simdutf_really_inline simd8<uint8_t>
check_multibyte_lengths(const simd8<uint8_t> input,
                        const simd8<uint8_t> prev_input,
                        const simd8<uint8_t> sc) {
  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
  simd8<uint8_t> must23 =
      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
  return must23_80 ^ sc;
}

struct validating_transcoder {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;

  validating_transcoder() : error(uint8_t(0)) {}
  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2-, 3- and 4-byte leads become large positive numbers
    // instead of small negative numbers).
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    simd8<uint8_t> sc = check_special_cases(input, prev1);
    this->error |= check_multibyte_lengths(input, prev_input, sc);
  }

  template <endianness endian>
  simdutf_really_inline size_t convert(const char *in, size_t size,
                                       char16_t *utf16_output) {
    size_t pos = 0;
    char16_t *start{utf16_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then margin - 1 is the eighth-last
    // leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf16<endian>(utf16_output);
        utf16_output += 64;
        pos += 64;
      } else {
        // You might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (utf8_continuation_mask & 1) {
          return 0; // error
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // The next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 and 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf16<endian>(
              in + pos, utf8_end_of_code_point_mask, utf16_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      return 0;
    }
    if (pos < size) {
      size_t howmany = scalar::utf8_to_utf16::convert<endian>(
          in + pos, size - pos, utf16_output);
      if (howmany == 0) {
        return 0;
      }
      utf16_output += howmany;
    }
    return utf16_output - start;
  }

  template <endianness endian>
  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
                                                   char16_t *utf16_output) {
    size_t pos = 0;
    char16_t *start{utf16_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then margin - 1 is the eighth-last
    // leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf16<endian>(utf16_output);
        utf16_output += 64;
        pos += 64;
      } else {
        // You might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (errors() || (utf8_continuation_mask & 1)) {
          // rewind_and_convert_with_errors will seek a potential error from
          // in+pos onward, with the ability to go back up to pos bytes, and
          // read size-pos bytes forward.
          result res =
              scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
                  pos, in + pos, size - pos, utf16_output);
          res.count += pos;
          return res;
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // The next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 and 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf16<endian>(
              in + pos, utf8_end_of_code_point_mask, utf16_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      // rewind_and_convert_with_errors will seek a potential error from in+pos
      // onward, with the ability to go back up to pos bytes, and read size-pos
      // bytes forward.
      result res =
          scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
              pos, in + pos, size - pos, utf16_output);
      res.count += pos;
      return res;
    }
    if (pos < size) {
      // rewind_and_convert_with_errors will seek a potential error from in+pos
      // onward, with the ability to go back up to pos bytes, and read size-pos
      // bytes forward.
      result res =
          scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
              pos, in + pos, size - pos, utf16_output);
      if (res.error) { // In case of error, we want the error position.
        res.count += pos;
        return res;
      } else { // In case of success, we want the number of words written.
        utf16_output += res.count;
      }
    }
    return result(error_code::SUCCESS, utf16_output - start);
  }

  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct validating_transcoder
} // namespace utf8_to_utf16
} // unnamed namespace
} // namespace lsx
} // namespace simdutf
/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
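// --- Illustrative sketch (not part of simdutf) -----------------------------
// The three ANDed lookups in check_special_cases flag errors visible in a
// two-byte window. For instance, the SURROGATE bit fires for 0xED followed by
// 0b101xxxxx, which would decode to U+D800..U+DFFF. A hypothetical scalar
// check for that single case:
inline bool example_encodes_surrogate(uint8_t byte1, uint8_t byte2) {
  // byte_1_high nibble == 0xE (three-byte lead), byte_1_low nibble == 0xD,
  // and byte_2_high nibble in {0xA, 0xB} (continuation 101_____): all three
  // lookups carry the SURROGATE bit, so the AND keeps it set.
  return byte1 == 0xED && (byte2 & 0xE0) == 0xA0;
}
// ----------------------------------------------------------------------------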
/* begin file src/generic/utf8/utf16_length_from_utf8_bytemask.h */
namespace simdutf {
namespace lsx {
namespace {
namespace utf8 {

using namespace simd;

simdutf_really_inline size_t utf16_length_from_utf8_bytemask(const char *in,
                                                             size_t size) {
  using vector_i8 = simd8<int8_t>;
  using vector_u8 = simd8<uint8_t>;
  using vector_u64 = simd64<uint64_t>;

  constexpr size_t N = vector_i8::SIZE;
  constexpr size_t max_iterations = 255 / 2;

  auto counters = vector_u64::zero();
  auto local = vector_u8::zero();
  size_t iterations = 0;
  size_t pos = 0;
  size_t count = 0;
  for (; pos + N <= size; pos += N) {
    const auto input =
        vector_i8::load(reinterpret_cast<const int8_t *>(in + pos));

    // Note: `input > int8_t(-65)` selects the leading (non-continuation)
    // bytes, one per code point; each 4-byte lead adds one extra UTF-16 unit.
    const auto continuation = input > int8_t(-65);
    const auto utf_4bytes = vector_u8(input.value) >= uint8_t(240);

    local -= vector_u8(continuation);
    local -= vector_u8(utf_4bytes);

    iterations += 1;
    if (iterations == max_iterations) {
      counters += sum_8bytes(local);
      local = vector_u8::zero();
      iterations = 0;
    }
  }

  if (iterations > 0) {
    count += local.sum_bytes();
  }

  count += counters.sum();

  return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
}

} // namespace utf8
} // unnamed namespace
} // namespace lsx
} // namespace simdutf
/* end file src/generic/utf8/utf16_length_from_utf8_bytemask.h */
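// --- Illustrative sketch (not part of simdutf) -----------------------------
// The counting rule used above, in scalar form: every leading byte
// contributes one UTF-16 code unit, and every 4-byte lead (>= 0xF0)
// contributes one extra unit for the surrogate pair. The function name is
// hypothetical.
inline size_t example_utf16_length_from_utf8(const char *in, size_t size) {
  size_t count = 0;
  for (size_t i = 0; i < size; i++) {
    const uint8_t b = uint8_t(in[i]);
    count += (int8_t(b) > -65); // +1 per leading (non-continuation) byte
    count += (b >= 0xF0);       // +1 more per 4-byte lead (surrogate pair)
  }
  return count;
}
// ----------------------------------------------------------------------------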
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
// transcoding from UTF-8 to UTF-32
/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
namespace simdutf {
namespace lsx {
namespace {
namespace utf8_to_utf32 {

using namespace simd;

simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
                                         char32_t *utf32_output) noexcept {
  size_t pos = 0;
  char32_t *start{utf32_output};
  const size_t safety_margin = 16; // to avoid overruns!
  while (pos + 64 + safety_margin <= size) {
    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
    if (in.is_ascii()) {
      in.store_ascii_as_utf32(utf32_output);
      utf32_output += 64;
      pos += 64;
    } else {
      // -65 is 0b10111111 in two's complement, so the largest possible
      // continuation byte
      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
      size_t max_starting_point = (pos + 64) - 12;
      while (pos < max_starting_point) {
        size_t consumed = convert_masked_utf8_to_utf32(
            input + pos, utf8_end_of_code_point_mask, utf32_output);
        pos += consumed;
        utf8_end_of_code_point_mask >>= consumed;
      }
    }
  }
  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos,
                                                       utf32_output);
  return utf32_output - start;
}

} // namespace utf8_to_utf32
} // unnamed namespace
} // namespace lsx
} // namespace simdutf
/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
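// --- Illustrative sketch (not part of simdutf) -----------------------------
// Decoding one valid UTF-8 sequence to UTF-32, which is what
// convert_masked_utf8_to_utf32 does for many code points at once. The helper
// is hypothetical, assumes valid input, and returns the number of bytes
// consumed.
inline size_t example_decode_one_utf8(const char *in, char32_t *out) {
  const uint8_t lead = uint8_t(in[0]);
  if (lead < 0x80) { // 1 byte: 0xxxxxxx
    *out = lead;
    return 1;
  } else if (lead < 0xE0) { // 2 bytes: 110xxxxx 10xxxxxx
    *out = (char32_t(lead & 0x1F) << 6) | (uint8_t(in[1]) & 0x3F);
    return 2;
  } else if (lead < 0xF0) { // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
    *out = (char32_t(lead & 0x0F) << 12) |
           (char32_t(uint8_t(in[1]) & 0x3F) << 6) | (uint8_t(in[2]) & 0x3F);
    return 3;
  } else { // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    *out = (char32_t(lead & 0x07) << 18) |
           (char32_t(uint8_t(in[1]) & 0x3F) << 12) |
           (char32_t(uint8_t(in[2]) & 0x3F) << 6) | (uint8_t(in[3]) & 0x3F);
    return 4;
  }
}
// ----------------------------------------------------------------------------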
/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
namespace simdutf {
namespace lsx {
namespace {
namespace utf8_to_utf32 {
using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1.
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              CARRY | TOO_LARGE,
              // ____0101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____011_ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,

              // ____1___ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____1101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
simdutf_really_inline simd8<uint8_t>
check_multibyte_lengths(const simd8<uint8_t> input,
                        const simd8<uint8_t> prev_input,
                        const simd8<uint8_t> sc) {
  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
  simd8<uint8_t> must23 =
      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
  return must23_80 ^ sc;
}

struct validating_transcoder {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;

  validating_transcoder() : error(uint8_t(0)) {}
  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2-, 3- and 4-byte leads become large positive numbers
    // instead of small negative numbers).
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    simd8<uint8_t> sc = check_special_cases(input, prev1);
    this->error |= check_multibyte_lengths(input, prev_input, sc);
  }

  simdutf_really_inline size_t convert(const char *in, size_t size,
                                       char32_t *utf32_output) {
    size_t pos = 0;
    char32_t *start{utf32_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then margin - 1 is the eighth-last
    // leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf32(utf32_output);
        utf32_output += 64;
        pos += 64;
      } else {
        // You might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (utf8_continuation_mask & 1) {
          return 0; // we have an error
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // The next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 and 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf32(
              in + pos, utf8_end_of_code_point_mask, utf32_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      return 0;
    }
    if (pos < size) {
      size_t howmany =
          scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
      if (howmany == 0) {
        return 0;
      }
      utf32_output += howmany;
    }
    return utf32_output - start;
  }

  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
                                                   char32_t *utf32_output) {
    size_t pos = 0;
    char32_t *start{utf32_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then margin - 1 is the eighth-last
    // leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf32(utf32_output);
        utf32_output += 64;
        pos += 64;
      } else {
        // You might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (errors() || (utf8_continuation_mask & 1)) {
          result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
              pos, in + pos, size - pos, utf32_output);
          res.count += pos;
          return res;
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // The next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 and 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf32(
              in + pos, utf8_end_of_code_point_mask, utf32_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, utf32_output);
      res.count += pos;
      return res;
    }
    if (pos < size) {
      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, utf32_output);
      if (res.error) { // In case of error, we want the error position.
        res.count += pos;
        return res;
      } else { // In case of success, we want the number of words written.
        utf32_output += res.count;
      }
    }
    return result(error_code::SUCCESS, utf32_output - start);
  }

  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct validating_transcoder
} // namespace utf8_to_utf32
} // unnamed namespace
} // namespace lsx
} // namespace simdutf
/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8
/* begin file src/generic/utf8.h */
namespace simdutf {
namespace lsx {
namespace {
namespace utf8 {

using namespace simd;

simdutf_really_inline size_t count_code_points(const char *in, size_t size) {
  size_t pos = 0;
  size_t count = 0;
  for (; pos + 64 <= size; pos += 64) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    // Note: `gt(-65)` selects the leading (non-continuation) bytes,
    // one per code point.
    uint64_t utf8_continuation_mask = input.gt(-65);
    count += count_ones(utf8_continuation_mask);
  }
  return count + scalar::utf8::count_code_points(in + pos, size - pos);
}

#ifdef SIMDUTF_SIMD_HAS_BYTEMASK
simdutf_really_inline size_t count_code_points_bytemask(const char *in,
                                                        size_t size) {
  using vector_i8 = simd8<int8_t>;
  using vector_u8 = simd8<uint8_t>;
  using vector_u64 = simd64<uint64_t>;

  constexpr size_t N = vector_i8::SIZE;
  constexpr size_t max_iterations = 255 / 4;

  size_t pos = 0;
  size_t count = 0;

  auto counters = vector_u64::zero();
  auto local = vector_u8::zero();
  size_t iterations = 0;
  for (; pos + 4 * N <= size; pos += 4 * N) {
    const auto input0 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 0 * N));
    const auto input1 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 1 * N));
    const auto input2 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 2 * N));
    const auto input3 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 3 * N));
    const auto mask0 = input0 > int8_t(-65);
    const auto mask1 = input1 > int8_t(-65);
    const auto mask2 = input2 > int8_t(-65);
    const auto mask3 = input3 > int8_t(-65);

    local -= vector_u8(mask0);
    local -= vector_u8(mask1);
    local -= vector_u8(mask2);
    local -= vector_u8(mask3);

    iterations += 1;
    if (iterations == max_iterations) {
      counters += sum_8bytes(local);
      local = vector_u8::zero();
      iterations = 0;
    }
  }

  if (iterations > 0) {
    count += local.sum_bytes();
  }

  count += counters.sum();

  return count + scalar::utf8::count_code_points(in + pos, size - pos);
}
#endif // SIMDUTF_SIMD_HAS_BYTEMASK

simdutf_really_inline size_t utf16_length_from_utf8(const char *in,
                                                    size_t size) {
  size_t pos = 0;
  size_t count = 0;
  // This algorithm could no doubt be improved!
  for (; pos + 64 <= size; pos += 64) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    uint64_t utf8_continuation_mask = input.lt(-65 + 1);
    // We count one word for anything that is not a continuation (so
    // leading bytes).
    count += 64 - count_ones(utf8_continuation_mask);
    int64_t utf8_4byte = input.gteq_unsigned(240);
    count += count_ones(utf8_4byte);
  }
  return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
}

} // namespace utf8
} // unnamed namespace
} // namespace lsx
} // namespace simdutf
/* end file src/generic/utf8.h */
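// --- Illustrative sketch (not part of simdutf) -----------------------------
// Counting UTF-8 code points amounts to counting non-continuation bytes,
// since each code point has exactly one leading byte. A hypothetical scalar
// equivalent of count_code_points:
inline size_t example_count_code_points(const char *in, size_t size) {
  size_t count = 0;
  for (size_t i = 0; i < size; i++) {
    // Signed trick: continuation bytes 0b10xxxxxx are -128..-65 as int8_t.
    count += (int8_t(in[i]) > -65);
  }
  return count;
}
// ----------------------------------------------------------------------------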
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF16
/* begin file src/generic/utf16/count_code_points_bytemask.h */
namespace simdutf {
namespace lsx {
namespace {
namespace utf16 {

using namespace simd;

template <endianness big_endian>
simdutf_really_inline size_t count_code_points(const char16_t *in,
                                               size_t size) {
  using vector_u16 = simd16<uint16_t>;
  constexpr size_t N = vector_u16::ELEMENTS;

  size_t pos = 0;
  size_t count = 0;

  constexpr size_t max_iterations = 65535;
  const auto one = vector_u16::splat(1);
  const auto zero = vector_u16::zero();

  size_t iteration = 0;

  auto counters = zero;
  for (; pos < size / N * N; pos += N) {
    auto input = vector_u16::load(in + pos);
    if (!match_system(big_endian)) {
      input = input.swap_bytes();
    }

    const auto t0 = input & uint16_t(0xfc00);
    const auto t1 = t0 ^ uint16_t(0xdc00);

    // t2[0] == 1 iff input[0] is outside the range 0xdc00..0xdfff (the word
    // is not a high surrogate)
    const auto t2 = min(t1, one);

    counters += t2;

    iteration += 1;
    if (iteration == max_iterations) {
      count += counters.sum();
      counters = zero;
      iteration = 0;
    }
  }

  if (iteration > 0) {
    count += counters.sum();
  }

  return count +
         scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
}

} // namespace utf16
} // unnamed namespace
} // namespace lsx
} // namespace simdutf
/* end file src/generic/utf16/count_code_points_bytemask.h */
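// --- Illustrative sketch (not part of simdutf) -----------------------------
// A UTF-16 code point count ignores words in 0xDC00..0xDFFF so that a
// surrogate pair counts once. A hypothetical scalar equivalent of the
// vectorized count above:
inline size_t example_utf16_count_code_points(const char16_t *in,
                                              size_t size) {
  size_t count = 0;
  for (size_t i = 0; i < size; i++) {
    // (w & 0xFC00) == 0xDC00 exactly for words in 0xDC00..0xDFFF;
    // count everything else.
    count += (uint16_t(in[i]) & 0xFC00) != 0xDC00;
  }
  return count;
}
// ----------------------------------------------------------------------------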
/* begin file src/generic/utf16/change_endianness.h */
namespace simdutf {
namespace lsx {
namespace {
namespace utf16 {

simdutf_really_inline void
change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) {
  size_t pos = 0;

  while (pos < size / 32 * 32) {
    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
    input.swap_bytes();
    input.store(reinterpret_cast<uint16_t *>(output));
    pos += 32;
    output += 32;
  }

  scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
}

} // namespace utf16
} // unnamed namespace
} // namespace lsx
} // namespace simdutf
/* end file src/generic/utf16/change_endianness.h */
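// --- Illustrative sketch (not part of simdutf) -----------------------------
// The scalar tail above swaps the two bytes of each remaining code unit.
// A hypothetical equivalent:
inline void example_change_endianness_utf16(const char16_t *in, size_t size,
                                            char16_t *out) {
  for (size_t i = 0; i < size; i++) {
    const uint16_t w = uint16_t(in[i]);
    out[i] = char16_t((w << 8) | (w >> 8)); // swap low and high bytes
  }
}
// ----------------------------------------------------------------------------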
/* begin file src/generic/utf16/utf8_length_from_utf16_bytemask.h */
namespace simdutf {
namespace lsx {
namespace {
namespace utf16 {

using namespace simd;

template <endianness big_endian>
simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in,
                                                             size_t size) {
  size_t pos = 0;

  using vector_u16 = simd16<uint16_t>;
  constexpr size_t N = vector_u16::ELEMENTS;

  const auto one = vector_u16::splat(1);

  auto v_count = vector_u16::zero();

  // each char16 yields at least one byte
  size_t count = size / N * N;

  // in a single iteration the increment is 0, 1 or 2, even though we have
  // three additions
  constexpr size_t max_iterations = 65535 / 2;
  size_t iteration = max_iterations;

  for (; pos < size / N * N; pos += N) {
    auto input = vector_u16::load(reinterpret_cast<const uint16_t *>(in + pos));
    if (!match_system(big_endian)) {
      input = input.swap_bytes();
    }
    // 0xd800 .. 0xdbff - low surrogate
    // 0xdc00 .. 0xdfff - high surrogate
    const auto is_surrogate = ((input & uint16_t(0xf800)) == uint16_t(0xd800));

    // c0 - chars that yield 2- or 3-byte UTF-8 codes
    const auto c0 = min(input & uint16_t(0xff80), one);

    // c1 - chars that yield 3-byte UTF-8 codes (including surrogates)
    const auto c1 = min(input & uint16_t(0xf800), one);

    /*
        Explanation of how the counting works.

        In the case of a non-surrogate character we count:
        * always 1 -- see how `count` is initialized above;
        * c0 = 1 if the current char yields 2 or 3 bytes;
        * c1 = 1 if the current char yields 3 bytes.

        Thus, we always have a correct count for the current char:
        1, 2 or 3 bytes.

        A trickier part is how we count surrogate pairs. Whenever
        we encounter a surrogate (low or high), we count it as
        3 chars and then subtract 1 (`is_surrogate` is -1 or 0).
        Each surrogate char thus yields 2. A surrogate pair, that
        is a low surrogate followed by a high one, yields
        the expected 4 bytes.

        It also correctly handles cases when a low surrogate is
        processed by this loop, but the matching high surrogate is
        counted by the scalar procedure. The scalar procedure uses
        exactly the approach described here, and thanks to that it
        always counts correctly for valid UTF-16 strings.
    */
    v_count += c0;
    v_count += c1;
    v_count += vector_u16(is_surrogate);

    iteration -= 1;
    if (iteration == 0) {
      count += v_count.sum();
      v_count = vector_u16::zero();
      iteration = max_iterations;
    }
  }

  if (iteration > 0) {
    count += v_count.sum();
  }

  return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos,
                                                                   size - pos);
}

} // namespace utf16
} // unnamed namespace
} // namespace lsx
} // namespace simdutf
/* end file src/generic/utf16/utf8_length_from_utf16_bytemask.h */
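// --- Illustrative sketch (not part of simdutf) -----------------------------
// The same counting rule in scalar form: 1 for every code unit, +1 for units
// needing 2 or more UTF-8 bytes, +1 for units needing 3 bytes, and -1 for
// each surrogate half so a pair totals 4. The function name is hypothetical.
inline size_t example_utf8_length_from_utf16(const char16_t *in, size_t size) {
  size_t count = 0;
  for (size_t i = 0; i < size; i++) {
    const uint16_t w = uint16_t(in[i]);
    count += 1;                      // every unit yields at least 1 byte
    count += (w & 0xFF80) != 0;      // 2 or more bytes
    count += (w & 0xF800) != 0;      // 3 bytes (includes surrogates)
    count -= (w & 0xF800) == 0xD800; // each surrogate half yields only 2
  }
  return count;
}
// ----------------------------------------------------------------------------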
/* begin file src/generic/utf16/utf32_length_from_utf16.h */
namespace simdutf {
namespace lsx {
namespace {
namespace utf16 {

template <endianness big_endian>
simdutf_really_inline size_t utf32_length_from_utf16(const char16_t *in,
                                                     size_t size) {
  return count_code_points<big_endian>(in, size);
}

} // namespace utf16
} // unnamed namespace
} // namespace lsx
} // namespace simdutf
/* end file src/generic/utf16/utf32_length_from_utf16.h */
#endif // SIMDUTF_FEATURE_UTF16
|
|
|
|
#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
|
|
/* begin file src/generic/validate_utf16.h */
|
|
namespace simdutf {
|
|
namespace lsx {
|
|
namespace {
|
|
namespace utf16 {
|
|
/*
|
|
UTF-16 validation
|
|
--------------------------------------------------
|
|
|
|
In UTF-16 code units in range 0xD800 to 0xDFFF have special meaning.
|
|
|
|
In a vectorized algorithm we want to examine the most significant
|
|
nibble in order to select a fast path. If none of highest nibbles
|
|
are 0xD (13), than we are sure that UTF-16 chunk in a vector
|
|
register is valid.
|
|
|
|
Let us analyze what we need to check if the nibble is 0xD. The
|
|
value of the preceding nibble determines what we have:
|
|
|
|
0xd000 .. 0xd7ff - a valid word
|
|
0xd800 .. 0xdbff - low surrogate
|
|
0xdc00 .. 0xdfff - high surrogate
|
|
|
|
Other constraints we have to consider:
|
|
- there must not be two consecutive low surrogates (0xd800 .. 0xdbff)
|
|
- there must not be two consecutive high surrogates (0xdc00 .. 0xdfff)
|
|
- there must not be sole low surrogate nor high surrogate
|
|
|
|
We are going to build three bitmasks based on the 3rd nibble:
|
|
- V = valid word,
|
|
- L = low surrogate (0xd800 .. 0xdbff)
|
|
- H = high surrogate (0xdc00 .. 0xdfff)
|
|
|
|
0 1 2 3 4 5 6 7 <--- word index
|
|
[ V | L | H | L | H | V | V | L ]
|
|
1 0 0 0 0 1 1 0 - V = valid masks
|
|
0 1 0 1 0 0 0 1 - L = low surrogate
|
|
0 0 1 0 1 0 0 0 - H high surrogate
|
|
|
|
|
|
1 0 0 0 0 1 1 0 V = valid masks
|
|
0 1 0 1 0 0 0 0 a = L & (H >> 1)
|
|
0 0 1 0 1 0 0 0 b = a << 1
|
|
1 1 1 1 1 1 1 0 c = V | a | b
|
|
^
|
|
the last bit can be zero, we just consume 7
|
|
code units and recheck this word in the next iteration
|
|
*/
|
|
template <endianness big_endian>
const result validate_utf16_with_errors(const char16_t *input, size_t size) {
  if (simdutf_unlikely(size == 0)) {
    return result(error_code::SUCCESS, 0);
  }

  const char16_t *start = input;
  const char16_t *end = input + size;

  const auto v_d8 = simd8<uint8_t>::splat(0xd8);
  const auto v_f8 = simd8<uint8_t>::splat(0xf8);
  const auto v_fc = simd8<uint8_t>::splat(0xfc);
  const auto v_dc = simd8<uint8_t>::splat(0xdc);

  while (input + simd16<uint16_t>::SIZE * 2 < end) {
    // 0. Load data: since the validation takes into account only the higher
    // byte of each word, we compress the two vectors into one which
    // consists only of the higher bytes.
    auto in0 = simd16<uint16_t>(input);
    auto in1 =
        simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));

    // Function `utf16_gather_high_bytes` consumes two vectors of UTF-16
    // and yields a single vector having only the higher bytes of characters.
    const auto in = utf16_gather_high_bytes<big_endian>(in0, in1);

    // 1. Check whether we have any 0xD800..0xDFFF word (0b1101'1xxx'yyyy'yyyy).
    const auto surrogates_wordmask = (in & v_f8) == v_d8;
    const uint16_t surrogates_bitmask =
        static_cast<uint16_t>(surrogates_wordmask.to_bitmask());
    if (surrogates_bitmask == 0x0000) {
      input += 16;
    } else {
      // 2. We have some surrogates that have to be distinguished:
      //    - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
      //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
      //
      // Fact: high surrogate has 11th bit set (3rd bit in the higher byte)

      // V - non-surrogate code units
      //     V = not surrogates_wordmask
      const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);

      // H - word-mask for high surrogates: the six highest bits are 0b1101'11
      const auto vH = (in & v_fc) == v_dc;
      const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());

      // L - word mask for low surrogates
      //     L = not H and surrogates_wordmask
      const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);

      const uint16_t a = static_cast<uint16_t>(
          L & (H >> 1)); // A low surrogate must be followed by a high one.
                         // (A low surrogate placed in the last word of the
                         // register is an exception we handle.)
      const uint16_t b = static_cast<uint16_t>(
          a << 1); // Just mark that the opposite fact holds;
                   // thanks to that we have only two masks for the valid case.
      const uint16_t c = static_cast<uint16_t>(
          V | a | b); // Combine all the masks into the final one.

      if (c == 0xffff) {
        // The whole input register contains valid UTF-16, i.e.,
        // either single code units or proper surrogate pairs.
        input += 16;
      } else if (c == 0x7fff) {
        // The 15 lower code units of the input register contain valid UTF-16.
        // The 15th word may be either a low or high surrogate. In the next
        // iteration we 1) check if the low surrogate is followed by a high
        // one, 2) reject a sole high surrogate.
        input += 15;
      } else {
        return result(error_code::SURROGATE, input - start);
      }
    }
  }

  return result(error_code::SUCCESS, input - start);
}

} // namespace utf16
} // unnamed namespace
} // namespace lsx
} // namespace simdutf
/* end file src/generic/validate_utf16.h */
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF32
/* begin file src/generic/utf32.h */
#include <limits>

namespace simdutf {
namespace lsx {
namespace {
namespace utf32 {

template <typename T> T min(T a, T b) { return a <= b ? a : b; }

simdutf_really_inline size_t utf8_length_from_utf32(const char32_t *input,
                                                    size_t length) {
  using vector_u32 = simd32<uint32_t>;

  const char32_t *start = input;

  // we add up to three ones in a single iteration (see the vectorized loop in
  // section #2 below)
  const size_t max_increment = 3;
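  // For instance, the UTF-8 length of {U+0041, U+00E9, U+20AC, U+1F600}
  // (1 + 2 + 3 + 4 = 10 bytes) is obtained as one byte per code point (4)
  // plus one extra for each value above 0x7f (3), above 0x7ff (2), and
  // above 0xffff (1): 4 + 3 + 2 + 1 = 10.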
  const size_t N = vector_u32::ELEMENTS;

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
  const auto v_0000007f = vector_u32::splat(0x0000007f);
  const auto v_000007ff = vector_u32::splat(0x000007ff);
  const auto v_0000ffff = vector_u32::splat(0x0000ffff);
#else
  const auto v_ffffff80 = vector_u32::splat(0xffffff80);
  const auto v_fffff800 = vector_u32::splat(0xfffff800);
  const auto v_ffff0000 = vector_u32::splat(0xffff0000);
  const auto one = vector_u32::splat(1);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

  size_t counter = 0;

  // 1. vectorized loop unrolled 4 times
  {
    // we use a vector of uint32 counters, this is why this limit is used
    const size_t max_iterations =
        std::numeric_limits<uint32_t>::max() / (max_increment * 4);
    size_t blocks = length / (N * 4);
    length -= blocks * (N * 4);
    while (blocks != 0) {
      const size_t iterations = min(blocks, max_iterations);
      blocks -= iterations;

      simd32<uint32_t> acc = vector_u32::zero();
      for (size_t i = 0; i < iterations; i++) {
        const auto in0 = vector_u32(input + 0 * N);
        const auto in1 = vector_u32(input + 1 * N);
        const auto in2 = vector_u32(input + 2 * N);
        const auto in3 = vector_u32(input + 3 * N);

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
        acc -= as_vector_u32(in0 > v_0000007f);
        acc -= as_vector_u32(in1 > v_0000007f);
        acc -= as_vector_u32(in2 > v_0000007f);
        acc -= as_vector_u32(in3 > v_0000007f);

        acc -= as_vector_u32(in0 > v_000007ff);
        acc -= as_vector_u32(in1 > v_000007ff);
        acc -= as_vector_u32(in2 > v_000007ff);
        acc -= as_vector_u32(in3 > v_000007ff);

        acc -= as_vector_u32(in0 > v_0000ffff);
        acc -= as_vector_u32(in1 > v_0000ffff);
        acc -= as_vector_u32(in2 > v_0000ffff);
        acc -= as_vector_u32(in3 > v_0000ffff);
#else
        acc += min(one, in0 & v_ffffff80);
        acc += min(one, in1 & v_ffffff80);
        acc += min(one, in2 & v_ffffff80);
        acc += min(one, in3 & v_ffffff80);

        acc += min(one, in0 & v_fffff800);
        acc += min(one, in1 & v_fffff800);
        acc += min(one, in2 & v_fffff800);
        acc += min(one, in3 & v_fffff800);

        acc += min(one, in0 & v_ffff0000);
        acc += min(one, in1 & v_ffff0000);
        acc += min(one, in2 & v_ffff0000);
        acc += min(one, in3 & v_ffff0000);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

        input += 4 * N;
      }

      counter += acc.sum();
    }
  }

  // 2. vectorized loop for tail
  {
    const size_t max_iterations =
        std::numeric_limits<uint32_t>::max() / max_increment;
    size_t blocks = length / N;
    length -= blocks * N;
    while (blocks != 0) {
      const size_t iterations = min(blocks, max_iterations);
      blocks -= iterations;

      auto acc = vector_u32::zero();
      for (size_t i = 0; i < iterations; i++) {
        const auto in = vector_u32(input);

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
        acc -= as_vector_u32(in > v_0000007f);
        acc -= as_vector_u32(in > v_000007ff);
        acc -= as_vector_u32(in > v_0000ffff);
#else
        acc += min(one, in & v_ffffff80);
        acc += min(one, in & v_fffff800);
        acc += min(one, in & v_ffff0000);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

        input += N;
      }

      counter += acc.sum();
    }
  }

  const size_t consumed = input - start;
  if (consumed != 0) {
    // The vectorized loops above count only the extra bytes, not the leading
    // byte of each code point, so we add one byte per consumed code unit.
    counter += consumed;
  }

  return counter + scalar::utf32::utf8_length_from_utf32(input, length);
}

} // namespace utf32
} // unnamed namespace
} // namespace lsx
} // namespace simdutf
/* end file src/generic/utf32.h */
#endif // SIMDUTF_FEATURE_UTF32

//
// Implementation-specific overrides
//
namespace simdutf {
namespace lsx {

#if SIMDUTF_FEATURE_DETECT_ENCODING
simdutf_warn_unused int
implementation::detect_encodings(const char *input,
                                 size_t length) const noexcept {
  // If there is a BOM, then we trust it.
  auto bom_encoding = simdutf::BOM::check_bom(input, length);
  // todo: reimplement as a one-pass algorithm.
  if (bom_encoding != encoding_type::unspecified) {
    return bom_encoding;
  }
  int out = 0;
  if (validate_utf8(input, length)) {
    out |= encoding_type::UTF8;
  }
  if ((length % 2) == 0) {
    if (validate_utf16le(reinterpret_cast<const char16_t *>(input),
                         length / 2)) {
      out |= encoding_type::UTF16_LE;
    }
  }
  if ((length % 4) == 0) {
    if (validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4)) {
      out |= encoding_type::UTF32_LE;
    }
  }
  return out;
}
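// Typical front-end usage: int enc = simdutf::detect_encodings(data, size);
// if (enc & simdutf::encoding_type::UTF8) { /* buffer is plausibly UTF-8 */ }
// Several bits may be set at once, since a buffer can validate under more
// than one encoding.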
#endif // SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
simdutf_warn_unused bool
implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  return lsx::utf8_validation::generic_validate_utf8(buf, len);
}
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF8
simdutf_warn_unused result implementation::validate_utf8_with_errors(
    const char *buf, size_t len) const noexcept {
  return lsx::utf8_validation::generic_validate_utf8_with_errors(buf, len);
}
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_ASCII
simdutf_warn_unused bool
implementation::validate_ascii(const char *buf, size_t len) const noexcept {
  return lsx::ascii_validation::generic_validate_ascii(buf, len);
}

simdutf_warn_unused result implementation::validate_ascii_with_errors(
    const char *buf, size_t len) const noexcept {
  return lsx::ascii_validation::generic_validate_ascii_with_errors(buf, len);
}
#endif // SIMDUTF_FEATURE_ASCII

#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
simdutf_warn_unused bool
implementation::validate_utf16le(const char16_t *buf,
                                 size_t len) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    // Empty input is valid; this also protects the implementation from a
    // nullptr.
    return true;
  }
  const auto res =
      lsx::utf16::validate_utf16_with_errors<endianness::LITTLE>(buf, len);

  if (res.is_err()) {
    return false;
  }

  if (res.count != len) {
    return scalar::utf16::validate<endianness::LITTLE>(buf + res.count,
                                                       len - res.count);
  }

  return true;
}
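// Note: the vector kernel above validates whole 16-word blocks and reports in
// res.count how many code units it fully consumed; the scalar validator then
// finishes the remaining tail (for instance, a surrogate pair that straddles
// the final partial block).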
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF16
simdutf_warn_unused bool
implementation::validate_utf16be(const char16_t *buf,
                                 size_t len) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    // Empty input is valid; this also protects the implementation from a
    // nullptr.
    return true;
  }
  const auto res =
      lsx::utf16::validate_utf16_with_errors<endianness::BIG>(buf, len);

  if (res.is_err()) {
    return false;
  }

  if (res.count != len) {
    return scalar::utf16::validate<endianness::BIG>(buf + res.count,
                                                    len - res.count);
  }

  return true;
}

simdutf_warn_unused result implementation::validate_utf16le_with_errors(
    const char16_t *buf, size_t len) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    return result(error_code::SUCCESS, 0);
  }
  const result res =
      lsx::utf16::validate_utf16_with_errors<endianness::LITTLE>(buf, len);
  if (res.count != len) {
    const result scalar_res =
        scalar::utf16::validate_with_errors<endianness::LITTLE>(
            buf + res.count, len - res.count);
    return result(scalar_res.error, res.count + scalar_res.count);
  } else {
    return res;
  }
}

simdutf_warn_unused result implementation::validate_utf16be_with_errors(
    const char16_t *buf, size_t len) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    return result(error_code::SUCCESS, 0);
  }
  const result res =
      lsx::utf16::validate_utf16_with_errors<endianness::BIG>(buf, len);
  if (res.count != len) {
    const result scalar_res =
        scalar::utf16::validate_with_errors<endianness::BIG>(buf + res.count,
                                                             len - res.count);
    return result(scalar_res.error, res.count + scalar_res.count);
  } else {
    return res;
  }
}

void implementation::to_well_formed_utf16le(const char16_t *input, size_t len,
                                            char16_t *output) const noexcept {
  return scalar::utf16::to_well_formed_utf16<endianness::LITTLE>(input, len,
                                                                 output);
}

void implementation::to_well_formed_utf16be(const char16_t *input, size_t len,
                                            char16_t *output) const noexcept {
  return scalar::utf16::to_well_formed_utf16<endianness::BIG>(input, len,
                                                              output);
}
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
simdutf_warn_unused bool
implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    // Empty input is valid; this also protects the implementation from a
    // nullptr.
    return true;
  }
  const char32_t *tail = lsx_validate_utf32le(buf, len);
  if (tail) {
    return scalar::utf32::validate(tail, len - (tail - buf));
  } else {
    return false;
  }
}
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF32
simdutf_warn_unused result implementation::validate_utf32_with_errors(
    const char32_t *buf, size_t len) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    return result(error_code::SUCCESS, 0);
  }
  result res = lsx_validate_utf32le_with_errors(buf, len);
  if (res.count != len) {
    result scalar_res =
        scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
    return result(scalar_res.error, res.count + scalar_res.count);
  } else {
    return res;
  }
}
#endif // SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
    const char *buf, size_t len, char *utf8_output) const noexcept {
  std::pair<const char *, char *> ret =
      lsx_convert_latin1_to_utf8(buf, len, utf8_output);
  size_t converted_chars = ret.second - utf8_output;

  if (ret.first != buf + len) {
    const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert(
        ret.first, len - (ret.first - buf), ret.second);
    converted_chars += scalar_converted_chars;
  }
  return converted_chars;
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
  std::pair<const char *, char16_t *> ret =
      lsx_convert_latin1_to_utf16le(buf, len, utf16_output);
  size_t converted_chars = ret.second - utf16_output;
  if (ret.first != buf + len) {
    const size_t scalar_converted_chars =
        scalar::latin1_to_utf16::convert<endianness::LITTLE>(
            ret.first, len - (ret.first - buf), ret.second);
    converted_chars += scalar_converted_chars;
  }
  return converted_chars;
}

simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
  std::pair<const char *, char16_t *> ret =
      lsx_convert_latin1_to_utf16be(buf, len, utf16_output);
  size_t converted_chars = ret.second - utf16_output;
  if (ret.first != buf + len) {
    const size_t scalar_converted_chars =
        scalar::latin1_to_utf16::convert<endianness::BIG>(
            ret.first, len - (ret.first - buf), ret.second);
    converted_chars += scalar_converted_chars;
  }
  return converted_chars;
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  std::pair<const char *, char32_t *> ret =
      lsx_convert_latin1_to_utf32(buf, len, utf32_output);
  size_t converted_chars = ret.second - utf32_output;
  if (ret.first != buf + len) {
    const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert(
        ret.first, len - (ret.first - buf), ret.second);
    converted_chars += scalar_converted_chars;
  }
  return converted_chars;
}
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  utf8_to_latin1::validating_transcoder converter;
  return converter.convert(buf, len, latin1_output);
}

simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  utf8_to_latin1::validating_transcoder converter;
  return converter.convert_with_errors(buf, len, latin1_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  return lsx::utf8_to_latin1::convert_valid(buf, len, latin1_output);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
  utf8_to_utf16::validating_transcoder converter;
  return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
}

simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
  utf8_to_utf16::validating_transcoder converter;
  return converter.convert<endianness::BIG>(buf, len, utf16_output);
}

simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
  utf8_to_utf16::validating_transcoder converter;
  return converter.convert_with_errors<endianness::LITTLE>(buf, len,
                                                           utf16_output);
}

simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
  utf8_to_utf16::validating_transcoder converter;
  return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
    const char *input, size_t size, char16_t *utf16_output) const noexcept {
  return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size,
                                                          utf16_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
    const char *input, size_t size, char16_t *utf16_output) const noexcept {
  return utf8_to_utf16::convert_valid<endianness::BIG>(input, size,
                                                       utf16_output);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  utf8_to_utf32::validating_transcoder converter;
  return converter.convert(buf, len, utf32_output);
}

simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  utf8_to_utf32::validating_transcoder converter;
  return converter.convert_with_errors(buf, len, utf32_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
    const char *input, size_t size, char32_t *utf32_output) const noexcept {
  return utf8_to_utf32::convert_valid(input, size, utf32_output);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<const char16_t *, char *> ret =
      lsx_convert_utf16_to_latin1<endianness::LITTLE>(buf, len, latin1_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - latin1_output;

  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf16_to_latin1::convert<endianness::LITTLE>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<const char16_t *, char *> ret =
      lsx_convert_utf16_to_latin1<endianness::BIG>(buf, len, latin1_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - latin1_output;

  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf16_to_latin1::convert<endianness::BIG>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result
implementation::convert_utf16le_to_latin1_with_errors(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<result, char *> ret =
      lsx_convert_utf16_to_latin1_with_errors<endianness::LITTLE>(
          buf, len, latin1_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly since the scalar fallback already found the correct
    // ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res =
        scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      latin1_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused result
implementation::convert_utf16be_to_latin1_with_errors(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<result, char *> ret =
      lsx_convert_utf16_to_latin1_with_errors<endianness::BIG>(buf, len,
                                                               latin1_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly since the scalar fallback already found the correct
    // ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res =
        scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      latin1_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
  // optimization opportunity: implement a custom function.
  return convert_utf16be_to_latin1(buf, len, latin1_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
  // optimization opportunity: implement a custom function.
  return convert_utf16le_to_latin1(buf, len, latin1_output);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  std::pair<const char16_t *, char *> ret =
      lsx_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf8_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf16_to_utf8::convert<endianness::LITTLE>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  std::pair<const char16_t *, char *> ret =
      lsx_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf8_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf16_to_utf8::convert<endianness::BIG>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char *> ret =
      lsx_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len,
                                                                utf8_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly since the scalar fallback already found the correct
    // ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res =
        scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf8_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char *> ret =
      lsx_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len,
                                                             utf8_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly since the scalar fallback already found the correct
    // ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res =
        scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf8_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  return convert_utf16le_to_utf8(buf, len, utf8_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  return convert_utf16be_to_utf8(buf, len, utf8_output);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    return 0;
  }
  std::pair<const char32_t *, char *> ret =
      lsx_convert_utf32_to_utf8(buf, len, utf8_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf8_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
        ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    return result(error_code::SUCCESS, 0);
  }
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char *> ret =
      lsx_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
  if (ret.first.count != len) {
    result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
        buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf8_output; // Set count to the number of 8-bit code units written
  return ret.first;
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  std::pair<const char16_t *, char32_t *> ret =
      lsx_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf32_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf16_to_utf32::convert<endianness::LITTLE>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  std::pair<const char16_t *, char32_t *> ret =
      lsx_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf32_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf16_to_utf32::convert<endianness::BIG>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char32_t *> ret =
      lsx_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len,
                                                                 utf32_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly since the scalar fallback already found the correct
    // ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res =
        scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf32_output; // Set count to the number of 32-bit code units written
  return ret.first;
}

simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char32_t *> ret =
      lsx_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len,
                                                              utf32_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly since the scalar fallback already found the correct
    // ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res =
        scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf32_output; // Set count to the number of 32-bit code units written
  return ret.first;
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<const char32_t *, char *> ret =
      lsx_convert_utf32_to_latin1(buf, len, latin1_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - latin1_output;

  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert(
        ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<result, char *> ret =
      lsx_convert_utf32_to_latin1_with_errors(buf, len, latin1_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly since the scalar fallback already found the correct
    // ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res = scalar::utf32_to_latin1::convert_with_errors(
        buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      latin1_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<const char32_t *, char *> ret =
      lsx_convert_utf32_to_latin1(buf, len, latin1_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - latin1_output;

  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert_valid(
        ret.first, len - (ret.first - buf), ret.second);
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  // optimization opportunity: implement a custom function.
  return convert_utf32_to_utf8(buf, len, utf8_output);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  std::pair<const char32_t *, char16_t *> ret =
      lsx_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf16_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf32_to_utf16::convert<endianness::LITTLE>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }

  return saved_bytes;
}

simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  std::pair<const char32_t *, char16_t *> ret =
      lsx_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf16_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf32_to_utf16::convert<endianness::BIG>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char16_t *> ret =
      lsx_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len,
                                                                 utf16_output);
  if (ret.first.count != len) {
    result scalar_res =
        scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf16_output; // Set count to the number of 16-bit code units written
  return ret.first;
}

simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char16_t *> ret =
      lsx_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len,
                                                              utf16_output);
  if (ret.first.count != len) {
    result scalar_res =
        scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf16_output; // Set count to the number of 16-bit code units written
  return ret.first;
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  return convert_utf32_to_utf16le(buf, len, utf16_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  return convert_utf32_to_utf16be(buf, len, utf16_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  return convert_utf16le_to_utf32(buf, len, utf32_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  return convert_utf16be_to_utf32(buf, len, utf32_output);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16
void implementation::change_endianness_utf16(const char16_t *input,
                                             size_t length,
                                             char16_t *output) const noexcept {
  utf16::change_endianness_utf16(input, length, output);
}

simdutf_warn_unused size_t implementation::count_utf16le(
    const char16_t *input, size_t length) const noexcept {
  return utf16::count_code_points<endianness::LITTLE>(input, length);
}

simdutf_warn_unused size_t implementation::count_utf16be(
    const char16_t *input, size_t length) const noexcept {
  return utf16::count_code_points<endianness::BIG>(input, length);
}
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8
simdutf_warn_unused size_t
implementation::count_utf8(const char *input, size_t length) const noexcept {
  return utf8::count_code_points(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
    const char *buf, size_t len) const noexcept {
  return count_utf8(buf, len);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
    const char *input, size_t length) const noexcept {
  const uint8_t *data = reinterpret_cast<const uint8_t *>(input);
  const uint8_t *data_end = data + length;
  uint64_t result = 0;
  while (data_end - data > 16) {
    uint64_t two_bytes = 0;
    __m128i input_vec = __lsx_vld(data, 0);
    two_bytes =
        __lsx_vpickve2gr_hu(__lsx_vpcnt_h(__lsx_vmskltz_b(input_vec)), 0);
    result += 16 + two_bytes;
    data += 16;
  }
  return result + scalar::latin1::utf8_length_from_latin1((const char *)data,
                                                          data_end - data);
}
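// __lsx_vmskltz_b gathers the sign bit of each of the 16 bytes, so the
// popcount equals the number of bytes >= 0x80; each such Latin-1 byte expands
// to two UTF-8 bytes. For example, a 16-byte block holding 3 accented
// characters contributes 16 + 3 = 19 UTF-8 bytes.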
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
    const char16_t *input, size_t length) const noexcept {
  return utf16::utf8_length_from_utf16_bytemask<endianness::LITTLE>(input,
                                                                    length);
}

simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
    const char16_t *input, size_t length) const noexcept {
  return utf16::utf8_length_from_utf16_bytemask<endianness::BIG>(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
    const char16_t *input, size_t length) const noexcept {
  return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
}

simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
    const char16_t *input, size_t length) const noexcept {
  return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
    const char *input, size_t length) const noexcept {
  return utf8::utf16_length_from_utf8_bytemask(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
    const char32_t *input, size_t length) const noexcept {
  return utf32::utf8_length_from_utf32(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
    const char32_t *input, size_t length) const noexcept {
  const __m128i v_ffff = lsx_splat_u32(0x0000ffff);
  size_t pos = 0;
  size_t count = 0;
  for (; pos + 4 <= length; pos += 4) {
    __m128i in = __lsx_vld(reinterpret_cast<const uint32_t *>(input + pos), 0);
    const __m128i surrogate_bytemask = __lsx_vslt_wu(v_ffff, in);
    size_t surrogate_count = __lsx_vpickve2gr_bu(
        __lsx_vpcnt_b(__lsx_vmskltz_w(surrogate_bytemask)), 0);
    count += 4 + surrogate_count;
  }
  return count +
         scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
}
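// Each code point above 0xFFFF needs a surrogate pair (2 UTF-16 code units),
// everything else needs one. E.g., {U+0041, U+20AC, U+1F600, U+10FFFF} yields
// 4 + 2 = 6 code units for the block.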
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
    const char *input, size_t length) const noexcept {
  return utf8::count_code_points(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_BASE64
simdutf_warn_unused result implementation::base64_to_binary(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true>(output, input, length, options,
                                                last_chunk_options);
    } else {
      return compress_decode_base64<true, false>(output, input, length, options,
                                                 last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true>(output, input, length, options,
                                                 last_chunk_options);
    } else {
      return compress_decode_base64<false, false>(output, input, length,
                                                  options, last_chunk_options);
    }
  }
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true>(output, input, length, options,
                                                last_chunk_options);
    } else {
      return compress_decode_base64<true, false>(output, input, length, options,
                                                 last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true>(output, input, length, options,
                                                 last_chunk_options);
    } else {
      return compress_decode_base64<false, false>(output, input, length,
                                                  options, last_chunk_options);
    }
  }
}

simdutf_warn_unused result implementation::base64_to_binary(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true>(output, input, length, options,
                                                last_chunk_options);
    } else {
      return compress_decode_base64<true, false>(output, input, length, options,
                                                 last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true>(output, input, length, options,
                                                 last_chunk_options);
    } else {
      return compress_decode_base64<false, false>(output, input, length,
                                                  options, last_chunk_options);
    }
  }
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true>(output, input, length, options,
                                                last_chunk_options);
    } else {
      return compress_decode_base64<true, false>(output, input, length, options,
                                                 last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true>(output, input, length, options,
                                                 last_chunk_options);
    } else {
      return compress_decode_base64<false, false>(output, input, length,
                                                  options, last_chunk_options);
    }
  }
}

size_t implementation::binary_to_base64(const char *input, size_t length,
                                        char *output,
                                        base64_options options) const noexcept {
  if (options & base64_url) {
    return encode_base64<true>(output, input, length, options);
  } else {
    return encode_base64<false>(output, input, length, options);
  }
}
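// Usage sketch (assumes the simdutf front-end API): a caller would size the
// output with simdutf::base64_length_from_binary(n) before calling
// simdutf::binary_to_base64, and with
// simdutf::maximal_binary_length_from_base64 before decoding; the
// <base64_url, accept_garbage> template flags above mirror those option bits.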
#endif // SIMDUTF_FEATURE_BASE64
} // namespace lsx
} // namespace simdutf

/* begin file src/simdutf/lsx/end.h */
#undef SIMDUTF_SIMD_HAS_UNSIGNED_CMP
/* end file src/simdutf/lsx/end.h */
/* end file src/lsx/implementation.cpp */
#endif

#if SIMDUTF_IMPLEMENTATION_LASX
/* begin file src/lasx/implementation.cpp */
/* begin file src/simdutf/lasx/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "lasx"
// #define SIMDUTF_IMPLEMENTATION lasx
#define SIMDUTF_SIMD_HAS_UNSIGNED_CMP 1
/* end file src/simdutf/lasx/begin.h */
namespace simdutf {
namespace lasx {
namespace {
#ifndef SIMDUTF_LASX_H
  #error "lasx.h must be included"
#endif
using namespace simd;

#if SIMDUTF_FEATURE_UTF8
// convert vmskltz/vmskgez/vmsknz to
// simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes index
const uint8_t lasx_1_2_utf8_bytes_mask[] = {
    0,   1,   4,   5,   16,  17,  20,  21,  64,  65,  68,  69,  80,  81,  84,
    85,  2,   3,   6,   7,   18,  19,  22,  23,  66,  67,  70,  71,  82,  83,
    86,  87,  8,   9,   12,  13,  24,  25,  28,  29,  72,  73,  76,  77,  88,
    89,  92,  93,  10,  11,  14,  15,  26,  27,  30,  31,  74,  75,  78,  79,
    90,  91,  94,  95,  32,  33,  36,  37,  48,  49,  52,  53,  96,  97,  100,
    101, 112, 113, 116, 117, 34,  35,  38,  39,  50,  51,  54,  55,  98,  99,
    102, 103, 114, 115, 118, 119, 40,  41,  44,  45,  56,  57,  60,  61,  104,
    105, 108, 109, 120, 121, 124, 125, 42,  43,  46,  47,  58,  59,  62,  63,
    106, 107, 110, 111, 122, 123, 126, 127, 128, 129, 132, 133, 144, 145, 148,
    149, 192, 193, 196, 197, 208, 209, 212, 213, 130, 131, 134, 135, 146, 147,
    150, 151, 194, 195, 198, 199, 210, 211, 214, 215, 136, 137, 140, 141, 152,
    153, 156, 157, 200, 201, 204, 205, 216, 217, 220, 221, 138, 139, 142, 143,
    154, 155, 158, 159, 202, 203, 206, 207, 218, 219, 222, 223, 160, 161, 164,
    165, 176, 177, 180, 181, 224, 225, 228, 229, 240, 241, 244, 245, 162, 163,
    166, 167, 178, 179, 182, 183, 226, 227, 230, 231, 242, 243, 246, 247, 168,
    169, 172, 173, 184, 185, 188, 189, 232, 233, 236, 237, 248, 249, 252, 253,
    170, 171, 174, 175, 186, 187, 190, 191, 234, 235, 238, 239, 250, 251, 254,
    255};
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32
simdutf_really_inline __m128i lsx_swap_bytes(__m128i vec) {
  return __lsx_vshuf4i_b(vec, 0b10110001);
}
simdutf_really_inline __m256i lasx_swap_bytes(__m256i vec) {
  return __lasx_xvshuf4i_b(vec, 0b10110001);
}
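// The immediate 0b10110001 selects bytes {1, 0, 3, 2} within every 4-byte
// group, e.g. [b0 b1 b2 b3] -> [b1 b0 b3 b2], which byte-swaps each 16-bit
// lane in a single shuffle.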
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_ASCII || SIMDUTF_FEATURE_DETECT_ENCODING ||                \
    SIMDUTF_FEATURE_UTF8
simdutf_really_inline bool is_ascii(const simd8x64<uint8_t> &input) {
  return input.is_ascii();
}
#endif // SIMDUTF_FEATURE_ASCII || SIMDUTF_FEATURE_DETECT_ENCODING ||
       // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
simdutf_really_inline simd8<bool>
must_be_2_3_continuation(const simd8<uint8_t> prev2,
                         const simd8<uint8_t> prev3) {
  simd8<bool> is_third_byte = prev2 >= uint8_t(0b11100000u);
  simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u);
  return is_third_byte ^ is_fourth_byte;
}
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF8 && (SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32)
// common functions for utf8 conversions
simdutf_really_inline __m128i convert_utf8_3_byte_to_utf16(__m128i in) {
  // Low half contains 10bbbbbb|10cccccc
  // High half contains 1110aaaa|1110aaaa
  const v16u8 sh = {2, 1, 5, 4, 8, 7, 11, 10, 0, 0, 3, 3, 6, 6, 9, 9};
  const v8u16 v0fff = {0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff};

  __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, (__m128i)sh);
  // 1110aaaa => aaaa0000
  __m128i perm_high = __lsx_vslli_b(__lsx_vbsrl_v(perm, 8), 4);
  // 10bbbbbb 10cccccc => 0010bbbb bbcccccc
  __m128i composed = __lsx_vbitsel_v(__lsx_vsrli_h(perm, 2), /* perm >> 2*/
                                     perm, __lsx_vrepli_h(0x3f) /* 0x003f */);
  // 0010bbbb bbcccccc => aaaabbbb bbcccccc
  composed = __lsx_vbitsel_v(perm_high, composed, (__m128i)v0fff);

  return composed;
}

simdutf_really_inline __m128i convert_utf8_2_byte_to_utf16(__m128i in) {
  // 10bbbbbb 110aaaaa => 00bbbbbb 000aaaaa
  __m128i composed = __lsx_vand_v(in, __lsx_vldi(0x3f));
  // 00bbbbbb 000aaaaa => 00000aaa aabbbbbb
  composed = __lsx_vbitsel_v(
      __lsx_vsrli_h(__lsx_vslli_h(composed, 8), 2), /* (aaaaa << 8) >> 2 */
      __lsx_vsrli_h(composed, 8),                   /* bbbbbb >> 8 */
      __lsx_vrepli_h(0x3f));                        /* 0x003f */
  return composed;
}

simdutf_really_inline __m128i
convert_utf8_1_to_2_byte_to_utf16(__m128i in, size_t shufutf8_idx) {
  // Converts 6 1-2 byte UTF-8 characters to 6 UTF-16 characters.
  // This is a relatively easy scenario: we process SIX (6) input code units.
  // The max length in bytes of six code units spanning between 1 and 2 bytes
  // each is 12 bytes.
  __m128i sh =
      __lsx_vld(reinterpret_cast<const uint8_t *>(
                    simdutf::tables::utf8_to_utf16::shufutf8[shufutf8_idx]),
                0);
  // Shuffle
  // 1 byte: 00000000 0bbbbbbb
  // 2 byte: 110aaaaa 10bbbbbb
  __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, sh);
  // 1 byte: 00000000 0bbbbbbb
  // 2 byte: 00000000 00bbbbbb
  __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_h(0x7f)); // 6 or 7 bits
  // 1 byte: 00000000 00000000
  // 2 byte: 00000aaa aa000000
  __m128i v1f00 = lsx_splat_u16(0x1f00);
  __m128i composed = __lsx_vsrli_h(__lsx_vand_v(perm, v1f00), 2); // 5 bits
  // Combine with a shift right accumulate
  // 1 byte: 00000000 0bbbbbbb
  // 2 byte: 00000aaa aabbbbbb
  composed = __lsx_vadd_h(ascii, composed);
  return composed;
}
#endif // SIMDUTF_FEATURE_UTF8 && (SIMDUTF_FEATURE_UTF16 ||
       // SIMDUTF_FEATURE_UTF32)

#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
/* begin file src/lasx/lasx_validate_utf16.cpp */
template <endianness big_endian>
simd8<uint8_t> utf16_gather_high_bytes(const simd16<uint16_t> in0,
                                       const simd16<uint16_t> in1) {
  if (big_endian) {
    const auto mask = simd16<uint16_t>(0x00ff);
    const auto t0 = in0 & mask;
    const auto t1 = in1 & mask;

    return simd16<uint16_t>::pack(t0, t1);
  } else {
    return simd16<uint16_t>::pack_shifted_right<8>(in0, in1);
  }
}
/* end file src/lasx/lasx_validate_utf16.cpp */
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
/* begin file src/lasx/lasx_validate_utf32le.cpp */
const char32_t *lasx_validate_utf32le(const char32_t *input, size_t size) {
  const char32_t *end = input + size;

  // Performance degradation when memory address is not 32-byte aligned
  while (((uint64_t)input & 0x1F) && input < end) {
    uint32_t word = *input++;
    if (word > 0x10FFFF || (word >= 0xD800 && word <= 0xDFFF)) {
      return nullptr;
    }
  }

  __m256i offset = lasx_splat_u32(0xffff2000);
  __m256i standardoffsetmax = lasx_splat_u32(0xfffff7ff);
  __m256i standardmax = lasx_splat_u32(0x10ffff);
  __m256i currentmax = __lasx_xvldi(0x0);
  __m256i currentoffsetmax = __lasx_xvldi(0x0);
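  // The offset trick maps the surrogate range onto the very top of the
  // unsigned 32-bit range: 0x0000D800 + 0xFFFF2000 = 0xFFFFF800, which exceeds
  // the 0xFFFFF7FF threshold, while 0x0000D7FF maps to exactly 0xFFFFF7FF and
  // any scalar value up to 0x10FFFF wraps around to a small value.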
  while (input + 8 < end) {
    __m256i in = __lasx_xvld(reinterpret_cast<const uint32_t *>(input), 0);
    currentmax = __lasx_xvmax_wu(in, currentmax);
    // 0xD8__ + 0x2000 = 0xF8__ => 0xF8__ > 0xF7FF
    currentoffsetmax =
        __lasx_xvmax_wu(__lasx_xvadd_w(in, offset), currentoffsetmax);
    input += 8;
  }
  __m256i is_zero =
      __lasx_xvxor_v(__lasx_xvmax_wu(currentmax, standardmax), standardmax);
  if (__lasx_xbnz_v(is_zero)) {
    return nullptr;
  }

  is_zero = __lasx_xvxor_v(__lasx_xvmax_wu(currentoffsetmax, standardoffsetmax),
                           standardoffsetmax);
  if (__lasx_xbnz_v(is_zero)) {
    return nullptr;
  }
  return input;
}

const result lasx_validate_utf32le_with_errors(const char32_t *input,
                                               size_t size) {
  const char32_t *start = input;
  const char32_t *end = input + size;

  // Performance degradation when memory address is not 32-byte aligned
  while (((uint64_t)input & 0x1F) && input < end) {
    uint32_t word = *input;
    if (word > 0x10FFFF) {
      return result(error_code::TOO_LARGE, input - start);
    }
    if (word >= 0xD800 && word <= 0xDFFF) {
      return result(error_code::SURROGATE, input - start);
    }
    input++;
  }

  __m256i offset = lasx_splat_u32(0xffff2000);
  __m256i standardoffsetmax = lasx_splat_u32(0xfffff7ff);
  __m256i standardmax = lasx_splat_u32(0x10ffff);
  __m256i currentmax = __lasx_xvldi(0x0);
  __m256i currentoffsetmax = __lasx_xvldi(0x0);

  while (input + 8 < end) {
    __m256i in = __lasx_xvld(reinterpret_cast<const uint32_t *>(input), 0);
    currentmax = __lasx_xvmax_wu(in, currentmax);
    currentoffsetmax =
        __lasx_xvmax_wu(__lasx_xvadd_w(in, offset), currentoffsetmax);

    __m256i is_zero =
        __lasx_xvxor_v(__lasx_xvmax_wu(currentmax, standardmax), standardmax);
    if (__lasx_xbnz_v(is_zero)) {
      return result(error_code::TOO_LARGE, input - start);
    }
    is_zero =
        __lasx_xvxor_v(__lasx_xvmax_wu(currentoffsetmax, standardoffsetmax),
                       standardoffsetmax);
    if (__lasx_xbnz_v(is_zero)) {
      return result(error_code::SURROGATE, input - start);
    }
    input += 8;
  }

  return result(error_code::SUCCESS, input - start);
}
/* end file src/lasx/lasx_validate_utf32le.cpp */
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
|
|
/* begin file src/lasx/lasx_convert_latin1_to_utf8.cpp */
|
|
/*
|
|
Returns a pair: the first unprocessed byte from buf and utf8_output
|
|
A scalar routing should carry on the conversion of the tail.
|
|
*/
|
|
|
|
std::pair<const char *, char *>
|
|
lasx_convert_latin1_to_utf8(const char *latin1_input, size_t len,
|
|
char *utf8_out) {
|
|
uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
|
|
const size_t safety_margin = 12;
|
|
const char *end = latin1_input + len;
|
|
|
|
// We always write 16 bytes, of which more than the first 8 bytes
|
|
// are valid. A safety margin of 8 is more than sufficient.
|
|
while (end - latin1_input >= std::ptrdiff_t(16 + safety_margin)) {
|
|
__m128i in8 = __lsx_vld(reinterpret_cast<const uint8_t *>(latin1_input), 0);
|
|
uint32_t ascii_mask = __lsx_vpickve2gr_wu(__lsx_vmskgez_b(in8), 0);
|
|
if (ascii_mask == 0xFFFF) {
|
|
__lsx_vst(in8, utf8_output, 0);
|
|
utf8_output += 16;
|
|
latin1_input += 16;
|
|
continue;
|
|
}
|
|
    // We just fall back on the UTF-16 code path. This could be
    // optimized/simplified further.
|
|
__m256i in16 = __lasx_vext2xv_hu_bu(____m256i(in8));
|
|
// 1. prepare 2-byte values
|
|
// input 8-bit word : [aabb|bbbb] x 16
|
|
// expected output : [1100|00aa|10bb|bbbb] x 16
|
|
// t0 = [0000|00aa|bbbb|bb00]
|
|
__m256i t0 = __lasx_xvslli_h(in16, 2);
|
|
// t1 = [0000|00aa|0000|0000]
|
|
__m256i t1 = __lasx_xvand_v(t0, lasx_splat_u16(0x300));
|
|
    // t2 = [0000|00aa|00bb|bbbb]
    __m256i t2 = __lasx_xvbitsel_v(t1, in16, __lasx_xvrepli_h(0x3f));
    // t3 = [1100|00aa|10bb|bbbb]
    __m256i t3 = __lasx_xvor_v(t2, __lasx_xvreplgr2vr_h(uint16_t(0xc080)));
|
|
// merge ASCII and 2-byte codewords
|
|
__m256i one_byte_bytemask = __lasx_xvsle_hu(in16, __lasx_xvrepli_h(0x7F));
|
|
__m256i utf8_unpacked = __lasx_xvbitsel_v(t3, in16, one_byte_bytemask);
|
|
|
|
const uint8_t *row0 =
|
|
&simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
|
|
[lasx_1_2_utf8_bytes_mask[(ascii_mask & 0xFF)]][0];
|
|
__m128i shuffle0 = __lsx_vld(row0 + 1, 0);
|
|
__m128i utf8_unpacked_lo = lasx_extracti128_lo(utf8_unpacked);
|
|
__m128i utf8_packed0 =
|
|
__lsx_vshuf_b(utf8_unpacked_lo, utf8_unpacked_lo, shuffle0);
|
|
__lsx_vst(utf8_packed0, utf8_output, 0);
|
|
utf8_output += row0[0];
|
|
|
|
const uint8_t *row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
|
|
[lasx_1_2_utf8_bytes_mask[(ascii_mask >> 8)]][0];
|
|
__m128i shuffle1 = __lsx_vld(row1 + 1, 0);
|
|
__m128i utf8_unpacked_hi = lasx_extracti128_hi(utf8_unpacked);
|
|
__m128i utf8_packed1 =
|
|
__lsx_vshuf_b(utf8_unpacked_hi, utf8_unpacked_hi, shuffle1);
|
|
__lsx_vst(utf8_packed1, utf8_output, 0);
|
|
utf8_output += row1[0];
|
|
|
|
latin1_input += 16;
|
|
} // while
|
|
|
|
return std::make_pair(latin1_input, reinterpret_cast<char *>(utf8_output));
|
|
}
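
// A hedged usage sketch of the pair-returning convention above: the caller
// converts the bulk with the vector kernel and lets scalar code finish the
// tail. The loop below is illustrative only, not simdutf's actual tail
// handler, and src/len/dst are hypothetical names.
//
//   std::pair<const char *, char *> r =
//       lasx_convert_latin1_to_utf8(src, len, dst);
//   const char *p = r.first;
//   char *out = r.second;
//   for (; p != src + len; ++p) {
//     uint8_t c = uint8_t(*p);
//     if (c < 0x80) {
//       *out++ = char(c);
//     } else {
//       *out++ = char(0xC0 | (c >> 6));   // [1100|00aa]
//       *out++ = char(0x80 | (c & 0x3F)); // [10bb|bbbb]
//     }
//   }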
|
|
/* end file src/lasx/lasx_convert_latin1_to_utf8.cpp */
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
|
|
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
|
|
/* begin file src/lasx/lasx_convert_latin1_to_utf16.cpp */
|
|
std::pair<const char *, char16_t *>
|
|
lasx_convert_latin1_to_utf16le(const char *buf, size_t len,
|
|
char16_t *utf16_output) {
|
|
const char *end = buf + len;
|
|
|
|
  // Handle bytes one by one until the output pointer is 32-byte aligned;
  // unaligned LASX accesses degrade performance.
|
|
while (((uint64_t)utf16_output & 0x1F) && buf < end) {
|
|
*utf16_output++ = uint8_t(*buf) & 0xFF;
|
|
buf++;
|
|
}
|
|
|
|
while (end - buf >= 32) {
|
|
__m256i in8 = __lasx_xvld(reinterpret_cast<const uint8_t *>(buf), 0);
|
|
|
|
__m256i inlow = __lasx_vext2xv_hu_bu(in8);
|
|
__m256i in8_high = __lasx_xvpermi_q(in8, in8, 0b00000001);
|
|
__m256i inhigh = __lasx_vext2xv_hu_bu(in8_high);
|
|
__lasx_xvst(inlow, reinterpret_cast<uint16_t *>(utf16_output), 0);
|
|
__lasx_xvst(inhigh, reinterpret_cast<uint16_t *>(utf16_output), 32);
|
|
|
|
utf16_output += 32;
|
|
buf += 32;
|
|
}
|
|
|
|
if (end - buf >= 16) {
|
|
__m128i zero = __lsx_vldi(0);
|
|
__m128i in8 = __lsx_vld(reinterpret_cast<const uint8_t *>(buf), 0);
|
|
|
|
__m128i inlow = __lsx_vilvl_b(zero, in8);
|
|
__m128i inhigh = __lsx_vilvh_b(zero, in8);
|
|
__lsx_vst(inlow, reinterpret_cast<uint16_t *>(utf16_output), 0);
|
|
__lsx_vst(inhigh, reinterpret_cast<uint16_t *>(utf16_output), 16);
|
|
|
|
utf16_output += 16;
|
|
buf += 16;
|
|
}
|
|
return std::make_pair(buf, utf16_output);
|
|
}
|
|
|
|
std::pair<const char *, char16_t *>
|
|
lasx_convert_latin1_to_utf16be(const char *buf, size_t len,
|
|
char16_t *utf16_output) {
|
|
const char *end = buf + len;
|
|
|
|
while (((uint64_t)utf16_output & 0x1F) && buf < end) {
|
|
*utf16_output++ = (uint16_t(*buf++) << 8);
|
|
}
|
|
|
|
__m256i zero = __lasx_xvldi(0);
|
|
while (end - buf >= 32) {
|
|
__m256i in8 = __lasx_xvld(reinterpret_cast<const uint8_t *>(buf), 0);
|
|
|
|
__m256i in8_shuf = __lasx_xvpermi_d(in8, 0b11011000);
|
|
|
|
__m256i inlow = __lasx_xvilvl_b(in8_shuf, zero);
|
|
__m256i inhigh = __lasx_xvilvh_b(in8_shuf, zero);
|
|
__lasx_xvst(inlow, reinterpret_cast<uint16_t *>(utf16_output), 0);
|
|
__lasx_xvst(inhigh, reinterpret_cast<uint16_t *>(utf16_output), 32);
|
|
utf16_output += 32;
|
|
buf += 32;
|
|
}
|
|
|
|
if (end - buf >= 16) {
|
|
__m128i zero_128 = __lsx_vldi(0);
|
|
__m128i in8 = __lsx_vld(reinterpret_cast<const uint8_t *>(buf), 0);
|
|
|
|
__m128i inlow = __lsx_vilvl_b(in8, zero_128);
|
|
__m128i inhigh = __lsx_vilvh_b(in8, zero_128);
|
|
__lsx_vst(inlow, reinterpret_cast<uint16_t *>(utf16_output), 0);
|
|
__lsx_vst(inhigh, reinterpret_cast<uint16_t *>(utf16_output), 16);
|
|
utf16_output += 16;
|
|
buf += 16;
|
|
}
|
|
|
|
return std::make_pair(buf, utf16_output);
|
|
}
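
// Scalar equivalent of one element of the big-endian path above: on a
// little-endian host, the UTF-16BE code unit for a Latin-1 byte is the byte
// moved into the high half of the 16-bit value (illustrative helper, not
// part of simdutf's API).
inline char16_t lasx_latin1_to_utf16be_sketch(uint8_t c) {
  return char16_t(uint16_t(c) << 8);
}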
|
|
/* end file src/lasx/lasx_convert_latin1_to_utf16.cpp */
|
|
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
|
|
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
|
|
/* begin file src/lasx/lasx_convert_latin1_to_utf32.cpp */
|
|
std::pair<const char *, char32_t *>
|
|
lasx_convert_latin1_to_utf32(const char *buf, size_t len,
|
|
char32_t *utf32_output) {
|
|
const char *end = buf + len;
|
|
|
|
  // Unaligned LASX stores degrade performance, so handle bytes one by one
  // until the output pointer is 32-byte aligned.
|
|
while (((uint64_t)utf32_output & 0x1F) && buf < end) {
|
|
*utf32_output++ = ((uint32_t)*buf) & 0xFF;
|
|
buf++;
|
|
}
|
|
|
|
while (end - buf >= 32) {
|
|
__m256i in8 = __lasx_xvld(reinterpret_cast<const uint8_t *>(buf), 0);
|
|
|
|
__m256i in32_0 = __lasx_vext2xv_wu_bu(in8);
|
|
__lasx_xvst(in32_0, reinterpret_cast<uint32_t *>(utf32_output), 0);
|
|
|
|
__m256i in8_1 = __lasx_xvpermi_d(in8, 0b00000001);
|
|
__m256i in32_1 = __lasx_vext2xv_wu_bu(in8_1);
|
|
__lasx_xvst(in32_1, reinterpret_cast<uint32_t *>(utf32_output), 32);
|
|
|
|
__m256i in8_2 = __lasx_xvpermi_d(in8, 0b00000010);
|
|
__m256i in32_2 = __lasx_vext2xv_wu_bu(in8_2);
|
|
__lasx_xvst(in32_2, reinterpret_cast<uint32_t *>(utf32_output), 64);
|
|
|
|
__m256i in8_3 = __lasx_xvpermi_d(in8, 0b00000011);
|
|
__m256i in32_3 = __lasx_vext2xv_wu_bu(in8_3);
|
|
__lasx_xvst(in32_3, reinterpret_cast<uint32_t *>(utf32_output), 96);
|
|
|
|
utf32_output += 32;
|
|
buf += 32;
|
|
}
|
|
|
|
if (end - buf >= 16) {
|
|
__m128i in8 = __lsx_vld(reinterpret_cast<const uint8_t *>(buf), 0);
|
|
|
|
__m128i zero = __lsx_vldi(0);
|
|
__m128i in16low = __lsx_vilvl_b(zero, in8);
|
|
__m128i in16high = __lsx_vilvh_b(zero, in8);
|
|
__m128i in32_0 = __lsx_vilvl_h(zero, in16low);
|
|
__m128i in32_1 = __lsx_vilvh_h(zero, in16low);
|
|
__m128i in32_2 = __lsx_vilvl_h(zero, in16high);
|
|
__m128i in32_3 = __lsx_vilvh_h(zero, in16high);
|
|
|
|
__lsx_vst(in32_0, reinterpret_cast<uint32_t *>(utf32_output), 0);
|
|
__lsx_vst(in32_1, reinterpret_cast<uint32_t *>(utf32_output), 16);
|
|
__lsx_vst(in32_2, reinterpret_cast<uint32_t *>(utf32_output), 32);
|
|
__lsx_vst(in32_3, reinterpret_cast<uint32_t *>(utf32_output), 48);
|
|
|
|
utf32_output += 16;
|
|
buf += 16;
|
|
}
|
|
|
|
return std::make_pair(buf, utf32_output);
|
|
}
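
// The whole Latin-1 to UTF-32 conversion is a zero-extension; a scalar
// reference equivalent to the vector kernel above (helper name illustrative):
inline void lasx_latin1_to_utf32_scalar_sketch(const char *in, size_t n,
                                               char32_t *out) {
  for (size_t i = 0; i < n; i++) {
    out[i] = char32_t(uint8_t(in[i])); // zero-extend 8 -> 32 bits
  }
}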
|
|
/* end file src/lasx/lasx_convert_latin1_to_utf32.cpp */
|
|
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
|
|
/* begin file src/lasx/lasx_convert_utf8_to_utf16.cpp */
|
|
// Convert up to 16 bytes from utf8 to utf16 using a mask indicating the
|
|
// end of the code points. Only the least significant 12 bits of the mask
|
|
// are accessed.
|
|
// It returns how many bytes were consumed (up to 16, usually 12).
|
|
template <endianness big_endian>
|
|
size_t convert_masked_utf8_to_utf16(const char *input,
|
|
uint64_t utf8_end_of_code_point_mask,
|
|
char16_t *&utf16_output) {
|
|
// we use an approach where we try to process up to 12 input bytes.
|
|
// Why 12 input bytes and not 16? Because we are concerned with the size of
|
|
// the lookup tables. Also 12 is nicely divisible by two and three.
|
|
//
|
|
__m128i in = __lsx_vld(reinterpret_cast<const uint8_t *>(input), 0);
|
|
const uint16_t input_utf8_end_of_code_point_mask =
|
|
utf8_end_of_code_point_mask & 0xfff;
|
|
//
|
|
// Optimization note: our main path below is load-latency dependent. Thus it
|
|
// is maybe beneficial to have fast paths that depend on branch prediction but
|
|
// have less latency. This results in more instructions but, potentially, also
|
|
// higher speeds.
|
|
|
|
// We first try a few fast paths.
|
|
  // The obvious first test is ASCII, which actually consumes the full 16
  // bytes.
|
|
if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xFFFF) {
|
|
__m128i zero = __lsx_vldi(0);
|
|
if (match_system(big_endian)) {
|
|
__lsx_vst(__lsx_vilvl_b(zero, in),
|
|
reinterpret_cast<uint16_t *>(utf16_output), 0);
|
|
__lsx_vst(__lsx_vilvh_b(zero, in),
|
|
reinterpret_cast<uint16_t *>(utf16_output), 16);
|
|
} else {
|
|
__lsx_vst(__lsx_vilvl_b(in, zero),
|
|
reinterpret_cast<uint16_t *>(utf16_output), 0);
|
|
__lsx_vst(__lsx_vilvh_b(in, zero),
|
|
reinterpret_cast<uint16_t *>(utf16_output), 16);
|
|
}
|
|
utf16_output += 16; // We wrote 16 16-bit characters.
|
|
return 16; // We consumed 16 bytes.
|
|
}
|
|
|
|
// 3 byte sequences are the next most common, as seen in CJK, which has long
|
|
// sequences of these.
|
|
if (input_utf8_end_of_code_point_mask == 0x924) {
|
|
// We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte
|
|
// UTF-16 code units.
|
|
__m128i composed = convert_utf8_3_byte_to_utf16(in);
|
|
// Byte swap if necessary
|
|
if (!match_system(big_endian)) {
|
|
composed = lsx_swap_bytes(composed);
|
|
}
|
|
|
|
__lsx_vst(composed, reinterpret_cast<uint16_t *>(utf16_output), 0);
|
|
utf16_output += 4; // We wrote 4 16-bit characters.
|
|
return 12; // We consumed 12 bytes.
|
|
}
|
|
|
|
// 2 byte sequences occur in short bursts in languages like Greek and Russian.
|
|
  if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xAAAA) {
    // We want to take 8 2-byte UTF-8 code units and turn them into 8 2-byte
    // UTF-16 code units.
    __m128i composed = convert_utf8_2_byte_to_utf16(in);
    // Byte swap if necessary
    if (!match_system(big_endian)) {
      composed = lsx_swap_bytes(composed);
    }

    __lsx_vst(composed, reinterpret_cast<uint16_t *>(utf16_output), 0);
    utf16_output += 8; // We wrote 8 16-bit characters.
    return 16;         // We consumed 16 bytes.
  }
|
|
|
|
/// We do not have a fast path available, or the fast path is unimportant, so
|
|
/// we fallback.
|
|
const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
|
|
[input_utf8_end_of_code_point_mask][0];
|
|
|
|
const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
|
|
[input_utf8_end_of_code_point_mask][1];
|
|
const __m128i zero = __lsx_vldi(0);
|
|
if (idx < 64) {
|
|
    // SIX (6) input code units
|
|
// Convert to UTF-16
|
|
__m128i composed = convert_utf8_1_to_2_byte_to_utf16(in, idx);
|
|
// Byte swap if necessary
|
|
if (!match_system(big_endian)) {
|
|
composed = lsx_swap_bytes(composed);
|
|
}
|
|
// Store
|
|
__lsx_vst(composed, reinterpret_cast<uint16_t *>(utf16_output), 0);
|
|
utf16_output += 6; // We wrote 6 16-bit characters.
|
|
return consumed;
|
|
} else if (idx < 145) {
|
|
    // FOUR (4) input code units
|
|
// UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing.
|
|
__m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
|
|
simdutf::tables::utf8_to_utf16::shufutf8[idx]),
|
|
0);
|
|
// XXX: depending on the system scalar instructions might be faster.
|
|
// 1 byte: 00000000 00000000 0ccccccc
|
|
// 2 byte: 00000000 110bbbbb 10cccccc
|
|
// 3 byte: 1110aaaa 10bbbbbb 10cccccc
|
|
sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
|
|
__m128i perm = __lsx_vshuf_b(zero, in, sh);
|
|
// 1 byte: 00000000 0ccccccc
|
|
// 2 byte: xx0bbbbb x0cccccc
|
|
// 3 byte: xxbbbbbb x0cccccc
|
|
__m128i lowperm = __lsx_vpickev_h(perm, perm);
|
|
// 1 byte: 00000000 00000000
|
|
// 2 byte: 00000000 00000000
|
|
// 3 byte: 00000000 1110aaaa
|
|
__m128i highperm = __lsx_vpickod_h(perm, perm);
|
|
// 3 byte: aaaa0000 00000000
|
|
highperm = __lsx_vslli_h(highperm, 12);
|
|
// ASCII
|
|
// 1 byte: 00000000 0ccccccc
|
|
// 2+byte: 00000000 00cccccc
|
|
__m128i ascii = __lsx_vand_v(lowperm, __lsx_vrepli_h(0x7f));
|
|
// 1 byte: 00000000 00000000
|
|
// 2 byte: xx0bbbbb 00000000
|
|
// 3 byte: xxbbbbbb 00000000
|
|
__m128i middlebyte = __lsx_vand_v(lowperm, lsx_splat_u16(0xFF00));
|
|
// 1 byte: 00000000 0ccccccc
|
|
// 2 byte: 0010bbbb bbcccccc
|
|
// 3 byte: 0010bbbb bbcccccc
|
|
__m128i composed = __lsx_vor_v(__lsx_vsrli_h(middlebyte, 2), ascii);
|
|
|
|
__m128i v0fff = __lsx_vreplgr2vr_h(uint16_t(0xfff));
|
|
// aaaabbbb bbcccccc
|
|
composed = __lsx_vbitsel_v(highperm, composed, v0fff);
|
|
|
|
if (!match_system(big_endian)) {
|
|
composed = lsx_swap_bytes(composed);
|
|
}
|
|
|
|
__lsx_vst(composed, reinterpret_cast<uint16_t *>(utf16_output), 0);
|
|
utf16_output += 4; // We wrote 4 16-bit codepoints
|
|
return consumed;
|
|
} else if (idx < 209) {
|
|
    // THREE (3) input code units
|
|
if (input_utf8_end_of_code_point_mask == 0x888) {
|
|
__m128i expected_mask =
|
|
(__m128i)v16u8{0xf8, 0xc0, 0xc0, 0xc0, 0xf8, 0xc0, 0xc0, 0xc0,
|
|
0xf8, 0xc0, 0xc0, 0xc0, 0x0, 0x0, 0x0, 0x0};
|
|
__m128i expected =
|
|
(__m128i)v16u8{0xf0, 0x80, 0x80, 0x80, 0xf0, 0x80, 0x80, 0x80,
|
|
0xf0, 0x80, 0x80, 0x80, 0x0, 0x0, 0x0, 0x0};
|
|
__m128i check = __lsx_vseq_b(__lsx_vand_v(in, expected_mask), expected);
|
|
if (__lsx_bz_b(check))
|
|
return 12;
|
|
// We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte
|
|
// UTF-16 pairs. Generating surrogate pairs is a little tricky though, but
|
|
// it is easier when we can assume they are all pairs. This version does
|
|
// not use the LUT, but 4 byte sequences are less common and the overhead
|
|
// of the extra memory access is less important than the early branch
|
|
// overhead in shorter sequences.
|
|
|
|
// Swap byte pairs
|
|
// 10dddddd 10cccccc|10bbbbbb 11110aaa
|
|
// 10cccccc 10dddddd|11110aaa 10bbbbbb
|
|
__m128i swap = lsx_swap_bytes(in);
|
|
// Shift left 2 bits
|
|
// cccccc00 dddddd00 xxxxxxxx bbbbbb00
|
|
__m128i shift = __lsx_vslli_b(swap, 2);
|
|
// Create a magic number containing the low 2 bits of the trail surrogate
|
|
// and all the corrections needed to create the pair. UTF-8 4b prefix =
|
|
// -0x0000|0xF000 surrogate offset = -0x0000|0x0040 (0x10000 << 6)
|
|
// surrogate high = +0x0000|0xD800
|
|
// surrogate low = +0xDC00|0x0000
|
|
// -------------------------------
|
|
// = +0xDC00|0xE7C0
|
|
__m128i magic = __lsx_vreplgr2vr_w(uint32_t(0xDC00E7C0));
|
|
// Generate unadjusted trail surrogate minus lowest 2 bits
|
|
// xxxxxxxx xxxxxxxx|11110aaa bbbbbb00
|
|
__m128i trail = __lsx_vbitsel_v(shift, swap, lsx_splat_u32(0x0000FF00));
|
|
// Insert low 2 bits of trail surrogate to magic number for later
|
|
// 11011100 00000000 11100111 110000cc
|
|
__m128i magic_with_low_2 = __lsx_vor_v(__lsx_vsrli_w(shift, 30), magic);
|
|
|
|
// Generate lead surrogate
|
|
// xxxxcccc ccdddddd|xxxxxxxx xxxxxxxx
|
|
// 000000cc ccdddddd|xxxxxxxx xxxxxxxx
|
|
__m128i lead = __lsx_vbitsel_v(
|
|
__lsx_vsrli_h(__lsx_vand_v(shift, __lsx_vldi(0x3F)), 4), swap,
|
|
__lsx_vrepli_h(0x3f /* 0x003f*/));
|
|
|
|
// Blend pairs
|
|
// 000000cc ccdddddd|11110aaa bbbbbb00
|
|
__m128i blend = __lsx_vbitsel_v(lead, trail, lsx_splat_u32(0x0000FFFF));
|
|
|
|
// Add magic number to finish the result
|
|
// 110111CC CCDDDDDD|110110AA BBBBBBCC
|
|
__m128i composed = __lsx_vadd_h(blend, magic_with_low_2);
|
|
// Byte swap if necessary
|
|
if (!match_system(big_endian)) {
|
|
composed = lsx_swap_bytes(composed);
|
|
}
|
|
__lsx_vst(composed, reinterpret_cast<uint16_t *>(utf16_output), 0);
|
|
      utf16_output += 6; // We wrote 3 surrogate pairs (6 16-bit code units).
|
|
return 12; // We consumed 12 bytes.
|
|
}
|
|
// 3 1-4 byte sequences
|
|
__m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
|
|
simdutf::tables::utf8_to_utf16::shufutf8[idx]),
|
|
0);
|
|
// 1 byte: 00000000 00000000 00000000 0ddddddd
|
|
    // 2 byte: 00000000 00000000 110ccccc 10dddddd
|
|
// 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
|
|
// 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
|
|
sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
|
|
__m128i perm = __lsx_vshuf_b(zero, in, sh);
|
|
// added to fix issue https://github.com/simdutf/simdutf/issues/514
|
|
// We only want to write 2 * 16-bit code units when that is actually what we
|
|
// have. Unfortunately, we cannot trust the input. So it is possible to get
|
|
// 0xff as an input byte and it should not result in a surrogate pair. We
|
|
// need to check for that.
|
|
uint32_t permbuffer[4];
|
|
__lsx_vst(perm, permbuffer, 0);
|
|
// Mask the low and middle bytes
|
|
// 00000000 00000000 00000000 0ddddddd
|
|
__m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7f));
|
|
// Because the surrogates need more work, the high surrogate is computed
|
|
// first.
|
|
__m128i middlehigh = __lsx_vslli_w(perm, 2);
|
|
// 00000000 00000000 00cccccc 00000000
|
|
__m128i middlebyte = __lsx_vand_v(perm, lsx_splat_u32(0x00003F00));
|
|
// Start assembling the sequence. Since the 4th byte is in the same position
|
|
// as it would be in a surrogate and there is no dependency, shift left
|
|
// instead of right. 3 byte: 00000000 10bbbbxx xxxxxxxx xxxxxxxx 4 byte:
|
|
// 11110aaa bbbbbbxx xxxxxxxx xxxxxxxx
|
|
__m128i ab = __lsx_vbitsel_v(middlehigh, perm, lsx_splat_u32(0xFF000000));
|
|
// Top 16 bits contains the high ten bits of the surrogate pair before
|
|
// correction 3 byte: 00000000 10bbbbcc|cccc0000 00000000 4 byte: 11110aaa
|
|
// bbbbbbcc|cccc0000 00000000 - high 10 bits correct w/o correction
|
|
__m128i v_fffc0000 = __lsx_vreplgr2vr_w(uint32_t(0xFFFC0000));
|
|
__m128i abc = __lsx_vbitsel_v(__lsx_vslli_w(middlebyte, 4), ab, v_fffc0000);
|
|
// Combine the low 6 or 7 bits by a shift right accumulate
|
|
// 3 byte: 00000000 00000010|bbbbcccc ccdddddd - low 16 bits correct
|
|
// 4 byte: 00000011 110aaabb|bbbbcccc ccdddddd - low 10 bits correct w/o
|
|
// correction
|
|
__m128i composed = __lsx_vor_v(ascii, __lsx_vsrli_w(abc, 6));
|
|
// After this is for surrogates
|
|
// Blend the low and high surrogates
|
|
// 4 byte: 11110aaa bbbbbbcc|bbbbcccc ccdddddd
|
|
__m128i mixed = __lsx_vbitsel_v(abc, composed, lsx_splat_u32(0x0000FFFF));
|
|
// Clear the upper 6 bits of the low surrogate. Don't clear the upper bits
|
|
// yet as 0x10000 was not subtracted from the codepoint yet. 4 byte:
|
|
// 11110aaa bbbbbbcc|000000cc ccdddddd
|
|
__m128i v_ffff03ff = __lsx_vreplgr2vr_w(uint32_t(0xFFFF03FF));
|
|
__m128i masked_pair = __lsx_vand_v(mixed, v_ffff03ff);
|
|
// Correct the remaining UTF-8 prefix, surrogate offset, and add the
|
|
// surrogate prefixes in one magic 16-bit addition. similar magic number but
|
|
// without the continue byte adjust and halfword swapped UTF-8 4b prefix =
|
|
// -0xF000|0x0000 surrogate offset = -0x0040|0x0000 (0x10000 << 6)
|
|
// surrogate high = +0xD800|0x0000
|
|
// surrogate low = +0x0000|0xDC00
|
|
// -----------------------------------
|
|
// = +0xE7C0|0xDC00
|
|
__m128i magic = __lsx_vreplgr2vr_w(uint32_t(0xE7C0DC00));
|
|
// 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD - surrogate pair complete
|
|
__m128i surrogates = __lsx_vadd_w(masked_pair, magic);
|
|
// If the high bit is 1 (s32 less than zero), this needs a surrogate pair
|
|
__m128i is_pair = __lsx_vslt_w(perm, zero);
|
|
// Select either the 4 byte surrogate pair or the 2 byte solo codepoint
|
|
// 3 byte: 0xxxxxxx xxxxxxxx|bbbbcccc ccdddddd
|
|
// 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD
|
|
__m128i selected = __lsx_vbitsel_v(composed, surrogates, is_pair);
|
|
// Byte swap if necessary
|
|
if (!match_system(big_endian)) {
|
|
selected = lsx_swap_bytes(selected);
|
|
}
|
|
// Attempting to shuffle and store would be complex, just scalarize.
|
|
uint32_t buffer_tmp[4];
|
|
__lsx_vst(selected, buffer_tmp, 0);
|
|
// Test for the top bit of the surrogate mask. Remove due to issue 514
|
|
// const uint32_t SURROGATE_MASK = match_system(big_endian) ? 0x80000000 :
|
|
// 0x00800000;
|
|
for (size_t i = 0; i < 3; i++) {
|
|
// Surrogate
|
|
// Used to be if (buffer[i] & SURROGATE_MASK) {
|
|
// See discussion above.
|
|
// patch for issue https://github.com/simdutf/simdutf/issues/514
|
|
if ((permbuffer[i] & 0xf8000000) == 0xf0000000) {
|
|
utf16_output[0] = uint16_t(buffer_tmp[i] >> 16);
|
|
utf16_output[1] = uint16_t(buffer_tmp[i] & 0xFFFF);
|
|
utf16_output += 2;
|
|
} else {
|
|
utf16_output[0] = uint16_t(buffer_tmp[i] & 0xFFFF);
|
|
utf16_output++;
|
|
}
|
|
}
|
|
return consumed;
|
|
} else {
|
|
// here we know that there is an error but we do not handle errors
|
|
return 12;
|
|
}
|
|
}
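
// Worked example for the surrogate-pair magic constants used above, assuming
// the standard UTF-16 encoding of supplementary code points. For U+1F600:
//
//   uint32_t cp = 0x1F600;
//   uint16_t hi = uint16_t(0xD800 + ((cp - 0x10000) >> 10));   // 0xD83D
//   uint16_t lo = uint16_t(0xDC00 + ((cp - 0x10000) & 0x3FF)); // 0xDE00
//
// The vector path folds the -0x10000 bias, the removal of the UTF-8 prefix
// bits and both surrogate bases into a single 32-bit addition (0xDC00E7C0 or
// 0xE7C0DC00, depending on the halfword order).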
|
|
/* end file src/lasx/lasx_convert_utf8_to_utf16.cpp */
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
|
|
/* begin file src/lasx/lasx_convert_utf8_to_utf32.cpp */
|
|
// Convert up to 16 bytes from utf8 to utf32 using a mask indicating the
// end of the code points. Only the least significant 12 bits of the mask
// are accessed.
// It returns how many bytes were consumed (up to 16, usually 12).
|
|
size_t convert_masked_utf8_to_utf32(const char *input,
|
|
uint64_t utf8_end_of_code_point_mask,
|
|
char32_t *&utf32_out) {
|
|
// we use an approach where we try to process up to 12 input bytes.
|
|
// Why 12 input bytes and not 16? Because we are concerned with the size of
|
|
// the lookup tables. Also 12 is nicely divisible by two and three.
|
|
//
|
|
uint32_t *&utf32_output = reinterpret_cast<uint32_t *&>(utf32_out);
|
|
__m128i in = __lsx_vld(reinterpret_cast<const uint8_t *>(input), 0);
|
|
const uint16_t input_utf8_end_of_code_point_mask =
|
|
utf8_end_of_code_point_mask & 0xFFF;
|
|
//
|
|
// Optimization note: our main path below is load-latency dependent. Thus it
|
|
// is maybe beneficial to have fast paths that depend on branch prediction but
|
|
// have less latency. This results in more instructions but, potentially, also
|
|
// higher speeds.
|
|
//
|
|
// We first try a few fast paths.
|
|
if ((utf8_end_of_code_point_mask & 0xffff) == 0xffff) {
|
|
// We process in chunks of 16 bytes.
|
|
// use fast implementation in src/simdutf/arm64/simd.h
|
|
// Ideally the compiler can keep the tables in registers.
|
|
__m128i zero = __lsx_vldi(0);
|
|
__m128i in16low = __lsx_vilvl_b(zero, in);
|
|
__m128i in16high = __lsx_vilvh_b(zero, in);
|
|
__m128i in32_0 = __lsx_vilvl_h(zero, in16low);
|
|
__m128i in32_1 = __lsx_vilvh_h(zero, in16low);
|
|
__m128i in32_2 = __lsx_vilvl_h(zero, in16high);
|
|
__m128i in32_3 = __lsx_vilvh_h(zero, in16high);
|
|
|
|
__lsx_vst(in32_0, reinterpret_cast<uint32_t *>(utf32_output), 0);
|
|
__lsx_vst(in32_1, reinterpret_cast<uint32_t *>(utf32_output), 16);
|
|
__lsx_vst(in32_2, reinterpret_cast<uint32_t *>(utf32_output), 32);
|
|
__lsx_vst(in32_3, reinterpret_cast<uint32_t *>(utf32_output), 48);
|
|
|
|
utf32_output += 16; // We wrote 16 32-bit characters.
|
|
return 16; // We consumed 16 bytes.
|
|
}
|
|
__m128i zero = __lsx_vldi(0);
|
|
if (input_utf8_end_of_code_point_mask == 0x924) {
|
|
// We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte
|
|
// UTF-32 code units. Convert to UTF-16
|
|
__m128i composed_utf16 = convert_utf8_3_byte_to_utf16(in);
|
|
__m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16);
|
|
|
|
__lsx_vst(utf32_low, reinterpret_cast<uint32_t *>(utf32_output), 0);
|
|
utf32_output += 4; // We wrote 4 32-bit characters.
|
|
return 12; // We consumed 12 bytes.
|
|
}
|
|
// 2 byte sequences occur in short bursts in languages like Greek and Russian.
|
|
if (input_utf8_end_of_code_point_mask == 0xaaa) {
|
|
// We want to take 6 2-byte UTF-8 code units and turn them into 6 4-byte
|
|
// UTF-32 code units. Convert to UTF-16
|
|
__m128i composed_utf16 = convert_utf8_2_byte_to_utf16(in);
|
|
|
|
__m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16);
|
|
__m128i utf32_high = __lsx_vilvh_h(zero, composed_utf16);
|
|
|
|
__lsx_vst(utf32_low, reinterpret_cast<uint32_t *>(utf32_output), 0);
|
|
__lsx_vst(utf32_high, reinterpret_cast<uint32_t *>(utf32_output), 16);
|
|
utf32_output += 6;
|
|
return 12; // We consumed 12 bytes.
|
|
}
|
|
// Either no fast path or an unimportant fast path.
|
|
|
|
const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
|
|
[input_utf8_end_of_code_point_mask][0];
|
|
const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
|
|
[input_utf8_end_of_code_point_mask][1];
|
|
|
|
if (idx < 64) {
|
|
    // SIX (6) input code units
|
|
// Convert to UTF-16
|
|
__m128i composed_utf16 = convert_utf8_1_to_2_byte_to_utf16(in, idx);
|
|
__m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16);
|
|
__m128i utf32_high = __lsx_vilvh_h(zero, composed_utf16);
|
|
|
|
__lsx_vst(utf32_low, reinterpret_cast<uint32_t *>(utf32_output), 0);
|
|
__lsx_vst(utf32_high, reinterpret_cast<uint32_t *>(utf32_output), 16);
|
|
utf32_output += 6;
|
|
return consumed;
|
|
} else if (idx < 145) {
|
|
    // FOUR (4) input code units
|
|
// UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing.
|
|
__m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
|
|
simdutf::tables::utf8_to_utf16::shufutf8[idx]),
|
|
0);
|
|
// Shuffle
|
|
// 1 byte: 00000000 00000000 0ccccccc
|
|
// 2 byte: 00000000 110bbbbb 10cccccc
|
|
// 3 byte: 1110aaaa 10bbbbbb 10cccccc
|
|
sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
|
|
__m128i perm = __lsx_vshuf_b(zero, in, sh);
|
|
// Split
|
|
// 00000000 00000000 0ccccccc
|
|
__m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7F)); // 6 or 7 bits
|
|
// Note: unmasked
|
|
// xxxxxxxx aaaaxxxx xxxxxxxx
|
|
__m128i high =
|
|
__lsx_vsrli_w(__lsx_vand_v(perm, __lsx_vldi(0xf)), 4); // 4 bits
|
|
// Use 16 bit bic instead of and.
|
|
// The top bits will be corrected later in the bsl
|
|
// 00000000 10bbbbbb 00000000
|
|
__m128i middle =
|
|
__lsx_vand_v(perm, lsx_splat_u32(0x0000FF00)); // 5 or 6 bits
|
|
// Combine low and middle with shift right accumulate
|
|
// 00000000 00xxbbbb bbcccccc
|
|
__m128i lowmid = __lsx_vor_v(ascii, __lsx_vsrli_w(middle, 2));
|
|
// Insert top 4 bits from high byte with bitwise select
|
|
// 00000000 aaaabbbb bbcccccc
|
|
__m128i composed = __lsx_vbitsel_v(lowmid, high, lsx_splat_u32(0x0000F000));
|
|
__lsx_vst(composed, utf32_output, 0);
|
|
utf32_output += 4; // We wrote 4 32-bit characters.
|
|
return consumed;
|
|
} else if (idx < 209) {
|
|
    // THREE (3) input code units
|
|
if (input_utf8_end_of_code_point_mask == 0x888) {
|
|
// We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte
|
|
// UTF-32 code units. This uses the same method as the fixed 3 byte
|
|
// version, reversing and shift left insert. However, there is no need for
|
|
// a shuffle mask now, just rev16 and rev32.
|
|
//
|
|
// This version does not use the LUT, but 4 byte sequences are less common
|
|
// and the overhead of the extra memory access is less important than the
|
|
// early branch overhead in shorter sequences, so it comes last.
|
|
|
|
// Swap pairs of bytes
|
|
// 10dddddd|10cccccc|10bbbbbb|11110aaa
|
|
// 10cccccc 10dddddd|11110aaa 10bbbbbb
|
|
__m128i swap = lsx_swap_bytes(in);
|
|
// Shift left and insert
|
|
// xxxxcccc ccdddddd|xxxxxxxa aabbbbbb
|
|
__m128i merge1 = __lsx_vbitsel_v(__lsx_vsrli_h(swap, 2), swap,
|
|
__lsx_vrepli_h(0x3f /*0x003F*/));
|
|
// Shift insert again
|
|
// xxxxxxxx xxxaaabb bbbbcccc ccdddddd
|
|
__m128i merge2 =
|
|
__lsx_vbitsel_v(__lsx_vslli_w(merge1, 12), /* merge1 << 12 */
|
|
__lsx_vsrli_w(merge1, 16), /* merge1 >> 16 */
|
|
lsx_splat_u32(0x00000FFF));
|
|
// Clear the garbage
|
|
// 00000000 000aaabb bbbbcccc ccdddddd
|
|
__m128i composed = __lsx_vand_v(merge2, lsx_splat_u32(0x1FFFFF));
|
|
// Store
|
|
__lsx_vst(composed, utf32_output, 0);
|
|
utf32_output += 3; // We wrote 3 32-bit characters.
|
|
return 12; // We consumed 12 bytes.
|
|
}
|
|
// Unlike UTF-16, doing a fast codepath doesn't have nearly as much benefit
|
|
// due to surrogates no longer being involved.
|
|
__m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
|
|
simdutf::tables::utf8_to_utf16::shufutf8[idx]),
|
|
0);
|
|
// 1 byte: 00000000 00000000 00000000 0ddddddd
|
|
// 2 byte: 00000000 00000000 110ccccc 10dddddd
|
|
// 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
|
|
// 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
|
|
sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
|
|
__m128i perm = __lsx_vshuf_b(zero, in, sh);
|
|
|
|
// Ascii
|
|
__m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7F));
|
|
__m128i middle = __lsx_vand_v(perm, lsx_splat_u32(0x00003f00));
|
|
// 00000000 00000000 0000cccc ccdddddd
|
|
__m128i cd = __lsx_vor_v(__lsx_vsrli_w(middle, 2), ascii);
|
|
|
|
__m128i correction = __lsx_vand_v(perm, lsx_splat_u32(0x00400000));
|
|
__m128i corrected = __lsx_vadd_b(perm, __lsx_vsrli_w(correction, 1));
|
|
// Insert twice
|
|
// 00000000 000aaabb bbbbxxxx xxxxxxxx
|
|
__m128i corrected_srli2 =
|
|
__lsx_vsrli_w(__lsx_vand_v(corrected, __lsx_vrepli_b(0x7)), 2);
|
|
__m128i ab =
|
|
__lsx_vbitsel_v(corrected_srli2, corrected, __lsx_vrepli_h(0x3f));
|
|
ab = __lsx_vsrli_w(ab, 4);
|
|
// 00000000 000aaabb bbbbcccc ccdddddd
|
|
__m128i composed = __lsx_vbitsel_v(ab, cd, lsx_splat_u32(0x00000FFF));
|
|
// Store
|
|
__lsx_vst(composed, utf32_output, 0);
|
|
utf32_output += 3; // We wrote 3 32-bit characters.
|
|
return consumed;
|
|
} else {
|
|
// here we know that there is an error but we do not handle errors
|
|
return 12;
|
|
}
|
|
}
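
// For reference, the utf8_end_of_code_point_mask consumed above has one bit
// set at the last byte of each code point. A scalar sketch of how such a
// mask could be computed for a 12-byte window (illustrative only; the real
// mask comes from the SIMD UTF-8 classifier, and this sketch assumes p[12]
// is readable):
//
//   uint16_t end_of_code_point_mask_sketch(const uint8_t *p) {
//     uint16_t m = 0;
//     for (int i = 0; i < 12; i++) {
//       if ((p[i + 1] & 0xC0) != 0x80) { // next byte is not a continuation,
//         m |= uint16_t(1) << i;         // so byte i ends a code point
//       }
//     }
//     return m; // e.g. 0x924 for four 3-byte sequences, 0xFFF for ASCII
//   }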
|
|
/* end file src/lasx/lasx_convert_utf8_to_utf32.cpp */
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
|
|
/* begin file src/lasx/lasx_convert_utf8_to_latin1.cpp */
|
|
size_t convert_masked_utf8_to_latin1(const char *input,
|
|
uint64_t utf8_end_of_code_point_mask,
|
|
char *&latin1_output) {
|
|
// we use an approach where we try to process up to 12 input bytes.
|
|
// Why 12 input bytes and not 16? Because we are concerned with the size of
|
|
// the lookup tables. Also 12 is nicely divisible by two and three.
|
|
//
|
|
__m128i in = __lsx_vld(reinterpret_cast<const uint8_t *>(input), 0);
|
|
|
|
const uint16_t input_utf8_end_of_code_point_mask =
|
|
utf8_end_of_code_point_mask & 0xfff;
|
|
// Optimization note: our main path below is load-latency dependent. Thus it
|
|
// is maybe beneficial to have fast paths that depend on branch prediction but
|
|
// have less latency. This results in more instructions but, potentially, also
|
|
// higher speeds.
|
|
|
|
// We first try a few fast paths.
|
|
  // The obvious first test is ASCII, which actually consumes the full 16
  // bytes.
|
|
if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xFFFF) {
|
|
// We process in chunks of 16 bytes
|
|
__lsx_vst(in, reinterpret_cast<uint8_t *>(latin1_output), 0);
|
|
    latin1_output += 16; // We wrote 16 8-bit characters.
|
|
return 16; // We consumed 16 bytes.
|
|
}
|
|
/// We do not have a fast path available, or the fast path is unimportant, so
|
|
/// we fallback.
|
|
const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
|
|
[input_utf8_end_of_code_point_mask][0];
|
|
|
|
const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
|
|
[input_utf8_end_of_code_point_mask][1];
|
|
// this indicates an invalid input:
|
|
if (idx >= 64) {
|
|
return consumed;
|
|
}
|
|
  // Here we should have (idx < 64); if not, there is a bug in the validation
  // or elsewhere. This is a relatively easy scenario: we process SIX (6)
  // input code units, i.e. up to 6 1-2 byte UTF-8 characters. The max length
  // in bytes of six code units spanning between 1 and 2 bytes each is 12
  // bytes.
|
|
__m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
|
|
simdutf::tables::utf8_to_utf16::shufutf8[idx]),
|
|
0);
|
|
// Shuffle
|
|
// 1 byte: 00000000 0bbbbbbb
|
|
// 2 byte: 110aaaaa 10bbbbbb
|
|
sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
|
|
__m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, sh);
|
|
// ascii mask
|
|
// 1 byte: 11111111 11111111
|
|
// 2 byte: 00000000 00000000
|
|
__m128i ascii_mask = __lsx_vslt_bu(perm, __lsx_vldi(0x80));
|
|
// utf8 mask
|
|
// 1 byte: 00000000 00000000
|
|
// 2 byte: 00111111 00111111
|
|
__m128i utf8_mask = __lsx_vand_v(__lsx_vsle_bu(__lsx_vldi(0x80), perm),
|
|
__lsx_vldi(0b00111111));
|
|
// mask
|
|
// 1 byte: 11111111 11111111
|
|
// 2 byte: 00111111 00111111
|
|
__m128i mask = __lsx_vor_v(utf8_mask, ascii_mask);
|
|
|
|
__m128i composed = __lsx_vbitsel_v(__lsx_vsrli_h(perm, 2), perm, mask);
|
|
  // writing 16 bytes even though we only care about the first 6 bytes.
|
|
__m128i latin1_packed = __lsx_vpickev_b(__lsx_vldi(0), composed);
|
|
|
|
__lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
|
|
latin1_output += 6; // We wrote 6 bytes.
|
|
return consumed;
|
|
}
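
// Scalar equivalent of one 1- or 2-byte unit handled above (illustrative
// helper; Latin-1 code points never need more than two UTF-8 bytes):
inline uint8_t lasx_utf8_to_latin1_unit_sketch(const uint8_t *p, int len) {
  // len is 1 for ASCII, 2 for a 110aaaaa 10bbbbbb sequence
  return len == 1 ? p[0] : uint8_t(((p[0] & 0x1F) << 6) | (p[1] & 0x3F));
}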
|
|
/* end file src/lasx/lasx_convert_utf8_to_latin1.cpp */
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
|
|
|
|
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
|
|
/* begin file src/lasx/lasx_convert_utf16_to_latin1.cpp */
|
|
template <endianness big_endian>
|
|
std::pair<const char16_t *, char *>
|
|
lasx_convert_utf16_to_latin1(const char16_t *buf, size_t len,
|
|
char *latin1_output) {
|
|
const char16_t *end = buf + len;
|
|
while (end - buf >= 16) {
|
|
__m128i in = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 0);
|
|
__m128i in1 = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 16);
|
|
if (!match_system(big_endian)) {
|
|
in = lsx_swap_bytes(in);
|
|
in1 = lsx_swap_bytes(in1);
|
|
}
|
|
if (__lsx_bz_v(__lsx_vpickod_b(in1, in))) {
|
|
// 1. pack the bytes
|
|
__m128i latin1_packed = __lsx_vpickev_b(in1, in);
|
|
      // 2. store (16 bytes)
|
|
__lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
|
|
// 3. adjust pointers
|
|
buf += 16;
|
|
latin1_output += 16;
|
|
} else {
|
|
return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output));
|
|
}
|
|
} // while
|
|
return std::make_pair(buf, latin1_output);
|
|
}
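
// The vector test above (__lsx_vpickod_b gathering the odd, i.e. high, bytes
// of each 16-bit unit) is the SIMD form of this scalar predicate (helper
// name illustrative):
inline bool lasx_fits_latin1_sketch(uint16_t word) {
  return (word & 0xFF00) == 0;
}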
|
|
|
|
template <endianness big_endian>
|
|
std::pair<result, char *>
|
|
lasx_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
|
|
char *latin1_output) {
|
|
const char16_t *start = buf;
|
|
const char16_t *end = buf + len;
|
|
while (end - buf >= 16) {
|
|
__m128i in = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 0);
|
|
__m128i in1 = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 16);
|
|
if (!match_system(big_endian)) {
|
|
in = lsx_swap_bytes(in);
|
|
in1 = lsx_swap_bytes(in1);
|
|
}
|
|
if (__lsx_bz_v(__lsx_vpickod_b(in1, in))) {
|
|
// 1. pack the bytes
|
|
__m128i latin1_packed = __lsx_vpickev_b(in1, in);
|
|
      // 2. store (16 bytes)
|
|
__lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
|
|
// 3. adjust pointers
|
|
buf += 16;
|
|
latin1_output += 16;
|
|
} else {
|
|
// Let us do a scalar fallback.
|
|
for (int k = 0; k < 16; k++) {
|
|
uint16_t word =
|
|
!match_system(big_endian) ? scalar::u16_swap_bytes(buf[k]) : buf[k];
|
|
if (word <= 0xff) {
|
|
*latin1_output++ = char(word);
|
|
} else {
|
|
return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
|
|
latin1_output);
|
|
}
|
|
}
|
|
}
|
|
} // while
|
|
return std::make_pair(result(error_code::SUCCESS, buf - start),
|
|
latin1_output);
|
|
}
|
|
/* end file src/lasx/lasx_convert_utf16_to_latin1.cpp */
|
|
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
|
|
/* begin file src/lasx/lasx_convert_utf16_to_utf8.cpp */
|
|
/*
|
|
  The vectorized algorithm works on a single LASX register, i.e., it
  loads sixteen 16-bit code units.
|
|
|
|
We consider three cases:
|
|
1. an input register contains no surrogates and each value
|
|
is in range 0x0000 .. 0x07ff.
|
|
  2. an input register contains no surrogates and each value
     is in range 0x0000 .. 0xffff.
|
|
3. an input register contains surrogates --- i.e. codepoints
|
|
can have 16 or 32 bits.
|
|
|
|
Ad 1.
|
|
|
|
When values are less than 0x0800, it means that a 16-bit code unit
|
|
can be converted into: 1) single UTF8 byte (when it's an ASCII
|
|
char) or 2) two UTF8 bytes.
|
|
|
|
For this case we do only some shuffle to obtain these 2-byte
|
|
codes and finally compress the whole LASX register with a single
|
|
shuffle.
|
|
|
|
  We need a 256-entry lookup table to get a compression pattern
|
|
and the number of output bytes in the compressed vector register.
|
|
Each entry occupies 17 bytes.
|
|
|
|
Ad 2.
|
|
|
|
When values fit in 16-bit code units, but are above 0x07ff, then
|
|
a single word may produce one, two or three UTF8 bytes.
|
|
|
|
We prepare data for all these three cases in two registers.
|
|
The first register contains lower two UTF8 bytes (used in all
|
|
cases), while the second one contains just the third byte for
|
|
the three-UTF8-bytes case.
|
|
|
|
Finally these two registers are interleaved forming eight-element
|
|
array of 32-bit values. The array spans two LASX registers.
|
|
The bytes from the registers are compressed using two shuffles.
|
|
|
|
  We need a 256-entry lookup table to get a compression pattern
|
|
and the number of output bytes in the compressed vector register.
|
|
Each entry occupies 17 bytes.
|
|
|
|
|
|
To summarize:
|
|
- We need two 256-entry tables that have 8704 bytes in total.
|
|
*/
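
// The pack tables referenced below store, for each of 256 possible masks,
// one count byte followed by a 16-byte shuffle pattern (17 bytes per entry,
// hence the 8704-byte total mentioned above). A hedged sketch of the row
// layout; the struct and field names are illustrative, not simdutf's:
struct lasx_pack_row_sketch {
  uint8_t output_bytes; // row[0]: number of bytes kept after the shuffle
  uint8_t shuffle[16];  // pattern fed to __lsx_vshuf_b (loaded at offset 1)
};
static_assert(sizeof(lasx_pack_row_sketch) == 17, "one 17-byte row per mask");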
|
|
/*
|
|
Returns a pair: the first unprocessed byte from buf and utf8_output
|
|
A scalar routine should carry on the conversion of the tail.
|
|
*/
|
|
|
|
template <endianness big_endian>
|
|
std::pair<const char16_t *, char *>
|
|
lasx_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) {
|
|
uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
|
|
const char16_t *end = buf + len;
|
|
|
|
const size_t safety_margin =
|
|
12; // to avoid overruns, see issue
|
|
// https://github.com/simdutf/simdutf/issues/92
|
|
|
|
__m256i v_07ff = __lasx_xvreplgr2vr_h(uint16_t(0x7ff));
|
|
__m256i zero = __lasx_xvldi(0);
|
|
__m128i zero_128 = __lsx_vldi(0);
|
|
while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
|
|
__m256i in = __lasx_xvld(reinterpret_cast<const uint16_t *>(buf), 0);
|
|
if (!match_system(big_endian)) {
|
|
in = lasx_swap_bytes(in);
|
|
}
|
|
if (__lasx_xbnz_h(__lasx_xvslt_hu(
|
|
in, __lasx_xvrepli_h(0x7F)))) { // ASCII fast path!!!!
|
|
// 1. pack the bytes
|
|
__m256i utf8_packed =
|
|
__lasx_xvpermi_d(__lasx_xvpickev_b(in, in), 0b00001000);
|
|
// 2. store (16 bytes)
|
|
__lsx_vst(lasx_extracti128_lo(utf8_packed), utf8_output, 0);
|
|
// 3. adjust pointers
|
|
buf += 16;
|
|
utf8_output += 16;
|
|
continue; // we are done for this round!
|
|
}
|
|
|
|
if (__lasx_xbz_v(__lasx_xvslt_hu(v_07ff, in))) {
|
|
// 1. prepare 2-byte values
|
|
// input 16-bit word : [0000|0aaa|aabb|bbbb] x 16
|
|
// expected output : [110a|aaaa|10bb|bbbb] x 16
|
|
// t0 = [000a|aaaa|bbbb|bb00]
|
|
__m256i t0 = __lasx_xvslli_h(in, 2);
|
|
// t1 = [000a|aaaa|0000|0000]
|
|
__m256i t1 = __lasx_xvand_v(t0, lasx_splat_u16(0x1f00));
|
|
// t2 = [0000|0000|00bb|bbbb]
|
|
__m256i t2 = __lasx_xvand_v(in, __lasx_xvrepli_h(0x3f));
|
|
// t3 = [000a|aaaa|00bb|bbbb]
|
|
__m256i t3 = __lasx_xvor_v(t1, t2);
|
|
// t4 = [110a|aaaa|10bb|bbbb]
|
|
__m256i v_c080 = __lasx_xvreplgr2vr_h(uint16_t(0xc080));
|
|
__m256i t4 = __lasx_xvor_v(t3, v_c080);
|
|
// 2. merge ASCII and 2-byte codewords
|
|
__m256i one_byte_bytemask =
|
|
__lasx_xvsle_hu(in, __lasx_xvrepli_h(0x7F /*0x007F*/));
|
|
__m256i utf8_unpacked = __lasx_xvbitsel_v(t4, in, one_byte_bytemask);
|
|
// 3. prepare bitmask for 8-bit lookup
|
|
__m256i mask = __lasx_xvmskltz_h(one_byte_bytemask);
|
|
uint32_t m1 = __lasx_xvpickve2gr_wu(mask, 0);
|
|
uint32_t m2 = __lasx_xvpickve2gr_wu(mask, 4);
|
|
// 4. pack the bytes
|
|
const uint8_t *row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
|
|
[lasx_1_2_utf8_bytes_mask[m1]][0];
|
|
__m128i shuffle1 = __lsx_vld(row1, 1);
|
|
__m128i utf8_packed1 =
|
|
__lsx_vshuf_b(zero_128, lasx_extracti128_lo(utf8_unpacked), shuffle1);
|
|
|
|
const uint8_t *row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
|
|
[lasx_1_2_utf8_bytes_mask[m2]][0];
|
|
__m128i shuffle2 = __lsx_vld(row2, 1);
|
|
__m128i utf8_packed2 =
|
|
__lsx_vshuf_b(zero_128, lasx_extracti128_hi(utf8_unpacked), shuffle2);
|
|
// 5. store bytes
|
|
__lsx_vst(utf8_packed1, utf8_output, 0);
|
|
utf8_output += row1[0];
|
|
|
|
__lsx_vst(utf8_packed2, utf8_output, 0);
|
|
utf8_output += row2[0];
|
|
|
|
buf += 16;
|
|
continue;
|
|
}
|
|
__m256i surrogates_bytemask = __lasx_xvseq_h(
|
|
__lasx_xvand_v(in, lasx_splat_u16(0xf800)), lasx_splat_u16(0xd800));
|
|
// It might seem like checking for surrogates_bitmask == 0xc000 could help.
|
|
// However, it is likely an uncommon occurrence.
|
|
if (__lasx_xbz_v(surrogates_bytemask)) {
|
|
// case: code units from register produce either 1, 2 or 3 UTF-8 bytes
|
|
/* In this branch we handle three cases:
|
|
1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
|
|
         single UTF-8 byte
|
|
2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] -
|
|
two UTF-8 bytes
|
|
3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
|
|
three UTF-8 bytes
|
|
|
|
We expand the input word (16-bit) into two code units (32-bit), thus
|
|
we have room for four bytes. However, we need five distinct bit
|
|
layouts. Note that the last byte in cases #2 and #3 is the same.
|
|
|
|
We precompute byte 1 for case #1 and the common byte for cases #2 & #3
|
|
in register t2.
|
|
|
|
We precompute byte 1 for case #3 and -- **conditionally** --
|
|
precompute either byte 1 for case #2 or byte 2 for case #3. Note that
|
|
they differ by exactly one bit.
|
|
|
|
Finally from these two code units we build proper UTF-8 sequence,
|
|
taking into account the case (i.e, the number of bytes to write).
|
|
*/
|
|
/**
|
|
* Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
|
|
* t2 => [0ccc|cccc] [10cc|cccc]
|
|
* s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
|
|
*/
|
|
// [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
|
|
__m256i t0 = __lasx_xvpickev_b(in, in);
|
|
t0 = __lasx_xvilvl_b(t0, t0);
|
|
|
|
// [bbcc|cccc|bbcc|cccc] => [00cc|cccc|00cc|cccc]
|
|
__m256i v_3f7f = __lasx_xvreplgr2vr_h(uint16_t(0x3F7F));
|
|
__m256i t1 = __lasx_xvand_v(t0, v_3f7f);
|
|
// [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
|
|
__m256i t2 = __lasx_xvor_v(t1, lasx_splat_u16(0x8000));
|
|
|
|
// s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
|
|
__m256i s0 = __lasx_xvsrli_h(in, 12);
|
|
// s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
|
|
__m256i s1 = __lasx_xvslli_h(in, 2);
|
|
// s1: [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000]
|
|
s1 = __lasx_xvand_v(s1, lasx_splat_u16(0x3f00));
|
|
|
|
// [00bb|bbbb|0000|aaaa]
|
|
__m256i s2 = __lasx_xvor_v(s0, s1);
|
|
// s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
|
|
__m256i v_c0e0 = __lasx_xvreplgr2vr_h(uint16_t(0xC0E0));
|
|
__m256i s3 = __lasx_xvor_v(s2, v_c0e0);
|
|
__m256i one_or_two_bytes_bytemask = __lasx_xvsle_hu(in, v_07ff);
|
|
__m256i m0 =
|
|
__lasx_xvandn_v(one_or_two_bytes_bytemask, lasx_splat_u16(0x4000));
|
|
__m256i s4 = __lasx_xvxor_v(s3, m0);
|
|
|
|
// 4. expand code units 16-bit => 32-bit
|
|
__m256i out0 = __lasx_xvilvl_h(s4, t2);
|
|
__m256i out1 = __lasx_xvilvh_h(s4, t2);
|
|
|
|
// 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
|
|
__m256i one_byte_bytemask = __lasx_xvsle_hu(in, __lasx_xvrepli_h(0x7F));
|
|
__m256i one_byte_bytemask_low =
|
|
__lasx_xvilvl_h(one_byte_bytemask, one_byte_bytemask);
|
|
__m256i one_byte_bytemask_high =
|
|
__lasx_xvilvh_h(one_byte_bytemask, one_byte_bytemask);
|
|
|
|
__m256i one_or_two_bytes_bytemask_low =
|
|
__lasx_xvilvl_h(one_or_two_bytes_bytemask, zero);
|
|
__m256i one_or_two_bytes_bytemask_high =
|
|
__lasx_xvilvh_h(one_or_two_bytes_bytemask, zero);
|
|
|
|
__m256i mask0 = __lasx_xvmskltz_h(
|
|
__lasx_xvor_v(one_or_two_bytes_bytemask_low, one_byte_bytemask_low));
|
|
__m256i mask1 = __lasx_xvmskltz_h(__lasx_xvor_v(
|
|
one_or_two_bytes_bytemask_high, one_byte_bytemask_high));
|
|
|
|
uint32_t mask = __lasx_xvpickve2gr_wu(mask0, 0);
|
|
const uint8_t *row0 =
|
|
&simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
|
|
[0];
|
|
__m128i shuffle0 = __lsx_vld(row0, 1);
|
|
__m128i utf8_0 =
|
|
__lsx_vshuf_b(zero_128, lasx_extracti128_lo(out0), shuffle0);
|
|
__lsx_vst(utf8_0, utf8_output, 0);
|
|
utf8_output += row0[0];
|
|
|
|
mask = __lasx_xvpickve2gr_wu(mask1, 0);
|
|
const uint8_t *row1 =
|
|
&simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
|
|
[0];
|
|
__m128i shuffle1 = __lsx_vld(row1, 1);
|
|
__m128i utf8_1 =
|
|
__lsx_vshuf_b(zero_128, lasx_extracti128_lo(out1), shuffle1);
|
|
__lsx_vst(utf8_1, utf8_output, 0);
|
|
utf8_output += row1[0];
|
|
|
|
mask = __lasx_xvpickve2gr_wu(mask0, 4);
|
|
const uint8_t *row2 =
|
|
&simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
|
|
[0];
|
|
__m128i shuffle2 = __lsx_vld(row2, 1);
|
|
__m128i utf8_2 =
|
|
__lsx_vshuf_b(zero_128, lasx_extracti128_hi(out0), shuffle2);
|
|
__lsx_vst(utf8_2, utf8_output, 0);
|
|
utf8_output += row2[0];
|
|
|
|
mask = __lasx_xvpickve2gr_wu(mask1, 4);
|
|
const uint8_t *row3 =
|
|
&simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
|
|
[0];
|
|
__m128i shuffle3 = __lsx_vld(row3, 1);
|
|
__m128i utf8_3 =
|
|
__lsx_vshuf_b(zero_128, lasx_extracti128_hi(out1), shuffle3);
|
|
__lsx_vst(utf8_3, utf8_output, 0);
|
|
utf8_output += row3[0];
|
|
|
|
buf += 16;
|
|
// surrogate pair(s) in a register
|
|
} else {
|
|
// Let us do a scalar fallback.
|
|
// It may seem wasteful to use scalar code, but being efficient with SIMD
|
|
// in the presence of surrogate pairs may require non-trivial tables.
|
|
size_t forward = 15;
|
|
size_t k = 0;
|
|
if (size_t(end - buf) < forward + 1) {
|
|
forward = size_t(end - buf - 1);
|
|
}
|
|
for (; k < forward; k++) {
|
|
uint16_t word =
|
|
!match_system(big_endian) ? scalar::u16_swap_bytes(buf[k]) : buf[k];
|
|
if ((word & 0xFF80) == 0) {
|
|
*utf8_output++ = char(word);
|
|
} else if ((word & 0xF800) == 0) {
|
|
*utf8_output++ = char((word >> 6) | 0b11000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else if ((word & 0xF800) != 0xD800) {
|
|
*utf8_output++ = char((word >> 12) | 0b11100000);
|
|
*utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else {
|
|
// must be a surrogate pair
|
|
uint16_t diff = uint16_t(word - 0xD800);
|
|
uint16_t next_word = !match_system(big_endian)
|
|
? scalar::u16_swap_bytes(buf[k + 1])
|
|
: buf[k + 1];
|
|
k++;
|
|
uint16_t diff2 = uint16_t(next_word - 0xDC00);
|
|
if ((diff | diff2) > 0x3FF) {
|
|
return std::make_pair(nullptr,
|
|
reinterpret_cast<char *>(utf8_output));
|
|
}
|
|
uint32_t value = (diff << 10) + diff2 + 0x10000;
|
|
*utf8_output++ = char((value >> 18) | 0b11110000);
|
|
*utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((value & 0b111111) | 0b10000000);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
} // while
|
|
return std::make_pair(buf, reinterpret_cast<char *>(utf8_output));
|
|
}
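
// The scalar fallback above validates a surrogate pair with a single branch:
// both (hi - 0xD800) and (lo - 0xDC00) must fit in 10 bits, so
// ((diff | diff2) > 0x3FF) rejects lone, swapped or out-of-range surrogates.
// Equivalent predicate (helper name illustrative):
inline bool lasx_valid_surrogate_pair_sketch(uint16_t hi, uint16_t lo) {
  return uint16_t(hi - 0xD800) <= 0x3FF && uint16_t(lo - 0xDC00) <= 0x3FF;
}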
|
|
|
|
/*
|
|
Returns a pair: a result struct and utf8_output.
|
|
If there is an error, the count field of the result is the position of the
|
|
error. Otherwise, it is the position of the first unprocessed byte in buf
|
|
(even if finished). A scalar routine should carry on the conversion of the
|
|
tail if needed.
|
|
*/
|
|
template <endianness big_endian>
|
|
std::pair<result, char *>
|
|
lasx_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len,
|
|
char *utf8_out) {
|
|
uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
|
|
const char16_t *start = buf;
|
|
const char16_t *end = buf + len;
|
|
|
|
const size_t safety_margin =
|
|
12; // to avoid overruns, see issue
|
|
// https://github.com/simdutf/simdutf/issues/92
|
|
|
|
__m256i v_07ff = __lasx_xvreplgr2vr_h(uint16_t(0x7ff));
|
|
__m256i zero = __lasx_xvldi(0);
|
|
__m128i zero_128 = __lsx_vldi(0);
|
|
while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
|
|
__m256i in = __lasx_xvld(reinterpret_cast<const uint16_t *>(buf), 0);
|
|
if (!match_system(big_endian)) {
|
|
in = lasx_swap_bytes(in);
|
|
}
|
|
if (__lasx_xbnz_h(__lasx_xvslt_hu(
|
|
in, __lasx_xvrepli_h(0x7F)))) { // ASCII fast path!!!!
|
|
// 1. pack the bytes
|
|
__m256i utf8_packed =
|
|
__lasx_xvpermi_d(__lasx_xvpickev_b(in, in), 0b00001000);
|
|
// 2. store (16 bytes)
|
|
__lsx_vst(lasx_extracti128_lo(utf8_packed), utf8_output, 0);
|
|
// 3. adjust pointers
|
|
buf += 16;
|
|
utf8_output += 16;
|
|
continue; // we are done for this round!
|
|
}
|
|
|
|
if (__lasx_xbz_v(__lasx_xvslt_hu(v_07ff, in))) {
|
|
// 1. prepare 2-byte values
|
|
// input 16-bit word : [0000|0aaa|aabb|bbbb] x 16
|
|
// expected output : [110a|aaaa|10bb|bbbb] x 16
|
|
// t0 = [000a|aaaa|bbbb|bb00]
|
|
__m256i t0 = __lasx_xvslli_h(in, 2);
|
|
// t1 = [000a|aaaa|0000|0000]
|
|
__m256i t1 = __lasx_xvand_v(t0, lasx_splat_u16(0x1f00));
|
|
// t2 = [0000|0000|00bb|bbbb]
|
|
__m256i t2 = __lasx_xvand_v(in, __lasx_xvrepli_h(0x3f));
|
|
// t3 = [000a|aaaa|00bb|bbbb]
|
|
__m256i t3 = __lasx_xvor_v(t1, t2);
|
|
// t4 = [110a|aaaa|10bb|bbbb]
|
|
__m256i v_c080 = __lasx_xvreplgr2vr_h(uint16_t(0xc080));
|
|
__m256i t4 = __lasx_xvor_v(t3, v_c080);
|
|
// 2. merge ASCII and 2-byte codewords
|
|
__m256i one_byte_bytemask =
|
|
__lasx_xvsle_hu(in, __lasx_xvrepli_h(0x7F /*0x007F*/));
|
|
__m256i utf8_unpacked = __lasx_xvbitsel_v(t4, in, one_byte_bytemask);
|
|
// 3. prepare bitmask for 8-bit lookup
|
|
__m256i mask = __lasx_xvmskltz_h(one_byte_bytemask);
|
|
uint32_t m1 = __lasx_xvpickve2gr_wu(mask, 0);
|
|
uint32_t m2 = __lasx_xvpickve2gr_wu(mask, 4);
|
|
// 4. pack the bytes
|
|
const uint8_t *row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
|
|
[lasx_1_2_utf8_bytes_mask[m1]][0];
|
|
__m128i shuffle1 = __lsx_vld(row1, 1);
|
|
__m128i utf8_packed1 =
|
|
__lsx_vshuf_b(zero_128, lasx_extracti128_lo(utf8_unpacked), shuffle1);
|
|
|
|
const uint8_t *row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
|
|
[lasx_1_2_utf8_bytes_mask[m2]][0];
|
|
__m128i shuffle2 = __lsx_vld(row2, 1);
|
|
__m128i utf8_packed2 =
|
|
__lsx_vshuf_b(zero_128, lasx_extracti128_hi(utf8_unpacked), shuffle2);
|
|
// 5. store bytes
|
|
__lsx_vst(utf8_packed1, utf8_output, 0);
|
|
utf8_output += row1[0];
|
|
|
|
__lsx_vst(utf8_packed2, utf8_output, 0);
|
|
utf8_output += row2[0];
|
|
|
|
buf += 16;
|
|
continue;
|
|
}
|
|
__m256i surrogates_bytemask = __lasx_xvseq_h(
|
|
__lasx_xvand_v(in, lasx_splat_u16(0xf800)), lasx_splat_u16(0xd800));
|
|
// It might seem like checking for surrogates_bitmask == 0xc000 could help.
|
|
// However, it is likely an uncommon occurrence.
|
|
if (__lasx_xbz_v(surrogates_bytemask)) {
|
|
// case: code units from register produce either 1, 2 or 3 UTF-8 bytes
|
|
/* In this branch we handle three cases:
|
|
1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
|
|
         single UTF-8 byte
|
|
2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] -
|
|
two UTF-8 bytes
|
|
3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
|
|
three UTF-8 bytes
|
|
|
|
We expand the input word (16-bit) into two code units (32-bit), thus
|
|
we have room for four bytes. However, we need five distinct bit
|
|
layouts. Note that the last byte in cases #2 and #3 is the same.
|
|
|
|
We precompute byte 1 for case #1 and the common byte for cases #2 & #3
|
|
in register t2.
|
|
|
|
We precompute byte 1 for case #3 and -- **conditionally** --
|
|
precompute either byte 1 for case #2 or byte 2 for case #3. Note that
|
|
they differ by exactly one bit.
|
|
|
|
Finally from these two code units we build proper UTF-8 sequence,
|
|
taking into account the case (i.e, the number of bytes to write).
|
|
*/
|
|
/**
|
|
* Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
|
|
* t2 => [0ccc|cccc] [10cc|cccc]
|
|
* s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
|
|
*/
|
|
// [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
|
|
__m256i t0 = __lasx_xvpickev_b(in, in);
|
|
t0 = __lasx_xvilvl_b(t0, t0);
|
|
|
|
// [bbcc|cccc|bbcc|cccc] => [00cc|cccc|00cc|cccc]
|
|
__m256i v_3f7f = __lasx_xvreplgr2vr_h(uint16_t(0x3F7F));
|
|
__m256i t1 = __lasx_xvand_v(t0, v_3f7f);
|
|
// [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
|
|
__m256i t2 = __lasx_xvor_v(t1, lasx_splat_u16(0x8000));
|
|
|
|
// s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
|
|
__m256i s0 = __lasx_xvsrli_h(in, 12);
|
|
// s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
|
|
__m256i s1 = __lasx_xvslli_h(in, 2);
|
|
// s1: [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000]
|
|
s1 = __lasx_xvand_v(s1, lasx_splat_u16(0x3f00));
|
|
|
|
// [00bb|bbbb|0000|aaaa]
|
|
__m256i s2 = __lasx_xvor_v(s0, s1);
|
|
// s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
|
|
__m256i v_c0e0 = __lasx_xvreplgr2vr_h(uint16_t(0xC0E0));
|
|
__m256i s3 = __lasx_xvor_v(s2, v_c0e0);
|
|
__m256i one_or_two_bytes_bytemask = __lasx_xvsle_hu(in, v_07ff);
|
|
__m256i m0 =
|
|
__lasx_xvandn_v(one_or_two_bytes_bytemask, lasx_splat_u16(0x4000));
|
|
__m256i s4 = __lasx_xvxor_v(s3, m0);
|
|
|
|
// 4. expand code units 16-bit => 32-bit
|
|
__m256i out0 = __lasx_xvilvl_h(s4, t2);
|
|
__m256i out1 = __lasx_xvilvh_h(s4, t2);
|
|
|
|
// 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
|
|
__m256i one_byte_bytemask = __lasx_xvsle_hu(in, __lasx_xvrepli_h(0x7F));
|
|
__m256i one_byte_bytemask_low =
|
|
__lasx_xvilvl_h(one_byte_bytemask, one_byte_bytemask);
|
|
__m256i one_byte_bytemask_high =
|
|
__lasx_xvilvh_h(one_byte_bytemask, one_byte_bytemask);
|
|
|
|
__m256i one_or_two_bytes_bytemask_low =
|
|
__lasx_xvilvl_h(one_or_two_bytes_bytemask, zero);
|
|
__m256i one_or_two_bytes_bytemask_high =
|
|
__lasx_xvilvh_h(one_or_two_bytes_bytemask, zero);
|
|
|
|
__m256i mask0 = __lasx_xvmskltz_h(
|
|
__lasx_xvor_v(one_or_two_bytes_bytemask_low, one_byte_bytemask_low));
|
|
__m256i mask1 = __lasx_xvmskltz_h(__lasx_xvor_v(
|
|
one_or_two_bytes_bytemask_high, one_byte_bytemask_high));
|
|
|
|
uint32_t mask = __lasx_xvpickve2gr_wu(mask0, 0);
|
|
const uint8_t *row0 =
|
|
&simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
|
|
[0];
      __m128i shuffle0 = __lsx_vld(row0, 1);
      __m128i utf8_0 =
          __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out0), shuffle0);
      __lsx_vst(utf8_0, utf8_output, 0);
      utf8_output += row0[0];

      mask = __lasx_xvpickve2gr_wu(mask1, 0);
      const uint8_t *row1 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
                                                                [0];
      __m128i shuffle1 = __lsx_vld(row1, 1);
      __m128i utf8_1 =
          __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out1), shuffle1);
      __lsx_vst(utf8_1, utf8_output, 0);
      utf8_output += row1[0];

      mask = __lasx_xvpickve2gr_wu(mask0, 4);
      const uint8_t *row2 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
                                                                [0];
      __m128i shuffle2 = __lsx_vld(row2, 1);
      __m128i utf8_2 =
          __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out0), shuffle2);
      __lsx_vst(utf8_2, utf8_output, 0);
      utf8_output += row2[0];

      mask = __lasx_xvpickve2gr_wu(mask1, 4);
      const uint8_t *row3 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
                                                                [0];
      __m128i shuffle3 = __lsx_vld(row3, 1);
      __m128i utf8_3 =
          __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out1), shuffle3);
      __lsx_vst(utf8_3, utf8_output, 0);
      utf8_output += row3[0];

      buf += 16;
      // surrogate pair(s) in a register
    } else {
      // Let us do a scalar fallback.
      // It may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint16_t word =
            !match_system(big_endian) ? scalar::u16_swap_bytes(buf[k]) : buf[k];
        if ((word & 0xFF80) == 0) {
          *utf8_output++ = char(word);
        } else if ((word & 0xF800) == 0) {
          *utf8_output++ = char((word >> 6) | 0b11000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else if ((word & 0xF800) != 0xD800) {
          *utf8_output++ = char((word >> 12) | 0b11100000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else {
          // must be a surrogate pair
          uint16_t diff = uint16_t(word - 0xD800);
          uint16_t next_word = !match_system(big_endian)
                                   ? scalar::u16_swap_bytes(buf[k + 1])
                                   : buf[k + 1];
          k++;
          uint16_t diff2 = uint16_t(next_word - 0xDC00);
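          // Each offset must fit in 10 bits, so a single OR-and-compare
          // verifies both that word is a high surrogate (0xD800..0xDBFF) and
          // that next_word is a low surrogate (0xDC00..0xDFFF).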
          if ((diff | diff2) > 0x3FF) {
            return std::make_pair(
                result(error_code::SURROGATE, buf - start + k - 1),
                reinterpret_cast<char *>(utf8_output));
          }
          uint32_t value = (diff << 10) + diff2 + 0x10000;
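          // Example: the pair 0xD83D 0xDE00 gives diff = 0x3D and
          // diff2 = 0x200, hence value = 0x1F600 (U+1F600), written below as
          // the four bytes F0 9F 98 80.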
          *utf8_output++ = char((value >> 18) | 0b11110000);
          *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
          *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((value & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while

  return std::make_pair(result(error_code::SUCCESS, buf - start),
                        reinterpret_cast<char *>(utf8_output));
}
/* end file src/lasx/lasx_convert_utf16_to_utf8.cpp */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
/* begin file src/lasx/lasx_convert_utf16_to_utf32.cpp */
template <endianness big_endian>
std::pair<const char16_t *, char32_t *>
lasx_convert_utf16_to_utf32(const char16_t *buf, size_t len,
                            char32_t *utf32_out) {
  uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
  const char16_t *end = buf + len;

  // Fall back to scalar code until the output address is 32-byte aligned,
  // since unaligned vector stores degrade performance.
  while (((uint64_t)utf32_output & 0x1f) && buf < end) {
    uint16_t word =
        !match_system(big_endian) ? scalar::u16_swap_bytes(buf[0]) : buf[0];
    if ((word & 0xF800) != 0xD800) {
      *utf32_output++ = char32_t(word);
      buf++;
    } else {
      if (buf + 1 >= end) {
        return std::make_pair(nullptr,
                              reinterpret_cast<char32_t *>(utf32_output));
      }
      // must be a surrogate pair
      uint16_t diff = uint16_t(word - 0xD800);
      uint16_t next_word =
          !match_system(big_endian) ? scalar::u16_swap_bytes(buf[1]) : buf[1];
      uint16_t diff2 = uint16_t(next_word - 0xDC00);
      if ((diff | diff2) > 0x3FF) {
        return std::make_pair(nullptr,
                              reinterpret_cast<char32_t *>(utf32_output));
      }
      uint32_t value = (diff << 10) + diff2 + 0x10000;
      *utf32_output++ = char32_t(value);
      buf += 2;
    }
  }

  __m256i v_f800 = lasx_splat_u16(0xf800);
  __m256i v_d800 = lasx_splat_u16(0xd800);
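  // (word & 0xF800) == 0xD800 holds exactly for code units in the surrogate
  // range 0xD800..0xDFFF; the two constants above apply the same test to a
  // whole vector at once.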

  while (end - buf >= 16) {
    __m256i in = __lasx_xvld(reinterpret_cast<const uint16_t *>(buf), 0);
    if (!match_system(big_endian)) {
      in = lasx_swap_bytes(in);
    }

    __m256i surrogates_bytemask =
        __lasx_xvseq_h(__lasx_xvand_v(in, v_f800), v_d800);
    // It might seem like checking for surrogates_bitmask == 0xc000 could help.
    // However, it is likely an uncommon occurrence.
    if (__lasx_xbz_v(surrogates_bytemask)) {
      // case: no surrogate pairs, extend all 16-bit code units to 32-bit code
      // units
      __m256i in_hi = __lasx_xvpermi_q(in, in, 0b00000001);
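      // xvpermi_q copies the upper 128-bit lane of `in` into the lower lane of
      // in_hi, so that vext2xv_wu_hu (which widens the low eight 16-bit units)
      // can also handle code units 8..15.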
      __lasx_xvst(__lasx_vext2xv_wu_hu(in), utf32_output, 0);
      __lasx_xvst(__lasx_vext2xv_wu_hu(in_hi), utf32_output, 32);
      utf32_output += 16;
      buf += 16;
      // surrogate pair(s) in a register
    } else {
      // Let us do a scalar fallback.
      // It may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint16_t word =
            !match_system(big_endian) ? scalar::u16_swap_bytes(buf[k]) : buf[k];
        if ((word & 0xF800) != 0xD800) {
          *utf32_output++ = char32_t(word);
        } else {
          // must be a surrogate pair
          uint16_t diff = uint16_t(word - 0xD800);
          uint16_t next_word = !match_system(big_endian)
                                   ? scalar::u16_swap_bytes(buf[k + 1])
                                   : buf[k + 1];
          k++;
          uint16_t diff2 = uint16_t(next_word - 0xDC00);
          if ((diff | diff2) > 0x3FF) {
            return std::make_pair(nullptr,
                                  reinterpret_cast<char32_t *>(utf32_output));
          }
          uint32_t value = (diff << 10) + diff2 + 0x10000;
          *utf32_output++ = char32_t(value);
        }
      }
      buf += k;
    }
  } // while
  return std::make_pair(buf, reinterpret_cast<char32_t *>(utf32_output));
}

/*
  Returns a pair: a result struct and utf32_output.
  If there is an error, the count field of the result is the position of the
  error. Otherwise, it is the position of the first unprocessed byte in buf
  (even if finished). A scalar routine should carry on the conversion of the
  tail if needed.
*/
template <endianness big_endian>
std::pair<result, char32_t *>
lasx_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len,
                                        char32_t *utf32_out) {
  uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
  const char16_t *start = buf;
  const char16_t *end = buf + len;

  // Fall back to scalar code until the output address is 32-byte aligned,
  // since unaligned vector stores degrade performance.
  while (((uint64_t)utf32_output & 0x1f) && buf < end) {
    uint16_t word =
        !match_system(big_endian) ? scalar::u16_swap_bytes(buf[0]) : buf[0];
    if ((word & 0xF800) != 0xD800) {
      *utf32_output++ = char32_t(word);
      buf++;
    } else if (buf + 1 < end) {
      // must be a surrogate pair
      uint16_t diff = uint16_t(word - 0xD800);
      uint16_t next_word =
          !match_system(big_endian) ? scalar::u16_swap_bytes(buf[1]) : buf[1];
      uint16_t diff2 = uint16_t(next_word - 0xDC00);
      if ((diff | diff2) > 0x3FF) {
        return std::make_pair(result(error_code::SURROGATE, buf - start),
                              reinterpret_cast<char32_t *>(utf32_output));
      }
      uint32_t value = (diff << 10) + diff2 + 0x10000;
      *utf32_output++ = char32_t(value);
      buf += 2;
    } else {
      return std::make_pair(result(error_code::SURROGATE, buf - start),
                            reinterpret_cast<char32_t *>(utf32_output));
    }
  }

  __m256i v_f800 = lasx_splat_u16(0xf800);
  __m256i v_d800 = lasx_splat_u16(0xd800);
  while (end - buf >= 16) {
    __m256i in = __lasx_xvld(reinterpret_cast<const uint16_t *>(buf), 0);
    if (!match_system(big_endian)) {
      in = lasx_swap_bytes(in);
    }

    __m256i surrogates_bytemask =
        __lasx_xvseq_h(__lasx_xvand_v(in, v_f800), v_d800);
    // It might seem like checking for surrogates_bitmask == 0xc000 could help.
    // However, it is likely an uncommon occurrence.
    if (__lasx_xbz_v(surrogates_bytemask)) {
      // case: no surrogate pairs, extend all 16-bit code units to 32-bit code
      // units
      __m256i in_hi = __lasx_xvpermi_q(in, in, 0b00000001);
      __lasx_xvst(__lasx_vext2xv_wu_hu(in), utf32_output, 0);
      __lasx_xvst(__lasx_vext2xv_wu_hu(in_hi), utf32_output, 32);
      utf32_output += 16;
      buf += 16;
      // surrogate pair(s) in a register
    } else {
      // Let us do a scalar fallback.
      // It may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint16_t word =
            !match_system(big_endian) ? scalar::u16_swap_bytes(buf[k]) : buf[k];
        if ((word & 0xF800) != 0xD800) {
          *utf32_output++ = char32_t(word);
        } else {
          // must be a surrogate pair
          uint16_t diff = uint16_t(word - 0xD800);
          uint16_t next_word = !match_system(big_endian)
                                   ? scalar::u16_swap_bytes(buf[k + 1])
                                   : buf[k + 1];
          k++;
          uint16_t diff2 = uint16_t(next_word - 0xDC00);
          if ((diff | diff2) > 0x3FF) {
            return std::make_pair(
                result(error_code::SURROGATE, buf - start + k - 1),
                reinterpret_cast<char32_t *>(utf32_output));
          }
          uint32_t value = (diff << 10) + diff2 + 0x10000;
          *utf32_output++ = char32_t(value);
        }
      }
      buf += k;
    }
  } // while
  return std::make_pair(result(error_code::SUCCESS, buf - start),
                        reinterpret_cast<char32_t *>(utf32_output));
}
/* end file src/lasx/lasx_convert_utf16_to_utf32.cpp */
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
/* begin file src/lasx/lasx_convert_utf32_to_latin1.cpp */
std::pair<const char32_t *, char *>
lasx_convert_utf32_to_latin1(const char32_t *buf, size_t len,
                             char *latin1_output) {
  const char32_t *end = buf + len;
  const __m256i shuf_mask = ____m256i(
      (__m128i)v16u8{0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0});
  __m256i v_ff = __lasx_xvrepli_w(0xFF);

  while (end - buf >= 16) {
    __m256i in1 = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 0);
    __m256i in2 = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 32);

    __m256i in12 = __lasx_xvor_v(in1, in2);
    if (__lasx_xbz_v(__lasx_xvslt_wu(v_ff, in12))) {
      // 1. pack the bytes
      __m256i latin1_packed_tmp = __lasx_xvshuf_b(in2, in1, shuf_mask);
      latin1_packed_tmp = __lasx_xvpermi_d(latin1_packed_tmp, 0b00001000);
      __m128i latin1_packed = lasx_extracti128_lo(latin1_packed_tmp);
      latin1_packed = __lsx_vpermi_w(latin1_packed, latin1_packed, 0b11011000);
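      // Within each 128-bit lane, the shuffle gathers the low byte of every
      // 32-bit code unit from in1 and in2; the permutes then put the sixteen
      // result bytes back into source order in one 128-bit register.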
      // 2. store (16 bytes)
      __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
      // 3. adjust pointers
      buf += 16;
      latin1_output += 16;
    } else {
      return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output));
    }
  } // while
  return std::make_pair(buf, latin1_output);
}

std::pair<result, char *>
lasx_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
                                         char *latin1_output) {
  const char32_t *start = buf;
  const char32_t *end = buf + len;

  const __m256i shuf_mask = ____m256i(
      (__m128i)v16u8{0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0});
  __m256i v_ff = __lasx_xvrepli_w(0xFF);

  while (end - buf >= 16) {
    __m256i in1 = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 0);
    __m256i in2 = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 32);

    __m256i in12 = __lasx_xvor_v(in1, in2);
    if (__lasx_xbz_v(__lasx_xvslt_wu(v_ff, in12))) {
      // 1. pack the bytes
      __m256i latin1_packed_tmp = __lasx_xvshuf_b(in2, in1, shuf_mask);
      latin1_packed_tmp = __lasx_xvpermi_d(latin1_packed_tmp, 0b00001000);
      __m128i latin1_packed = lasx_extracti128_lo(latin1_packed_tmp);
      latin1_packed = __lsx_vpermi_w(latin1_packed, latin1_packed, 0b11011000);
      // 2. store (16 bytes)
      __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
      // 3. adjust pointers
      buf += 16;
      latin1_output += 16;
    } else {
      // Let us do a scalar fallback.
      for (int k = 0; k < 16; k++) {
        uint32_t word = buf[k];
        if (word <= 0xff) {
          *latin1_output++ = char(word);
        } else {
          return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
                                latin1_output);
        }
      }
    }
  } // while
  return std::make_pair(result(error_code::SUCCESS, buf - start),
                        latin1_output);
}
/* end file src/lasx/lasx_convert_utf32_to_latin1.cpp */
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
/* begin file src/lasx/lasx_convert_utf32_to_utf8.cpp */
std::pair<const char32_t *, char *>
lasx_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) {
  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
  const char32_t *end = buf + len;

  // Process scalars until the load address is 32-byte aligned.
  while (((uint64_t)buf & 0x1F) && buf < end) {
    uint32_t word = *buf;
    if ((word & 0xFFFFFF80) == 0) {
      *utf8_output++ = char(word);
    } else if ((word & 0xFFFFF800) == 0) {
      *utf8_output++ = char((word >> 6) | 0b11000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
    } else if ((word & 0xFFFF0000) == 0) {
      if (word >= 0xD800 && word <= 0xDFFF) {
        return std::make_pair(nullptr, reinterpret_cast<char *>(utf8_output));
      }
      *utf8_output++ = char((word >> 12) | 0b11100000);
      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
    } else {
      if (word > 0x10FFFF) {
        return std::make_pair(nullptr, reinterpret_cast<char *>(utf8_output));
      }
      *utf8_output++ = char((word >> 18) | 0b11110000);
      *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
    }
    buf++;
  }

  __m256i v_c080 = lasx_splat_u16(0xc080);
  __m256i v_07ff = lasx_splat_u16(0x07ff);
  __m256i v_dfff = lasx_splat_u16(0xdfff);
  __m256i v_d800 = lasx_splat_u16(0xd800);
  __m256i zero = __lasx_xvldi(0);
  __m128i zero_128 = __lsx_vldi(0);
  __m256i forbidden_bytemask = __lasx_xvldi(0x0);

  const size_t safety_margin =
      12; // to avoid overruns, see issue
          // https://github.com/simdutf/simdutf/issues/92

  while (end - buf > std::ptrdiff_t(16 + safety_margin)) {
    __m256i in = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 0);
    __m256i nextin = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 32);

    // Check that no code point needs more than 16 bits: xvpickod_h gathers
    // the high 16-bit half of every 32-bit unit, so the result is all zero
    // exactly when every code point is at most 0xFFFF.
    if (__lasx_xbz_v(__lasx_xvpickod_h(in, nextin))) {
      // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
      // Apply UTF-16 => UTF-8 routine (lasx_convert_utf16_to_utf8.cpp)
      __m256i utf16_packed =
          __lasx_xvpermi_d(__lasx_xvpickev_h(nextin, in), 0b11011000);

      if (__lasx_xbz_v(__lasx_xvslt_hu(__lasx_xvrepli_h(0x7F),
                                       utf16_packed))) { // ASCII fast path!!!!
        // 1. pack the bytes
        // obviously suboptimal.
        __m256i utf8_packed = __lasx_xvpermi_d(
            __lasx_xvpickev_b(utf16_packed, utf16_packed), 0b00001000);
        // 2. store (16 bytes)
        __lsx_vst(lasx_extracti128_lo(utf8_packed), utf8_output, 0);
        // 3. adjust pointers
        buf += 16;
        utf8_output += 16;
        continue; // we are done for this round!
      }

      if (__lasx_xbz_v(__lasx_xvslt_hu(v_07ff, utf16_packed))) {
        // 1. prepare 2-byte values
        // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
        // expected output   : [110a|aaaa|10bb|bbbb] x 8

        // t0 = [000a|aaaa|bbbb|bb00]
        const __m256i t0 = __lasx_xvslli_h(utf16_packed, 2);
        // t1 = [000a|aaaa|0000|0000]
        const __m256i t1 = __lasx_xvand_v(t0, lasx_splat_u16(0x1f00));
        // t2 = [0000|0000|00bb|bbbb]
        const __m256i t2 = __lasx_xvand_v(utf16_packed, __lasx_xvrepli_h(0x3f));
        // t3 = [000a|aaaa|00bb|bbbb]
        const __m256i t3 = __lasx_xvor_v(t1, t2);
        // t4 = [110a|aaaa|10bb|bbbb]
        const __m256i t4 = __lasx_xvor_v(t3, v_c080);
        // 2. merge ASCII and 2-byte codewords
        __m256i one_byte_bytemask =
            __lasx_xvsle_hu(utf16_packed, __lasx_xvrepli_h(0x7F /*0x007F*/));
        __m256i utf8_unpacked =
            __lasx_xvbitsel_v(t4, utf16_packed, one_byte_bytemask);
        // 3. prepare bitmask for 8-bit lookup
        __m256i mask = __lasx_xvmskltz_h(one_byte_bytemask);
        uint32_t m1 = __lasx_xvpickve2gr_wu(mask, 0);
        uint32_t m2 = __lasx_xvpickve2gr_wu(mask, 4);
        // 4. pack the bytes
        const uint8_t *row1 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
                [lasx_1_2_utf8_bytes_mask[m1]][0];
        __m128i shuffle1 = __lsx_vld(row1, 1);
        __m128i utf8_packed1 = __lsx_vshuf_b(
            zero_128, lasx_extracti128_lo(utf8_unpacked), shuffle1);

        const uint8_t *row2 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
                [lasx_1_2_utf8_bytes_mask[m2]][0];
        __m128i shuffle2 = __lsx_vld(row2, 1);
        __m128i utf8_packed2 = __lsx_vshuf_b(
            zero_128, lasx_extracti128_hi(utf8_unpacked), shuffle2);
        // 5. store bytes
        __lsx_vst(utf8_packed1, utf8_output, 0);
        utf8_output += row1[0];

        __lsx_vst(utf8_packed2, utf8_output, 0);
        utf8_output += row2[0];

        buf += 16;
        continue;
      } else {
        // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
        forbidden_bytemask = __lasx_xvor_v(
            __lasx_xvand_v(
                __lasx_xvsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff
                __lasx_xvsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800
            forbidden_bytemask);
        /* In this branch we handle three cases:
           1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
              single UTF-8 byte
           2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] -
              two UTF-8 bytes
           3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
              three UTF-8 bytes

          We expand the input word (16-bit) into two code units (32-bit), thus
          we have room for four bytes. However, we need five distinct bit
          layouts. Note that the last byte in cases #2 and #3 is the same.

          We precompute byte 1 for case #1 and the common byte for cases #2 &
          #3 in register t2.

          We precompute byte 1 for case #3 and -- **conditionally** --
          precompute either byte 1 for case #2 or byte 2 for case #3. Note that
          they differ by exactly one bit.

          Finally from these two code units we build a proper UTF-8 sequence,
          taking into account the case (i.e., the number of bytes to write).
        */
        /**
         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
         * t2 => [0ccc|cccc] [10cc|cccc]
         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
         */
        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
        __m256i t0 = __lasx_xvpickev_b(utf16_packed, utf16_packed);
        t0 = __lasx_xvilvl_b(t0, t0);
        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
        __m256i v_3f7f = __lasx_xvreplgr2vr_h(uint16_t(0x3F7F));
        __m256i t1 = __lasx_xvand_v(t0, v_3f7f);
        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
        __m256i t2 = __lasx_xvor_v(t1, lasx_splat_u16(0x8000));

        // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
        __m256i s0 = __lasx_xvsrli_h(utf16_packed, 12);
        // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
        __m256i s1 = __lasx_xvslli_h(utf16_packed, 2);
        // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
        s1 = __lasx_xvand_v(s1, lasx_splat_u16(0x3f00));
        // [00bb|bbbb|0000|aaaa]
        __m256i s2 = __lasx_xvor_v(s0, s1);
        // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
        __m256i v_c0e0 = __lasx_xvreplgr2vr_h(uint16_t(0xC0E0));
        __m256i s3 = __lasx_xvor_v(s2, v_c0e0);
        __m256i one_or_two_bytes_bytemask =
            __lasx_xvsle_hu(utf16_packed, v_07ff);
        __m256i m0 =
            __lasx_xvandn_v(one_or_two_bytes_bytemask, lasx_splat_u16(0x4000));
        __m256i s4 = __lasx_xvxor_v(s3, m0);

        // 4. expand code units 16-bit => 32-bit
        __m256i out0 = __lasx_xvilvl_h(s4, t2);
        __m256i out1 = __lasx_xvilvh_h(s4, t2);

        // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
        __m256i one_byte_bytemask =
            __lasx_xvsle_hu(utf16_packed, __lasx_xvrepli_h(0x7F));

        __m256i one_or_two_bytes_bytemask_u16_to_u32_low =
            __lasx_xvilvl_h(one_or_two_bytes_bytemask, zero);
        __m256i one_or_two_bytes_bytemask_u16_to_u32_high =
            __lasx_xvilvh_h(one_or_two_bytes_bytemask, zero);

        __m256i one_byte_bytemask_u16_to_u32_low =
            __lasx_xvilvl_h(one_byte_bytemask, one_byte_bytemask);
        __m256i one_byte_bytemask_u16_to_u32_high =
            __lasx_xvilvh_h(one_byte_bytemask, one_byte_bytemask);

        __m256i mask0 = __lasx_xvmskltz_h(
            __lasx_xvor_v(one_or_two_bytes_bytemask_u16_to_u32_low,
                          one_byte_bytemask_u16_to_u32_low));
        __m256i mask1 = __lasx_xvmskltz_h(
            __lasx_xvor_v(one_or_two_bytes_bytemask_u16_to_u32_high,
                          one_byte_bytemask_u16_to_u32_high));

        uint32_t mask = __lasx_xvpickve2gr_wu(mask0, 0);
        const uint8_t *row0 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
                                                                  [0];
        __m128i shuffle0 = __lsx_vld(row0, 1);
        __m128i utf8_0 =
            __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out0), shuffle0);
        __lsx_vst(utf8_0, utf8_output, 0);
        utf8_output += row0[0];

        mask = __lasx_xvpickve2gr_wu(mask1, 0);
        const uint8_t *row1 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
                                                                  [0];
        __m128i shuffle1 = __lsx_vld(row1, 1);
        __m128i utf8_1 =
            __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out1), shuffle1);
        __lsx_vst(utf8_1, utf8_output, 0);
        utf8_output += row1[0];

        mask = __lasx_xvpickve2gr_wu(mask0, 4);
        const uint8_t *row2 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
                                                                  [0];
        __m128i shuffle2 = __lsx_vld(row2, 1);
        __m128i utf8_2 =
            __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out0), shuffle2);
        __lsx_vst(utf8_2, utf8_output, 0);
        utf8_output += row2[0];

        mask = __lasx_xvpickve2gr_wu(mask1, 4);
        const uint8_t *row3 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
                                                                  [0];
        __m128i shuffle3 = __lsx_vld(row3, 1);
        __m128i utf8_3 =
            __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out1), shuffle3);
        __lsx_vst(utf8_3, utf8_output, 0);
        utf8_output += row3[0];

        buf += 16;
      }
      // At least one 32-bit word will produce a surrogate pair in UTF-16 <=>
      // will produce four UTF-8 bytes.
    } else {
      // Let us do a scalar fallback.
      // It may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint32_t word = buf[k];
        if ((word & 0xFFFFFF80) == 0) {
          *utf8_output++ = char(word);
        } else if ((word & 0xFFFFF800) == 0) {
          *utf8_output++ = char((word >> 6) | 0b11000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else if ((word & 0xFFFF0000) == 0) {
          if (word >= 0xD800 && word <= 0xDFFF) {
            return std::make_pair(nullptr,
                                  reinterpret_cast<char *>(utf8_output));
          }
          *utf8_output++ = char((word >> 12) | 0b11100000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else {
          if (word > 0x10FFFF) {
            return std::make_pair(nullptr,
                                  reinterpret_cast<char *>(utf8_output));
          }
          *utf8_output++ = char((word >> 18) | 0b11110000);
          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while

  // check for invalid input
  if (__lasx_xbnz_v(forbidden_bytemask)) {
    return std::make_pair(nullptr, reinterpret_cast<char *>(utf8_output));
  }
  return std::make_pair(buf, reinterpret_cast<char *>(utf8_output));
}

std::pair<result, char *>
lasx_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
                                       char *utf8_out) {
  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
  const char32_t *start = buf;
  const char32_t *end = buf + len;

  // Process scalars until the load address is 32-byte aligned.
  while (((uint64_t)buf & 0x1F) && buf < end) {
    uint32_t word = *buf;
    if ((word & 0xFFFFFF80) == 0) {
      *utf8_output++ = char(word);
    } else if ((word & 0xFFFFF800) == 0) {
      *utf8_output++ = char((word >> 6) | 0b11000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
    } else if ((word & 0xFFFF0000) == 0) {
      if (word >= 0xD800 && word <= 0xDFFF) {
        return std::make_pair(result(error_code::SURROGATE, buf - start),
                              reinterpret_cast<char *>(utf8_output));
      }
      *utf8_output++ = char((word >> 12) | 0b11100000);
      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
    } else {
      if (word > 0x10FFFF) {
        return std::make_pair(result(error_code::TOO_LARGE, buf - start),
                              reinterpret_cast<char *>(utf8_output));
      }
      *utf8_output++ = char((word >> 18) | 0b11110000);
      *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
    }
    buf++;
  }

  __m256i v_c080 = lasx_splat_u16(0xc080);
  __m256i v_07ff = lasx_splat_u16(0x07ff);
  __m256i v_dfff = lasx_splat_u16(0xdfff);
  __m256i v_d800 = lasx_splat_u16(0xd800);
  __m256i zero = __lasx_xvldi(0);
  __m128i zero_128 = __lsx_vldi(0);
  __m256i forbidden_bytemask = __lasx_xvldi(0x0);
  const size_t safety_margin =
      12; // to avoid overruns, see issue
          // https://github.com/simdutf/simdutf/issues/92

  while (end - buf > std::ptrdiff_t(16 + safety_margin)) {
    __m256i in = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 0);
    __m256i nextin = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 32);

    // Check that no code point needs more than 16 bits: xvpickod_h gathers
    // the high 16-bit half of every 32-bit unit, so the result is all zero
    // exactly when every code point is at most 0xFFFF.
    if (__lasx_xbz_v(__lasx_xvpickod_h(in, nextin))) {
      // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
      // Apply UTF-16 => UTF-8 routine (lasx_convert_utf16_to_utf8.cpp)
      __m256i utf16_packed =
          __lasx_xvpermi_d(__lasx_xvpickev_h(nextin, in), 0b11011000);

      if (__lasx_xbz_v(__lasx_xvslt_hu(__lasx_xvrepli_h(0x7F),
                                       utf16_packed))) { // ASCII fast path!!!!
        // 1. pack the bytes
        // obviously suboptimal.
        __m256i utf8_packed = __lasx_xvpermi_d(
            __lasx_xvpickev_b(utf16_packed, utf16_packed), 0b00001000);
        // 2. store (16 bytes)
        __lsx_vst(lasx_extracti128_lo(utf8_packed), utf8_output, 0);
        // 3. adjust pointers
        buf += 16;
        utf8_output += 16;
        continue; // we are done for this round!
      }

      if (__lasx_xbz_v(__lasx_xvslt_hu(v_07ff, utf16_packed))) {
        // 1. prepare 2-byte values
        // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
        // expected output   : [110a|aaaa|10bb|bbbb] x 8

        // t0 = [000a|aaaa|bbbb|bb00]
        const __m256i t0 = __lasx_xvslli_h(utf16_packed, 2);
        // t1 = [000a|aaaa|0000|0000]
        const __m256i t1 = __lasx_xvand_v(t0, lasx_splat_u16(0x1f00));
        // t2 = [0000|0000|00bb|bbbb]
        const __m256i t2 = __lasx_xvand_v(utf16_packed, __lasx_xvrepli_h(0x3f));
        // t3 = [000a|aaaa|00bb|bbbb]
        const __m256i t3 = __lasx_xvor_v(t1, t2);
        // t4 = [110a|aaaa|10bb|bbbb]
        const __m256i t4 = __lasx_xvor_v(t3, v_c080);
        // 2. merge ASCII and 2-byte codewords
        __m256i one_byte_bytemask =
            __lasx_xvsle_hu(utf16_packed, __lasx_xvrepli_h(0x7F /*0x007F*/));
        __m256i utf8_unpacked =
            __lasx_xvbitsel_v(t4, utf16_packed, one_byte_bytemask);
        // 3. prepare bitmask for 8-bit lookup
        __m256i mask = __lasx_xvmskltz_h(one_byte_bytemask);
        uint32_t m1 = __lasx_xvpickve2gr_wu(mask, 0);
        uint32_t m2 = __lasx_xvpickve2gr_wu(mask, 4);
        // 4. pack the bytes
        const uint8_t *row1 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
                [lasx_1_2_utf8_bytes_mask[m1]][0];
        __m128i shuffle1 = __lsx_vld(row1, 1);
        __m128i utf8_packed1 = __lsx_vshuf_b(
            zero_128, lasx_extracti128_lo(utf8_unpacked), shuffle1);

        const uint8_t *row2 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
                [lasx_1_2_utf8_bytes_mask[m2]][0];
        __m128i shuffle2 = __lsx_vld(row2, 1);
        __m128i utf8_packed2 = __lsx_vshuf_b(
            zero_128, lasx_extracti128_hi(utf8_unpacked), shuffle2);
        // 5. store bytes
        __lsx_vst(utf8_packed1, utf8_output, 0);
        utf8_output += row1[0];

        __lsx_vst(utf8_packed2, utf8_output, 0);
        utf8_output += row2[0];

        buf += 16;
        continue;
      } else {
        // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
        forbidden_bytemask = __lasx_xvor_v(
            __lasx_xvand_v(
                __lasx_xvsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff
                __lasx_xvsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800
            forbidden_bytemask);
        if (__lasx_xbnz_v(forbidden_bytemask)) {
          return std::make_pair(result(error_code::SURROGATE, buf - start),
                                reinterpret_cast<char *>(utf8_output));
        }
        /* In this branch we handle three cases:
           1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
              single UTF-8 byte
           2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] -
              two UTF-8 bytes
           3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
              three UTF-8 bytes

          We expand the input word (16-bit) into two code units (32-bit), thus
          we have room for four bytes. However, we need five distinct bit
          layouts. Note that the last byte in cases #2 and #3 is the same.

          We precompute byte 1 for case #1 and the common byte for cases #2 &
          #3 in register t2.

          We precompute byte 1 for case #3 and -- **conditionally** --
          precompute either byte 1 for case #2 or byte 2 for case #3. Note that
          they differ by exactly one bit.

          Finally from these two code units we build a proper UTF-8 sequence,
          taking into account the case (i.e., the number of bytes to write).
        */
        /**
         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
         * t2 => [0ccc|cccc] [10cc|cccc]
         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
         */
        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
        __m256i t0 = __lasx_xvpickev_b(utf16_packed, utf16_packed);
        t0 = __lasx_xvilvl_b(t0, t0);
        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
        __m256i v_3f7f = __lasx_xvreplgr2vr_h(uint16_t(0x3F7F));
        __m256i t1 = __lasx_xvand_v(t0, v_3f7f);
        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
        __m256i t2 = __lasx_xvor_v(t1, lasx_splat_u16(0x8000));

        // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
        __m256i s0 = __lasx_xvsrli_h(utf16_packed, 12);
        // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
        __m256i s1 = __lasx_xvslli_h(utf16_packed, 2);
        // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
        s1 = __lasx_xvand_v(s1, lasx_splat_u16(0x3F00));
        // [00bb|bbbb|0000|aaaa]
        __m256i s2 = __lasx_xvor_v(s0, s1);
        // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
        __m256i v_c0e0 = __lasx_xvreplgr2vr_h(uint16_t(0xC0E0));
        __m256i s3 = __lasx_xvor_v(s2, v_c0e0);
        __m256i one_or_two_bytes_bytemask =
            __lasx_xvsle_hu(utf16_packed, v_07ff);
        __m256i m0 =
            __lasx_xvandn_v(one_or_two_bytes_bytemask, lasx_splat_u16(0x4000));
        __m256i s4 = __lasx_xvxor_v(s3, m0);

        // 4. expand code units 16-bit => 32-bit
        __m256i out0 = __lasx_xvilvl_h(s4, t2);
        __m256i out1 = __lasx_xvilvh_h(s4, t2);

        // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
        __m256i one_byte_bytemask =
            __lasx_xvsle_hu(utf16_packed, __lasx_xvrepli_h(0x7F));

        __m256i one_or_two_bytes_bytemask_u16_to_u32_low =
            __lasx_xvilvl_h(one_or_two_bytes_bytemask, zero);
        __m256i one_or_two_bytes_bytemask_u16_to_u32_high =
            __lasx_xvilvh_h(one_or_two_bytes_bytemask, zero);

        __m256i one_byte_bytemask_u16_to_u32_low =
            __lasx_xvilvl_h(one_byte_bytemask, one_byte_bytemask);
        __m256i one_byte_bytemask_u16_to_u32_high =
            __lasx_xvilvh_h(one_byte_bytemask, one_byte_bytemask);

        __m256i mask0 = __lasx_xvmskltz_h(
            __lasx_xvor_v(one_or_two_bytes_bytemask_u16_to_u32_low,
                          one_byte_bytemask_u16_to_u32_low));
        __m256i mask1 = __lasx_xvmskltz_h(
            __lasx_xvor_v(one_or_two_bytes_bytemask_u16_to_u32_high,
                          one_byte_bytemask_u16_to_u32_high));

        uint32_t mask = __lasx_xvpickve2gr_wu(mask0, 0);
        const uint8_t *row0 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
                                                                  [0];
        __m128i shuffle0 = __lsx_vld(row0, 1);
        __m128i utf8_0 =
            __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out0), shuffle0);
        __lsx_vst(utf8_0, utf8_output, 0);
        utf8_output += row0[0];

        mask = __lasx_xvpickve2gr_wu(mask1, 0);
        const uint8_t *row1 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
                                                                  [0];
        __m128i shuffle1 = __lsx_vld(row1, 1);
        __m128i utf8_1 =
            __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out1), shuffle1);
        __lsx_vst(utf8_1, utf8_output, 0);
        utf8_output += row1[0];

        mask = __lasx_xvpickve2gr_wu(mask0, 4);
        const uint8_t *row2 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
                                                                  [0];
        __m128i shuffle2 = __lsx_vld(row2, 1);
        __m128i utf8_2 =
            __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out0), shuffle2);
        __lsx_vst(utf8_2, utf8_output, 0);
        utf8_output += row2[0];

        mask = __lasx_xvpickve2gr_wu(mask1, 4);
        const uint8_t *row3 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
                                                                  [0];
        __m128i shuffle3 = __lsx_vld(row3, 1);
        __m128i utf8_3 =
            __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out1), shuffle3);
        __lsx_vst(utf8_3, utf8_output, 0);
        utf8_output += row3[0];

        buf += 16;
      }
      // At least one 32-bit word will produce a surrogate pair in UTF-16 <=>
      // will produce four UTF-8 bytes.
    } else {
      // Let us do a scalar fallback.
      // It may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint32_t word = buf[k];
        if ((word & 0xFFFFFF80) == 0) {
          *utf8_output++ = char(word);
        } else if ((word & 0xFFFFF800) == 0) {
          *utf8_output++ = char((word >> 6) | 0b11000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else if ((word & 0xFFFF0000) == 0) {
          if (word >= 0xD800 && word <= 0xDFFF) {
            return std::make_pair(
                result(error_code::SURROGATE, buf - start + k),
                reinterpret_cast<char *>(utf8_output));
          }
          *utf8_output++ = char((word >> 12) | 0b11100000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else {
          if (word > 0x10FFFF) {
            return std::make_pair(
                result(error_code::TOO_LARGE, buf - start + k),
                reinterpret_cast<char *>(utf8_output));
          }
          *utf8_output++ = char((word >> 18) | 0b11110000);
          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while

  return std::make_pair(result(error_code::SUCCESS, buf - start),
                        reinterpret_cast<char *>(utf8_output));
}
/* end file src/lasx/lasx_convert_utf32_to_utf8.cpp */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
/* begin file src/lasx/lasx_convert_utf32_to_utf16.cpp */
template <endianness big_endian>
std::pair<const char32_t *, char16_t *>
lasx_convert_utf32_to_utf16(const char32_t *buf, size_t len,
                            char16_t *utf16_out) {
  uint16_t *utf16_output = reinterpret_cast<uint16_t *>(utf16_out);
  const char32_t *end = buf + len;

  // Fall back to scalar code until the output address is 32-byte aligned,
  // since unaligned vector stores degrade performance.
  while (((uint64_t)utf16_output & 0x1F) && buf < end) {
    uint32_t word = *buf++;
    if ((word & 0xFFFF0000) == 0) {
      // will not generate a surrogate pair
      if (word >= 0xD800 && word <= 0xDFFF) {
        return std::make_pair(nullptr,
                              reinterpret_cast<char16_t *>(utf16_output));
      }
      *utf16_output++ = !match_system(big_endian)
                            ? char16_t(word >> 8 | word << 8)
                            : char16_t(word);
    } else {
      // will generate a surrogate pair
      if (word > 0x10FFFF) {
        return std::make_pair(nullptr,
                              reinterpret_cast<char16_t *>(utf16_output));
      }
      word -= 0x10000;
      uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
      uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
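      // Example: U+1F600 gives word - 0x10000 = 0xF600, so the high surrogate
      // is 0xD800 + 0x3D = 0xD83D and the low surrogate is
      // 0xDC00 + 0x200 = 0xDE00.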
      if (!match_system(big_endian)) {
        high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8);
        low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
      }
      *utf16_output++ = char16_t(high_surrogate);
      *utf16_output++ = char16_t(low_surrogate);
    }
  }

  __m256i forbidden_bytemask = __lasx_xvrepli_h(0);
  __m256i v_d800 = lasx_splat_u16(0xd800);
  __m256i v_dfff = lasx_splat_u16(0xdfff);
  while (end - buf >= 16) {
    __m256i in0 = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 0);
    __m256i in1 = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 32);

    // Check that no code point needs more than 16 bits: xvpickod_h gathers
    // the high 16-bit half of every 32-bit unit, so the result is all zero
    // exactly when every code point is at most 0xFFFF.
    if (__lasx_xbz_v(__lasx_xvpickod_h(in1, in0))) {
      __m256i utf16_packed =
          __lasx_xvpermi_d(__lasx_xvpickev_h(in1, in0), 0b11011000);
      forbidden_bytemask = __lasx_xvor_v(
          __lasx_xvand_v(
              __lasx_xvsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff
              __lasx_xvsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800
          forbidden_bytemask);

      if (!match_system(big_endian)) {
        utf16_packed = lasx_swap_bytes(utf16_packed);
      }
      __lasx_xvst(utf16_packed, utf16_output, 0);
      utf16_output += 16;
      buf += 16;
    } else {
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint32_t word = buf[k];
        if ((word & 0xFFFF0000) == 0) {
          // will not generate a surrogate pair
          if (word >= 0xD800 && word <= 0xDFFF) {
            return std::make_pair(nullptr,
                                  reinterpret_cast<char16_t *>(utf16_output));
          }
          *utf16_output++ = !match_system(big_endian)
                                ? char16_t(word >> 8 | word << 8)
                                : char16_t(word);
        } else {
          // will generate a surrogate pair
          if (word > 0x10FFFF) {
            return std::make_pair(nullptr,
                                  reinterpret_cast<char16_t *>(utf16_output));
          }
          word -= 0x10000;
          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
          if (!match_system(big_endian)) {
            high_surrogate =
                uint16_t(high_surrogate >> 8 | high_surrogate << 8);
            low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
          }
          *utf16_output++ = char16_t(high_surrogate);
          *utf16_output++ = char16_t(low_surrogate);
        }
      }
      buf += k;
    }
  }

  // check for invalid input
  if (__lasx_xbnz_v(forbidden_bytemask)) {
    return std::make_pair(nullptr, reinterpret_cast<char16_t *>(utf16_output));
  }
  return std::make_pair(buf, reinterpret_cast<char16_t *>(utf16_output));
}

template <endianness big_endian>
std::pair<result, char16_t *>
lasx_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len,
                                        char16_t *utf16_out) {
  uint16_t *utf16_output = reinterpret_cast<uint16_t *>(utf16_out);
  const char32_t *start = buf;
  const char32_t *end = buf + len;

  // Fall back to scalar code until the output address is 32-byte aligned,
  // since unaligned vector stores degrade performance.
  while (((uint64_t)utf16_output & 0x1F) && buf < end) {
    uint32_t word = *buf++;
    if ((word & 0xFFFF0000) == 0) {
      // will not generate a surrogate pair
      if (word >= 0xD800 && word <= 0xDFFF) {
        return std::make_pair(result(error_code::SURROGATE, buf - start - 1),
                              reinterpret_cast<char16_t *>(utf16_output));
      }
      *utf16_output++ = !match_system(big_endian)
                            ? char16_t(word >> 8 | word << 8)
                            : char16_t(word);
    } else {
      // will generate a surrogate pair
      if (word > 0x10FFFF) {
        return std::make_pair(result(error_code::TOO_LARGE, buf - start - 1),
                              reinterpret_cast<char16_t *>(utf16_output));
      }
      word -= 0x10000;
      uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
      uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
      if (!match_system(big_endian)) {
        high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8);
        low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
      }
      *utf16_output++ = char16_t(high_surrogate);
      *utf16_output++ = char16_t(low_surrogate);
    }
  }

  __m256i forbidden_bytemask = __lasx_xvrepli_h(0);
  __m256i v_d800 = lasx_splat_u16(0xd800);
  __m256i v_dfff = lasx_splat_u16(0xdfff);
  while (end - buf >= 16) {
    __m256i in0 = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 0);
    __m256i in1 = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 32);

    // Check that no code point needs more than 16 bits: xvpickod_h gathers
    // the high 16-bit half of every 32-bit unit, so the result is all zero
    // exactly when every code point is at most 0xFFFF.
    if (__lasx_xbz_v(__lasx_xvpickod_h(in1, in0))) {
      __m256i utf16_packed =
          __lasx_xvpermi_d(__lasx_xvpickev_h(in1, in0), 0b11011000);
      forbidden_bytemask = __lasx_xvor_v(
          __lasx_xvand_v(
              __lasx_xvsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff
              __lasx_xvsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800
          forbidden_bytemask);
      if (__lasx_xbnz_v(forbidden_bytemask)) {
        return std::make_pair(result(error_code::SURROGATE, buf - start),
                              reinterpret_cast<char16_t *>(utf16_output));
      }

      if (!match_system(big_endian)) {
        utf16_packed = lasx_swap_bytes(utf16_packed);
      }

      __lasx_xvst(utf16_packed, utf16_output, 0);
      utf16_output += 16;
      buf += 16;
    } else {
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint32_t word = buf[k];
        if ((word & 0xFFFF0000) == 0) {
          // will not generate a surrogate pair
          if (word >= 0xD800 && word <= 0xDFFF) {
            return std::make_pair(
                result(error_code::SURROGATE, buf - start + k),
                reinterpret_cast<char16_t *>(utf16_output));
          }
          *utf16_output++ = !match_system(big_endian)
                                ? char16_t(word >> 8 | word << 8)
                                : char16_t(word);
        } else {
          // will generate a surrogate pair
          if (word > 0x10FFFF) {
            return std::make_pair(
                result(error_code::TOO_LARGE, buf - start + k),
                reinterpret_cast<char16_t *>(utf16_output));
          }
          word -= 0x10000;
          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
          if (!match_system(big_endian)) {
            high_surrogate =
                uint16_t(high_surrogate >> 8 | high_surrogate << 8);
            low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
          }
          *utf16_output++ = char16_t(high_surrogate);
          *utf16_output++ = char16_t(low_surrogate);
        }
      }
      buf += k;
    }
  }

  return std::make_pair(result(error_code::SUCCESS, buf - start),
                        reinterpret_cast<char16_t *>(utf16_output));
}
/* end file src/lasx/lasx_convert_utf32_to_utf16.cpp */
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
#if SIMDUTF_FEATURE_BASE64
/* begin file src/lasx/lasx_base64.cpp */
/**
 * References and further reading:
 *
 * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
 * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
 * https://arxiv.org/abs/1910.05109
 *
 * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
 * Instructions, ACM Transactions on the Web 12 (3), 2018.
 * https://arxiv.org/abs/1704.00605
 *
 * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
 * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
 * Request for Comments: 4648.
 *
 * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
 * http://www.alfredklomp.com/programming/sse-base64/. (2014).
 *
 * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
 * acceleration. https://github.com/aklomp/base64. (2014).
 *
 * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
 * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
 *
 * Nick Kopp. 2013. Base64 Encoding on a GPU.
 * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
 */

template <bool isbase64url>
size_t encode_base64(char *dst, const char *src, size_t srclen,
                     base64_options options) {
  // credit: Wojciech Muła
  // SSE (lookup: pshufb improved unrolled)
  const uint8_t *input = (const uint8_t *)src;
  static const char *lookup_tbl =
      isbase64url
          ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
          : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
  uint8_t *out = (uint8_t *)dst;

  v32u8 shuf;
  __m256i v_fc0fc00, v_3f03f0, shift_r, shift_l, base64_tbl0, base64_tbl1,
      base64_tbl2, base64_tbl3;
  if (srclen >= 28) {
    shuf = v32u8{1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10,
                 1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10};

    v_fc0fc00 = __lasx_xvreplgr2vr_w(uint32_t(0x0fc0fc00));
    v_3f03f0 = __lasx_xvreplgr2vr_w(uint32_t(0x003f03f0));
    shift_r = __lasx_xvreplgr2vr_w(uint32_t(0x0006000a));
    shift_l = __lasx_xvreplgr2vr_w(uint32_t(0x00080004));
    base64_tbl0 = ____m256i(__lsx_vld(lookup_tbl, 0));
    base64_tbl1 = ____m256i(__lsx_vld(lookup_tbl, 16));
    base64_tbl2 = ____m256i(__lsx_vld(lookup_tbl, 32));
    base64_tbl3 = ____m256i(__lsx_vld(lookup_tbl, 48));
  }
  size_t i = 0;
  for (; i + 100 <= srclen; i += 96) {
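    // Each iteration consumes 96 input bytes via eight 16-byte loads spaced
    // 12 bytes apart and emits 128 base64 characters. The last load starts at
    // offset i + 84 and reads through i + 99, which is why the loop requires
    // i + 100 <= srclen.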
    __m128i in0_lo =
        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 0);
    __m128i in0_hi =
        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 1);
    __m128i in1_lo =
        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 2);
    __m128i in1_hi =
        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 3);
    __m128i in2_lo =
        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 4);
    __m128i in2_hi =
        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 5);
    __m128i in3_lo =
        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 6);
    __m128i in3_hi =
        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 7);

    __m256i in0 = lasx_set_q(in0_hi, in0_lo);
    __m256i in1 = lasx_set_q(in1_hi, in1_lo);
    __m256i in2 = lasx_set_q(in2_hi, in2_lo);
    __m256i in3 = lasx_set_q(in3_hi, in3_lo);

    in0 = __lasx_xvshuf_b(in0, in0, (__m256i)shuf);
    in1 = __lasx_xvshuf_b(in1, in1, (__m256i)shuf);
    in2 = __lasx_xvshuf_b(in2, in2, (__m256i)shuf);
    in3 = __lasx_xvshuf_b(in3, in3, (__m256i)shuf);

    __m256i t0_0 = __lasx_xvand_v(in0, v_fc0fc00);
    __m256i t0_1 = __lasx_xvand_v(in1, v_fc0fc00);
    __m256i t0_2 = __lasx_xvand_v(in2, v_fc0fc00);
    __m256i t0_3 = __lasx_xvand_v(in3, v_fc0fc00);

    __m256i t1_0 = __lasx_xvsrl_h(t0_0, shift_r);
    __m256i t1_1 = __lasx_xvsrl_h(t0_1, shift_r);
    __m256i t1_2 = __lasx_xvsrl_h(t0_2, shift_r);
    __m256i t1_3 = __lasx_xvsrl_h(t0_3, shift_r);

    __m256i t2_0 = __lasx_xvand_v(in0, v_3f03f0);
    __m256i t2_1 = __lasx_xvand_v(in1, v_3f03f0);
    __m256i t2_2 = __lasx_xvand_v(in2, v_3f03f0);
    __m256i t2_3 = __lasx_xvand_v(in3, v_3f03f0);

    __m256i t3_0 = __lasx_xvsll_h(t2_0, shift_l);
    __m256i t3_1 = __lasx_xvsll_h(t2_1, shift_l);
    __m256i t3_2 = __lasx_xvsll_h(t2_2, shift_l);
    __m256i t3_3 = __lasx_xvsll_h(t2_3, shift_l);

    __m256i input0 = __lasx_xvor_v(t1_0, t3_0);
    __m256i input0_shuf0 = __lasx_xvshuf_b(base64_tbl1, base64_tbl0, input0);
    __m256i input0_shuf1 = __lasx_xvshuf_b(
        base64_tbl3, base64_tbl2, __lasx_xvsub_b(input0, __lasx_xvldi(32)));
    __m256i input0_mask = __lasx_xvslei_bu(input0, 31);
    __m256i input0_result =
        __lasx_xvbitsel_v(input0_shuf1, input0_shuf0, input0_mask);
    __lasx_xvst(input0_result, reinterpret_cast<__m256i *>(out), 0);
    out += 32;

    __m256i input1 = __lasx_xvor_v(t1_1, t3_1);
    __m256i input1_shuf0 = __lasx_xvshuf_b(base64_tbl1, base64_tbl0, input1);
    __m256i input1_shuf1 = __lasx_xvshuf_b(
        base64_tbl3, base64_tbl2, __lasx_xvsub_b(input1, __lasx_xvldi(32)));
    __m256i input1_mask = __lasx_xvslei_bu(input1, 31);
    __m256i input1_result =
        __lasx_xvbitsel_v(input1_shuf1, input1_shuf0, input1_mask);
    __lasx_xvst(input1_result, reinterpret_cast<__m256i *>(out), 0);
    out += 32;

    __m256i input2 = __lasx_xvor_v(t1_2, t3_2);
    __m256i input2_shuf0 = __lasx_xvshuf_b(base64_tbl1, base64_tbl0, input2);
    __m256i input2_shuf1 = __lasx_xvshuf_b(
        base64_tbl3, base64_tbl2, __lasx_xvsub_b(input2, __lasx_xvldi(32)));
    __m256i input2_mask = __lasx_xvslei_bu(input2, 31);
    __m256i input2_result =
        __lasx_xvbitsel_v(input2_shuf1, input2_shuf0, input2_mask);
    __lasx_xvst(input2_result, reinterpret_cast<__m256i *>(out), 0);
    out += 32;

    __m256i input3 = __lasx_xvor_v(t1_3, t3_3);
    __m256i input3_shuf0 = __lasx_xvshuf_b(base64_tbl1, base64_tbl0, input3);
    __m256i input3_shuf1 = __lasx_xvshuf_b(
        base64_tbl3, base64_tbl2, __lasx_xvsub_b(input3, __lasx_xvldi(32)));
    __m256i input3_mask = __lasx_xvslei_bu(input3, 31);
    __m256i input3_result =
        __lasx_xvbitsel_v(input3_shuf1, input3_shuf0, input3_mask);
    __lasx_xvst(input3_result, reinterpret_cast<__m256i *>(out), 0);
    out += 32;
  }
  for (; i + 28 <= srclen; i += 24) {
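    // Tail loop: 24 input bytes per iteration, 32 output characters. The
    // second 16-byte load starts at offset i + 12 and reads through i + 27,
    // hence the i + 28 <= srclen bound.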

    __m128i in_lo = __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 0);
    __m128i in_hi =
        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 1);

    __m256i in = lasx_set_q(in_hi, in_lo);

    // bytes from groups A, B and C are needed in separate 32-bit lanes
    // in = [DDDD|CCCC|BBBB|AAAA]
    //
    //      an input triplet has layout
    //      [????????|ccdddddd|bbbbcccc|aaaaaabb]
    //        byte 3   byte 2   byte 1   byte 0    -- byte 3 comes from the next
    //        triplet
    //
    //      shuffling changes the order of bytes: 1, 0, 2, 1
    //      [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc]
    //           ^^^^ ^^^^^^^^ ^^^^^^^^ ^^^^
    //                  processed bits
    in = __lasx_xvshuf_b(in, in, (__m256i)shuf);

    // unpacking
    // t0 = [0000cccc|cc000000|aaaaaa00|00000000]
    __m256i t0 = __lasx_xvand_v(in, v_fc0fc00);
    // t1 = [00000000|00cccccc|00000000|00aaaaaa]
    //      ((c >> 6), (a >> 10))
    __m256i t1 = __lasx_xvsrl_h(t0, shift_r);

    // t2 = [00000000|00dddddd|000000bb|bbbb0000]
    __m256i t2 = __lasx_xvand_v(in, v_3f03f0);
    // t3 = [00dddddd|00000000|00bbbbbb|00000000]
    //      ((d << 8), (b << 4))
    __m256i t3 = __lasx_xvsll_h(t2, shift_l);

    // res = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] = t1 | t3
    __m256i indices = __lasx_xvor_v(t1, t3);

    __m256i indices_shuf0 = __lasx_xvshuf_b(base64_tbl1, base64_tbl0, indices);
    __m256i indices_shuf1 = __lasx_xvshuf_b(
        base64_tbl3, base64_tbl2, __lasx_xvsub_b(indices, __lasx_xvldi(32)));
    __m256i indices_mask = __lasx_xvslei_bu(indices, 31);
    __m256i indices_result =
        __lasx_xvbitsel_v(indices_shuf1, indices_shuf0, indices_mask);
    __lasx_xvst(indices_result, reinterpret_cast<__m256i *>(out), 0);
    out += 32;
  }

  return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i,
                                                        srclen - i, options);
}

static inline void compress(__m128i data, uint16_t mask, char *output) {
  if (mask == 0) {
    __lsx_vst(data, reinterpret_cast<__m128i *>(output), 0);
    return;
  }
  // this particular implementation was inspired by work done by @animetosho
  // we do it in two steps, first 8 bytes and then second 8 bytes
  uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
  uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
  // next line just loads the 64-bit values thintable_epi8[mask1] and
  // thintable_epi8[mask2] into a 128-bit register, using only
  // two instructions on most compilers.

  v2u64 shufmask = {tables::base64::thintable_epi8[mask1],
                    tables::base64::thintable_epi8[mask2]};

  // we increment by 0x08 the second half of the mask
  const v4u32 hi = {0, 0, 0x08080808, 0x08080808};
  __m128i shufmask1 = __lsx_vadd_b((__m128i)shufmask, (__m128i)hi);

  // this is the version "nearly pruned"
  __m128i pruned = __lsx_vshuf_b(data, data, shufmask1);
  // we still need to put the two halves together.
  // we compute the popcount of the first half:
  int pop1 = tables::base64::BitsSetTable256mul2[mask1];
  // then load the corresponding mask, what it does is to write
  // only the first pop1 bytes from the first 8 bytes, and then
  // it fills in with the bytes from the second 8 bytes + some filling
  // at the end.
  __m128i compactmask =
      __lsx_vld(reinterpret_cast<const __m128i *>(
                    tables::base64::pshufb_combine_table + pop1 * 8),
                0);
  __m128i answer = __lsx_vshuf_b(pruned, pruned, compactmask);

  __lsx_vst(answer, reinterpret_cast<__m128i *>(output), 0);
}

struct block64 {
  __m256i chunks[2];
};

template <bool base64_url>
static inline uint32_t to_base64_mask(__m256i *src, bool *error) {
  __m256i ascii_space_tbl =
      ____m256i((__m128i)v16u8{0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
                               0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0});
  // credit: aqrit
  __m256i delta_asso =
      ____m256i((__m128i)v16u8{0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0,
                               0x0, 0x0, 0x0, 0xF, 0x0, 0xF});
  __m256i delta_values;
  if (base64_url) {
    delta_values = ____m256i(
        (__m128i)v16i8{int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
                       int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
                       int8_t(0xB9), int8_t(0x00), int8_t(0x11), int8_t(0xC3),
                       int8_t(0xBF), int8_t(0xE0), int8_t(0xB9), int8_t(0xB9)});
  } else {
    delta_values = ____m256i(
        (__m128i)v16i8{int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
                       int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
                       int8_t(0xB9), int8_t(0x00), int8_t(0x10), int8_t(0xC3),
                       int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9)});
  }

  __m256i check_asso;
  if (base64_url) {
    check_asso = ____m256i((__m128i)v16u8{0x0D, 0x01, 0x01, 0x01, 0x01, 0x01,
                                          0x01, 0x01, 0x01, 0x01, 0x03, 0x07,
                                          0x0B, 0x06, 0x0B, 0x12});
  } else {
    check_asso = ____m256i((__m128i)v16u8{0x0D, 0x01, 0x01, 0x01, 0x01, 0x01,
                                          0x01, 0x01, 0x01, 0x01, 0x03, 0x07,
                                          0x0B, 0x0B, 0x0B, 0x0F});
  }

  __m256i check_values;
  if (base64_url) {
    check_values = ____m256i(
        (__m128i)v16i8{int8_t(0x0), int8_t(0x80), int8_t(0x80), int8_t(0x80),
                       int8_t(0xCF), int8_t(0xBF), int8_t(0xD3), int8_t(0xA6),
                       int8_t(0xB5), int8_t(0x86), int8_t(0xD0), int8_t(0x80),
                       int8_t(0xB0), int8_t(0x80), int8_t(0x0), int8_t(0x0)});
  } else {
    check_values = ____m256i(
        (__m128i)v16i8{int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
                       int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6),
                       int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80),
                       int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80)});
  }

  __m256i shifted = __lasx_xvsrli_b(*src, 3);
  __m256i asso_index = __lasx_xvand_v(*src, __lasx_xvldi(0xF));
  __m256i delta_hash = __lasx_xvavgr_bu(
      __lasx_xvshuf_b(delta_asso, delta_asso, asso_index), shifted);
  __m256i check_hash = __lasx_xvavgr_bu(
      __lasx_xvshuf_b(check_asso, check_asso, asso_index), shifted);

  __m256i out = __lasx_xvsadd_b(
      __lasx_xvshuf_b(delta_values, delta_values, delta_hash), *src);
  __m256i chk = __lasx_xvsadd_b(
      __lasx_xvshuf_b(check_values, check_values, check_hash), *src);
  __m256i chk_ltz = __lasx_xvmskltz_b(chk);
  unsigned int mask = __lasx_xvpickve2gr_wu(chk_ltz, 0);
  mask = mask | (__lsx_vpickve2gr_hu(lasx_extracti128_hi(chk_ltz), 0) << 16);
  if (mask) {
    __m256i ascii_space = __lasx_xvseq_b(
        __lasx_xvshuf_b(ascii_space_tbl, ascii_space_tbl, asso_index), *src);
    __m256i ascii_space_ltz = __lasx_xvmskltz_b(ascii_space);
    unsigned int ascii_space_mask = __lasx_xvpickve2gr_wu(ascii_space_ltz, 0);
    ascii_space_mask =
        ascii_space_mask |
        (__lsx_vpickve2gr_hu(lasx_extracti128_hi(ascii_space_ltz), 0) << 16);
    *error |= (mask != ascii_space_mask);
  }

  *src = out;
  return (uint32_t)mask;
}

template <bool base64_url>
static inline uint64_t to_base64_mask(block64 *b, bool *error) {
  *error = 0;
  uint64_t m0 = to_base64_mask<base64_url>(&b->chunks[0], error);
  uint64_t m1 = to_base64_mask<base64_url>(&b->chunks[1], error);
  return m0 | (m1 << 32);
}

static inline void copy_block(block64 *b, char *output) {
  __lasx_xvst(b->chunks[0], reinterpret_cast<__m256i *>(output), 0);
  __lasx_xvst(b->chunks[1], reinterpret_cast<__m256i *>(output), 32);
}

static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
  uint64_t nmask = ~mask;
  uint64_t count =
      __lsx_vpickve2gr_d(__lsx_vpcnt_h(__lsx_vreplgr2vr_d(nmask)), 0);
  uint16_t *count_ptr = (uint16_t *)&count;
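  // count holds four 16-bit popcounts: count_ptr[k] is the number of bytes
  // kept from the k-th 16-byte quarter of the block, which yields the running
  // output offsets used below.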
|
|
compress(lasx_extracti128_lo(b->chunks[0]), uint16_t(mask), output);
|
|
compress(lasx_extracti128_hi(b->chunks[0]), uint16_t(mask >> 16),
|
|
output + count_ptr[0]);
|
|
compress(lasx_extracti128_lo(b->chunks[1]), uint16_t(mask >> 32),
|
|
output + count_ptr[0] + count_ptr[1]);
|
|
compress(lasx_extracti128_hi(b->chunks[1]), uint16_t(mask >> 48),
|
|
output + count_ptr[0] + count_ptr[1] + count_ptr[2]);
|
|
return count_ones(nmask);
|
|
}
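
// How compress_block routes its output: __lsx_vpcnt_h popcounts ~mask in
// 16-bit lanes, so count_ptr[i] is the number of bytes kept from the i-th
// 16-byte quarter. Each quarter is compressed independently and appended at
// the running offset; with mask == 0 every quarter keeps all 16 bytes and the
// result is a plain 64-byte copy.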

// The caller of this function is responsible for ensuring that there are 64
// bytes available for reading at src. The data is read into a block64
// structure.
static inline void load_block(block64 *b, const char *src) {
  b->chunks[0] = __lasx_xvld(reinterpret_cast<const __m256i *>(src), 0);
  b->chunks[1] = __lasx_xvld(reinterpret_cast<const __m256i *>(src), 32);
}

// The caller of this function is responsible for ensuring that there are 128
// bytes available for reading at src. The data is read into a block64
// structure.
static inline void load_block(block64 *b, const char16_t *src) {
  __m256i m1 = __lasx_xvld(reinterpret_cast<const __m256i *>(src), 0);
  __m256i m2 = __lasx_xvld(reinterpret_cast<const __m256i *>(src), 32);
  __m256i m3 = __lasx_xvld(reinterpret_cast<const __m256i *>(src), 64);
  __m256i m4 = __lasx_xvld(reinterpret_cast<const __m256i *>(src), 96);
  b->chunks[0] = __lasx_xvpermi_d(__lasx_xvssrlni_bu_h(m2, m1, 0), 0b11011000);
  b->chunks[1] = __lasx_xvpermi_d(__lasx_xvssrlni_bu_h(m4, m3, 0), 0b11011000);
}
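
// The UTF-16 loader narrows 16-bit code units to bytes via
// __lasx_xvssrlni_bu_h with a zero shift, which (as we understand the
// instruction) saturates any unit above 0xFF to 0xFF -- a value the base64
// classification then rejects. Because the narrowing interleaves the two
// 128-bit lanes, the 0b11011000 permutation restores the original byte order.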

static inline void base64_decode(char *out, __m256i str) {
  __m256i t0 = __lasx_xvor_v(
      __lasx_xvslli_w(str, 26),
      __lasx_xvslli_w(__lasx_xvand_v(str, lasx_splat_u32(0x0000ff00)), 12));
  __m256i t1 =
      __lasx_xvsrli_w(__lasx_xvand_v(str, lasx_splat_u32(0x003f0000)), 2);
  __m256i t2 = __lasx_xvor_v(t0, t1);
  __m256i t3 = __lasx_xvor_v(t2, __lasx_xvsrli_w(str, 16));
  __m256i pack_shuffle = ____m256i(
      (__m128i)v16u8{3, 2, 1, 7, 6, 5, 11, 10, 9, 15, 14, 13, 0, 0, 0, 0});
  t3 = __lasx_xvshuf_b(t3, t3, (__m256i)pack_shuffle);

  // Store the output:
  __lsx_vst(lasx_extracti128_lo(t3), out, 0);
  __lsx_vst(lasx_extracti128_hi(t3), out, 12);
}
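
// Within each 32-bit lane the shifts above merge four 6-bit values into 24
// bits (bits 8..31), and the {3, 2, 1, ...} shuffle emits them in big-endian
// byte order, 12 bytes per 128-bit half. For instance, the sextets of "TWFu"
// (19, 22, 5, 46) combine into 0x4D616E, the bytes of "Man". Note that each
// 16-byte store carries only 12 useful bytes, so the second store may write
// up to 4 bytes past out + 24; the _safe variants below exist for that reason.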
// decode 64 bytes and output 48 bytes
static inline void base64_decode_block(char *out, const char *src) {
  base64_decode(out, __lasx_xvld(reinterpret_cast<const __m256i *>(src), 0));
  base64_decode(out + 24,
                __lasx_xvld(reinterpret_cast<const __m256i *>(src), 32));
}

static inline void base64_decode_block_safe(char *out, const char *src) {
  base64_decode(out, __lasx_xvld(reinterpret_cast<const __m256i *>(src), 0));
  char buffer[32];
  base64_decode(buffer,
                __lasx_xvld(reinterpret_cast<const __m256i *>(src), 32));
  std::memcpy(out + 24, buffer, 24);
}

static inline void base64_decode_block(char *out, block64 *b) {
  base64_decode(out, b->chunks[0]);
  base64_decode(out + 24, b->chunks[1]);
}
static inline void base64_decode_block_safe(char *out, block64 *b) {
  base64_decode(out, b->chunks[0]);
  char buffer[32];
  base64_decode(buffer, b->chunks[1]);
  std::memcpy(out + 24, buffer, 24);
}
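
// The _safe variants decode the second half into a 32-byte scratch buffer and
// copy exactly 24 bytes out, so nothing is written past out + 48. They are
// selected whenever the destination pointer is within 63 bytes of the end of
// the decoded output (see end_of_safe_64byte_zone below); elsewhere the spill
// of the plain variants is harmless because later writes overwrite it.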

template <bool base64_url, bool ignore_garbage, typename chartype>
full_result
compress_decode_base64(char *dst, const chartype *src, size_t srclen,
                       base64_options options,
                       last_chunk_handling_options last_chunk_options) {
  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
                                        : tables::base64::to_base64_value;
  size_t equallocation =
      srclen; // location of the first padding character if any
  // skip trailing spaces
  while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
         to_base64[uint8_t(src[srclen - 1])] == 64) {
    srclen--;
  }
  size_t equalsigns = 0;
  if (srclen > 0 && src[srclen - 1] == '=') {
    equallocation = srclen - 1;
    srclen--;
    equalsigns = 1;
    // skip trailing spaces
    while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
           to_base64[uint8_t(src[srclen - 1])] == 64) {
      srclen--;
    }
    if (srclen > 0 && src[srclen - 1] == '=') {
      equallocation = srclen - 1;
      srclen--;
      equalsigns = 2;
    }
  }
  if (srclen == 0) {
    if (!ignore_garbage && equalsigns > 0) {
      if (last_chunk_options == last_chunk_handling_options::strict) {
        return {BASE64_INPUT_REMAINDER, 0, 0};
      } else if (last_chunk_options ==
                 last_chunk_handling_options::stop_before_partial) {
        return {SUCCESS, 0, 0};
      }
      return {INVALID_BASE64_CHARACTER, equallocation, 0};
    }
    return {SUCCESS, 0, 0};
  }
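  // (srclen + 3) / 4 * 3 is an upper bound on the number of decoded bytes; the
  // zone below marks the last destination position at which a full 64-byte
  // block decode (48 useful bytes plus the wide-store spill) should still stay
  // within the output, so past it the buffered _safe decoders are used instead.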
  char *end_of_safe_64byte_zone =
      (srclen + 3) / 4 * 3 >= 63 ? dst + (srclen + 3) / 4 * 3 - 63 : dst;

  const chartype *const srcinit = src;
  const char *const dstinit = dst;
  const chartype *const srcend = src + srclen;

  constexpr size_t block_size = 6;
  static_assert(block_size >= 2, "block_size must be at least two");
  char buffer[block_size * 64];
  char *bufferptr = buffer;
  if (srclen >= 64) {
    const chartype *const srcend64 = src + srclen - 64;
    while (src <= srcend64) {
      block64 b;
      load_block(&b, src);
      src += 64;
      bool error = false;
      uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
      if (error && !ignore_garbage) {
        src -= 64;
        while (src < srcend && scalar::base64::is_eight_byte(*src) &&
               to_base64[uint8_t(*src)] <= 64) {
          src++;
        }
        return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
                size_t(dst - dstinit)};
      }
      if (badcharmask != 0) {
        // optimization opportunity: check for simple masks like those made of
        // continuous 1s followed by continuous 0s. And masks containing a
        // single bad character.
        bufferptr += compress_block(&b, badcharmask, bufferptr);
      } else if (bufferptr != buffer) {
        copy_block(&b, bufferptr);
        bufferptr += 64;
      } else {
        if (dst >= end_of_safe_64byte_zone) {
          base64_decode_block_safe(dst, &b);
        } else {
          base64_decode_block(dst, &b);
        }
        dst += 48;
      }
      if (bufferptr >= (block_size - 1) * 64 + buffer) {
        for (size_t i = 0; i < (block_size - 2); i++) {
          base64_decode_block(dst, buffer + i * 64);
          dst += 48;
        }
        if (dst >= end_of_safe_64byte_zone) {
          base64_decode_block_safe(dst, buffer + (block_size - 2) * 64);
        } else {
          base64_decode_block(dst, buffer + (block_size - 2) * 64);
        }
        dst += 48;
        std::memcpy(buffer, buffer + (block_size - 1) * 64,
                    64); // 64 might be too much
        bufferptr -= (block_size - 1) * 64;
      }
    }
  }
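
  // What remains is drained from the temporary buffer: it is first topped up
  // to a multiple of 64 sextets when enough input is left, whole 64-sextet
  // chunks are decoded with the block decoder, and the leftover sextets are
  // then assembled four at a time.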
  char *buffer_start = buffer;
  // Optimization note: if this is almost full, then it is worth our
  // time, otherwise, we should just decode directly.
  int last_block = (int)((bufferptr - buffer_start) % 64);
  if (last_block != 0 && srcend - src + last_block >= 64) {

    while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
      uint8_t val = to_base64[uint8_t(*src)];
      *bufferptr = char(val);
      if ((!scalar::base64::is_eight_byte(*src) || val > 64) &&
          !ignore_garbage) {
        return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
                size_t(dst - dstinit)};
      }
      bufferptr += (val <= 63);
      src++;
    }
  }

  for (; buffer_start + 64 <= bufferptr; buffer_start += 64) {
    if (dst >= end_of_safe_64byte_zone) {
      base64_decode_block_safe(dst, buffer_start);
    } else {
      base64_decode_block(dst, buffer_start);
    }
    dst += 48;
  }
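
  // Leftover sextets are packed big-endian: ((a << 18) + (b << 12) + (c << 6)
  // + d) << 8 places the 24 decoded bits in the upper bytes of a 32-bit word,
  // the byte swap reorders them for a little-endian store, and 4 (or, for the
  // final group, 3) bytes are copied out. For example, sextets 19, 22, 5, 46
  // ("TWFu") yield 0x4D616E, i.e. "Man".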
  if ((bufferptr - buffer_start) % 64 != 0) {
    while (buffer_start + 4 < bufferptr) {
      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
                        << 8;
      triple = scalar::u32_swap_bytes(triple);
      std::memcpy(dst, &triple, 4);

      dst += 3;
      buffer_start += 4;
    }
    if (buffer_start + 4 <= bufferptr) {
      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
                        << 8;
      triple = scalar::u32_swap_bytes(triple);
      std::memcpy(dst, &triple, 3);

      dst += 3;
      buffer_start += 4;
    }
    // we may have 1, 2 or 3 bytes left and we need to decode them so let us
    // backtrack
    int leftover = int(bufferptr - buffer_start);
    while (leftover > 0) {
      if (!ignore_garbage) {
        while (to_base64[uint8_t(*(src - 1))] == 64) {
          src--;
        }
      } else {
        while (to_base64[uint8_t(*(src - 1))] >= 64) {
          src--;
        }
      }
      src--;
      leftover--;
    }
  }
  if (src < srcend + equalsigns) {
    full_result r = scalar::base64::base64_tail_decode(
        dst, src, srcend - src, equalsigns, options, last_chunk_options);
    r.input_count += size_t(src - srcinit);
    if (r.error == error_code::INVALID_BASE64_CHARACTER ||
        r.error == error_code::BASE64_EXTRA_BITS) {
      return r;
    } else {
      r.output_count += size_t(dst - dstinit);
    }
    if (last_chunk_options != stop_before_partial &&
        r.error == error_code::SUCCESS && equalsigns > 0 && !ignore_garbage) {
      // additional checks
      if ((r.output_count % 3 == 0) ||
          ((r.output_count % 3) + 1 + equalsigns != 4)) {
        r.error = error_code::INVALID_BASE64_CHARACTER;
        r.input_count = equallocation;
      }
    }
    return r;
  }
  if (equalsigns > 0 && !ignore_garbage) {
    if ((size_t(dst - dstinit) % 3 == 0) ||
        ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
      return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)};
    }
  }
  return {SUCCESS, srclen, size_t(dst - dstinit)};
}
/* end file src/lasx/lasx_base64.cpp */
#endif // SIMDUTF_FEATURE_BASE64

} // namespace
} // namespace lasx
} // namespace simdutf

/* begin file src/generic/buf_block_reader.h */
namespace simdutf {
namespace lasx {
namespace {

// Walks through a buffer in block-sized increments, loading the last part with
// spaces
template <size_t STEP_SIZE> struct buf_block_reader {
public:
  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
  simdutf_really_inline size_t block_index();
  simdutf_really_inline bool has_full_block() const;
  simdutf_really_inline const uint8_t *full_block() const;
  /**
   * Get the last block, padded with spaces.
   *
   * There will always be a last block, with at least 1 byte, unless len == 0
   * (in which case this function fills the buffer with spaces and returns 0).
   * In particular, if len == STEP_SIZE there will be 0 full_blocks and 1
   * remainder block with STEP_SIZE bytes and no spaces for padding.
   *
   * @return the number of effective characters in the last block.
   */
  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
  simdutf_really_inline void advance();

private:
  const uint8_t *buf;
  const size_t len;
  const size_t lenminusstep;
  size_t idx;
};
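
// Typical use, mirroring generic_validate_utf8 further below
// (process_64_bytes stands in for whatever per-block work the caller does):
//   buf_block_reader<64> reader(input, length);
//   while (reader.has_full_block()) {
//     process_64_bytes(reader.full_block());
//     reader.advance();
//   }
//   uint8_t block[64]{};
//   reader.get_remainder(block); // last partial block, padded with spaces
//   process_64_bytes(block);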

// Routines to print masks and text for debugging bitmask operations
simdutf_unused static char *format_input_text_64(const uint8_t *text) {
  static char *buf =
      reinterpret_cast<char *>(malloc(sizeof(simd8x64<uint8_t>) + 1));
  for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
    buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
  }
  buf[sizeof(simd8x64<uint8_t>)] = '\0';
  return buf;
}

// Routines to print masks and text for debugging bitmask operations
simdutf_unused static char *format_input_text(const simd8x64<uint8_t> &in) {
  static char *buf =
      reinterpret_cast<char *>(malloc(sizeof(simd8x64<uint8_t>) + 1));
  in.store(reinterpret_cast<uint8_t *>(buf));
  for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
    if (buf[i] < ' ') {
      buf[i] = '_';
    }
  }
  buf[sizeof(simd8x64<uint8_t>)] = '\0';
  return buf;
}

simdutf_unused static char *format_mask(uint64_t mask) {
  static char *buf = reinterpret_cast<char *>(malloc(64 + 1));
  for (size_t i = 0; i < 64; i++) {
    buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
  }
  buf[64] = '\0';
  return buf;
}

template <size_t STEP_SIZE>
simdutf_really_inline
buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len)
    : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE},
      idx{0} {}

template <size_t STEP_SIZE>
simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() {
  return idx;
}

template <size_t STEP_SIZE>
simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
  return idx < lenminusstep;
}

template <size_t STEP_SIZE>
simdutf_really_inline const uint8_t *
buf_block_reader<STEP_SIZE>::full_block() const {
  return &buf[idx];
}

template <size_t STEP_SIZE>
simdutf_really_inline size_t
buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
  if (len == idx) {
    return 0;
  } // memcpy(dst, null, 0) will trigger an error with some sanitizers
  std::memset(dst, 0x20,
              STEP_SIZE); // std::memset STEP_SIZE because it is more efficient
                          // to write out 8 or 16 bytes at once.
  std::memcpy(dst, buf + idx, len - idx);
  return len - idx;
}

template <size_t STEP_SIZE>
simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
  idx += STEP_SIZE;
}

} // unnamed namespace
} // namespace lasx
} // namespace simdutf
/* end file src/generic/buf_block_reader.h */
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
namespace simdutf {
namespace lasx {
namespace {
namespace utf8_validation {

using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1.
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              CARRY | TOO_LARGE,
              // ____0101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____011_ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,

              // ____1___ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____1101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
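
// Worked example: for the byte pair 0xED 0xA0 (the start of an encoded UTF-16
// surrogate), byte_1_high[0xE] yields TOO_SHORT | OVERLONG_3 | SURROGATE,
// byte_1_low[0xD] yields CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, and
// byte_2_high[0xA] yields TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE |
// TOO_LARGE; the three-way AND leaves exactly the SURROGATE bit, flagging the
// sequence as invalid.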
simdutf_really_inline simd8<uint8_t>
check_multibyte_lengths(const simd8<uint8_t> input,
                        const simd8<uint8_t> prev_input,
                        const simd8<uint8_t> sc) {
  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
  simd8<uint8_t> must23 =
      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
  return must23_80 ^ sc;
}
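
// The XOR works because sc sets bit 7 (TWO_CONTS) on every byte that is the
// second of two consecutive continuation bytes, while must23_80 sets bit 7
// exactly where a 3- or 4-byte lead two or three positions back requires such
// a continuation. Where expectation and reality agree the bits cancel; any
// surviving bit is an error.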

//
// Return nonzero if there are incomplete multibyte characters at the end of the
// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end.
//
simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
  // If the previous input's last 3 bytes match this, they're too short (they
  // ended at EOF):
  // ... 1111____ 111_____ 11______
  static const uint8_t max_array[32] = {
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255, 255,
      0b11110000u - 1, 0b11100000u - 1, 0b11000000u - 1};
  const simd8<uint8_t> max_value(
      &max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]);
  return input.gt_bits(max_value);
}
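
// In other words: the last byte of a block is flagged if it is any lead byte
// (>= 0b11000000), the second-to-last if it starts a 3- or 4-byte sequence
// (>= 0b11100000), and the third-to-last if it starts a 4-byte sequence
// (>= 0b11110000); any such byte cannot have all of its continuation bytes
// within the block.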

struct utf8_checker {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;
  // The last input we received
  simd8<uint8_t> prev_input_block;
  // Whether the last input we received was incomplete (used for ASCII fast
  // path)
  simd8<uint8_t> prev_incomplete;

  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    simd8<uint8_t> sc = check_special_cases(input, prev1);
    this->error |= check_multibyte_lengths(input, prev_input, sc);
  }

  // The only problem that can happen at EOF is that a multibyte character is
  // too short or a byte value too large in the last bytes: check_special_cases
  // only checks for bytes too large in the first of two bytes.
  simdutf_really_inline void check_eof() {
    // If the previous block had incomplete UTF-8 characters at the end, an
    // ASCII block can't possibly finish them.
    this->error |= this->prev_incomplete;
  }

  simdutf_really_inline void check_next_input(const simd8x64<uint8_t> &input) {
    if (simdutf_likely(is_ascii(input))) {
      this->error |= this->prev_incomplete;
    } else {
      // you might think that a for-loop would work, but under Visual Studio,
      // it is not good enough.
      static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                        (simd8x64<uint8_t>::NUM_CHUNKS == 4),
                    "We support either two or four chunks per 64-byte block.");
      if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
      } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
        this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
      }
      this->prev_incomplete =
          is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]);
      this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1];
    }
  }

  // do not forget to call check_eof!
  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct utf8_checker
} // namespace utf8_validation

using utf8_validation::utf8_checker;

} // unnamed namespace
} // namespace lasx
} // namespace simdutf
/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
/* begin file src/generic/utf8_validation/utf8_validator.h */
namespace simdutf {
namespace lasx {
namespace {
namespace utf8_validation {

/**
 * Validates that the string is actual UTF-8.
 */
template <class checker>
bool generic_validate_utf8(const uint8_t *input, size_t length) {
  checker c{};
  buf_block_reader<64> reader(input, length);
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    c.check_next_input(in);
    reader.advance();
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  c.check_next_input(in);
  reader.advance();
  c.check_eof();
  return !c.errors();
}

bool generic_validate_utf8(const char *input, size_t length) {
  return generic_validate_utf8<utf8_checker>(
      reinterpret_cast<const uint8_t *>(input), length);
}

/**
 * Validates that the string is actual UTF-8 and stops on errors.
 */
template <class checker>
result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) {
  checker c{};
  buf_block_reader<64> reader(input, length);
  size_t count{0};
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    c.check_next_input(in);
    if (c.errors()) {
      if (count != 0) {
        count--;
      } // Sometimes the error is only detected in the next chunk
      result res = scalar::utf8::rewind_and_validate_with_errors(
          reinterpret_cast<const char *>(input),
          reinterpret_cast<const char *>(input + count), length - count);
      res.count += count;
      return res;
    }
    reader.advance();
    count += 64;
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  c.check_next_input(in);
  reader.advance();
  c.check_eof();
  if (c.errors()) {
    if (count != 0) {
      count--;
    } // Sometimes the error is only detected in the next chunk
    result res = scalar::utf8::rewind_and_validate_with_errors(
        reinterpret_cast<const char *>(input),
        reinterpret_cast<const char *>(input) + count, length - count);
    res.count += count;
    return res;
  } else {
    return result(error_code::SUCCESS, length);
  }
}

result generic_validate_utf8_with_errors(const char *input, size_t length) {
  return generic_validate_utf8_with_errors<utf8_checker>(
      reinterpret_cast<const uint8_t *>(input), length);
}

} // namespace utf8_validation
} // unnamed namespace
} // namespace lasx
} // namespace simdutf
/* end file src/generic/utf8_validation/utf8_validator.h */
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
#if SIMDUTF_FEATURE_ASCII
/* begin file src/generic/ascii_validation.h */
namespace simdutf {
namespace lasx {
namespace {
namespace ascii_validation {

bool generic_validate_ascii(const char *input, size_t length) {
  buf_block_reader<64> reader(reinterpret_cast<const uint8_t *>(input), length);
  uint8_t blocks[64]{};
  simd::simd8x64<uint8_t> running_or(blocks);
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    running_or |= in;
    reader.advance();
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  running_or |= in;
  return running_or.is_ascii();
}
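
// Rather than testing each block, the loop above ORs every block into
// running_or and performs a single is_ascii() check (a test of the high bit
// of every byte) at the end; any non-ASCII byte anywhere leaves a trace in
// the accumulator.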

result generic_validate_ascii_with_errors(const char *input, size_t length) {
  buf_block_reader<64> reader(reinterpret_cast<const uint8_t *>(input), length);
  size_t count{0};
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    if (!in.is_ascii()) {
      result res = scalar::ascii::validate_with_errors(
          reinterpret_cast<const char *>(input + count), length - count);
      return result(res.error, count + res.count);
    }
    reader.advance();

    count += 64;
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  if (!in.is_ascii()) {
    result res = scalar::ascii::validate_with_errors(
        reinterpret_cast<const char *>(input + count), length - count);
    return result(res.error, count + res.count);
  } else {
    return result(error_code::SUCCESS, length);
  }
}

} // namespace ascii_validation
} // unnamed namespace
} // namespace lasx
} // namespace simdutf
/* end file src/generic/ascii_validation.h */
#endif // SIMDUTF_FEATURE_ASCII

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
// transcoding from UTF-8 to Latin 1
/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */
namespace simdutf {
namespace lasx {
namespace {
namespace utf8_to_latin1 {
using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // For UTF-8 to Latin 1, we can allow any ASCII character, and any
  // continuation byte, but the non-ASCII leading bytes must be 0b11000011 or
  // 0b11000010 and nothing else.
  //
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
  constexpr const uint8_t FORBIDDEN = 0xff;

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      FORBIDDEN,
      // 1110____ ________ <three byte lead in byte 1>
      FORBIDDEN,
      // 1111____ ________ <four+ byte lead in byte 1>
      FORBIDDEN);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1.
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              FORBIDDEN,
              // ____0101 ________
              FORBIDDEN,
              // ____011_ ________
              FORBIDDEN, FORBIDDEN,

              // ____1___ ________
              FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN,
              // ____1101 ________
              FORBIDDEN, FORBIDDEN, FORBIDDEN);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
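
// Only code points U+0080 through U+00FF fit in Latin 1, and in UTF-8 those
// are exactly the two-byte sequences with lead byte 0xC2 or 0xC3. The tables
// above therefore return FORBIDDEN (every error bit set) for all 0xD_, 0xE_
// and 0xF_ leads and for 0xC4-0xCF, while 0xC0/0xC1 are still caught by the
// regular overlong-2-byte check.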

struct validating_transcoder {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;

  validating_transcoder() : error(uint8_t(0)) {}
  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    this->error |= check_special_cases(input, prev1);
  }

  simdutf_really_inline size_t convert(const char *in, size_t size,
                                       char *latin1_output) {
    size_t pos = 0;
    char *start{latin1_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 16 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 16; margin--) {
      leading_byte += (int8_t(in[margin - 1]) >
                       -65); // two's complement of -65 is 1011 1111 ...
    }
    // If the input is long enough, margin - 1 is now the position of the
    // sixteenth-to-last leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store((int8_t *)latin1_output);
        latin1_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask =
            input.lt(-65 + 1); // -64 is 1100 0000 in two's complement. Note: in
                               // this case, we also have ASCII to account for.
        if (utf8_continuation_mask & 1) {
          return 0; // error
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 and 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_latin1(
              in + pos, utf8_end_of_code_point_mask, latin1_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      return 0;
    }
    if (pos < size) {
      size_t howmany =
          scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output);
      if (howmany == 0) {
        return 0;
      }
      latin1_output += howmany;
    }
    return latin1_output - start;
  }

  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
                                                   char *latin1_output) {
    size_t pos = 0;
    char *start{latin1_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, margin - 1 is now the position of the
    // eighth-to-last leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store((int8_t *)latin1_output);
        latin1_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        if (errors()) {
          // rewind_and_convert_with_errors will seek a potential error from
          // in+pos onward, with the ability to go back up to pos bytes, and
          // read size-pos bytes forward.
          result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
              pos, in + pos, size - pos, latin1_output);
          res.count += pos;
          return res;
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 and 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_latin1(
              in + pos, utf8_end_of_code_point_mask, latin1_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      // rewind_and_convert_with_errors will seek a potential error from in+pos
      // onward, with the ability to go back up to pos bytes, and read size-pos
      // bytes forward.
      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, latin1_output);
      res.count += pos;
      return res;
    }
    if (pos < size) {
      // rewind_and_convert_with_errors will seek a potential error from in+pos
      // onward, with the ability to go back up to pos bytes, and read size-pos
      // bytes forward.
      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, latin1_output);
      if (res.error) { // In case of error, we want the error position
        res.count += pos;
        return res;
      } else { // In case of success, we want the number of words written
        latin1_output += res.count;
      }
    }
    return result(error_code::SUCCESS, latin1_output - start);
  }

  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct validating_transcoder
} // namespace utf8_to_latin1
} // unnamed namespace
} // namespace lasx
} // namespace simdutf
/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */
/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
namespace simdutf {
namespace lasx {
namespace {
namespace utf8_to_latin1 {
using namespace simd;

simdutf_really_inline size_t convert_valid(const char *in, size_t size,
                                           char *latin1_output) {
  size_t pos = 0;
  char *start{latin1_output};
  // In the worst case, we have the haswell kernel which can cause an overflow
  // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last
  // 16 bytes, and if the data is valid, then it is entirely safe because 16
  // UTF-8 bytes generate much more than 8 bytes. However, you cannot generally
  // assume that you have valid UTF-8 input, so we are going to go back from the
  // end counting 8 leading bytes, to give us a good margin.
  size_t leading_byte = 0;
  size_t margin = size;
  for (; margin > 0 && leading_byte < 8; margin--) {
    leading_byte += (int8_t(in[margin - 1]) >
                     -65); // two's complement of -65 is 1011 1111 ...
  }
  // If the input is long enough, margin - 1 is now the position of the
  // eighth-to-last leading byte.
  const size_t safety_margin = size - margin + 1; // to avoid overruns!
  while (pos + 64 + safety_margin <= size) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    if (input.is_ascii()) {
      input.store((int8_t *)latin1_output);
      latin1_output += 64;
      pos += 64;
    } else {
      // you might think that a for-loop would work, but under Visual Studio, it
      // is not good enough.
      uint64_t utf8_continuation_mask =
          input.lt(-65 + 1); // -64 is 1100 0000 in two's complement. Note: in
                             // this case, we also have ASCII to account for.
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
      // We process in blocks of up to 12 bytes except possibly
      // for fast paths which may process up to 16 bytes. For the
      // slow path to work, we should have at least 12 input bytes left.
      size_t max_starting_point = (pos + 64) - 12;
      // Next loop is going to run at least five times.
      while (pos < max_starting_point) {
        // Performance note: our ability to compute 'consumed' and
        // then shift and recompute is critical. If there is a
        // latency of, say, 4 cycles on getting 'consumed', then
        // the inner loop might have a total latency of about 6 cycles.
        // Yet we process between 6 and 12 input bytes, thus we get
        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
        // for this section of the code. Hence, there is a limit
        // to how much we can further increase this latency before
        // it seriously harms performance.
        size_t consumed = convert_masked_utf8_to_latin1(
            in + pos, utf8_end_of_code_point_mask, latin1_output);
        pos += consumed;
        utf8_end_of_code_point_mask >>= consumed;
      }
      // At this point there may remain between 0 and 12 bytes in the
      // 64-byte block. These bytes will be processed again. So we have an
      // 80% efficiency (in the worst case). In practice we expect an
      // 85% to 90% efficiency.
    }
  }
  if (pos < size) {
    size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos,
                                                           latin1_output);
    latin1_output += howmany;
  }
  return latin1_output - start;
}

} // namespace utf8_to_latin1
} // namespace
} // namespace lasx
} // namespace simdutf
/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
// transcoding from UTF-8 to UTF-16
/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
namespace simdutf {
namespace lasx {
namespace {
namespace utf8_to_utf16 {

using namespace simd;

template <endianness endian>
simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
                                         char16_t *utf16_output) noexcept {
  // The implementation is not specific to haswell and should be moved to the
  // generic directory.
  size_t pos = 0;
  char16_t *start{utf16_output};
  const size_t safety_margin = 16; // to avoid overruns!
  while (pos + 64 + safety_margin <= size) {
    // this loop could be unrolled further. For example, we could process the
    // mask far more than 64 bytes.
    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
    if (in.is_ascii()) {
      in.store_ascii_as_utf16<endian>(utf16_output);
      utf16_output += 64;
      pos += 64;
    } else {
      // Slow path. We hope that the compiler will recognize that this is a slow
      // path. Anything that is not a continuation byte is a 'leading byte',
      // that is, the start of a new code point.
      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
      // -65 is 0b10111111 in two's complement, so it is the largest possible
      // continuation byte
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      // The *start* of code points is not so useful, rather, we want the *end*
      // of code points.
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
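      // Example: for the bytes C3 A9 41 ("é" then "A"), only bit 1 (the A9
      // byte) is set in the continuation mask; the leading mask therefore has
      // bits 0 and 2 set, and shifting it right by one sets bit 1 of the
      // end-of-code-point mask: byte 1 ends a code point because byte 2
      // starts a new one.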
      // We process in blocks of up to 12 bytes except possibly
      // for fast paths which may process up to 16 bytes. For the
      // slow path to work, we should have at least 12 input bytes left.
      size_t max_starting_point = (pos + 64) - 12;
      // Next loop is going to run at least five times when using solely
      // the slow/regular path, and at least four times if there are fast paths.
      while (pos < max_starting_point) {
        // Performance note: our ability to compute 'consumed' and
        // then shift and recompute is critical. If there is a
        // latency of, say, 4 cycles on getting 'consumed', then
        // the inner loop might have a total latency of about 6 cycles.
        // Yet we process between 6 and 12 input bytes, thus we get
        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
        // for this section of the code. Hence, there is a limit
        // to how much we can further increase this latency before
        // it seriously harms performance.
        //
        // Thus we may allow convert_masked_utf8_to_utf16 to process
        // more bytes at a time under a fast-path mode where 16 bytes
        // are consumed at once (e.g., when encountering ASCII).
        size_t consumed = convert_masked_utf8_to_utf16<endian>(
            input + pos, utf8_end_of_code_point_mask, utf16_output);
        pos += consumed;
        utf8_end_of_code_point_mask >>= consumed;
      }
      // At this point there may remain between 0 and 12 bytes in the
      // 64-byte block. These bytes will be processed again. So we have an
      // 80% efficiency (in the worst case). In practice we expect an
      // 85% to 90% efficiency.
    }
  }
  utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(
      input + pos, size - pos, utf16_output);
  return utf16_output - start;
}

} // namespace utf8_to_utf16
} // unnamed namespace
} // namespace lasx
} // namespace simdutf
/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
namespace simdutf {
namespace lasx {
namespace {
namespace utf8_to_utf16 {
using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1.
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              CARRY | TOO_LARGE,
              // ____0101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____011_ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,

              // ____1___ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____1101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
simdutf_really_inline simd8<uint8_t>
check_multibyte_lengths(const simd8<uint8_t> input,
                        const simd8<uint8_t> prev_input,
                        const simd8<uint8_t> sc) {
  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
  simd8<uint8_t> must23 =
      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
  return must23_80 ^ sc;
}

struct validating_transcoder {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;

  validating_transcoder() : error(uint8_t(0)) {}
  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    simd8<uint8_t> sc = check_special_cases(input, prev1);
    this->error |= check_multibyte_lengths(input, prev_input, sc);
  }

  template <endianness endian>
  simdutf_really_inline size_t convert(const char *in, size_t size,
                                       char16_t *utf16_output) {
    size_t pos = 0;
    char16_t *start{utf16_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, margin - 1 is now the position of the
    // eighth-to-last leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf16<endian>(utf16_output);
        utf16_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (utf8_continuation_mask & 1) {
          return 0; // error
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 and 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf16<endian>(
              in + pos, utf8_end_of_code_point_mask, utf16_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      return 0;
    }
    if (pos < size) {
      size_t howmany = scalar::utf8_to_utf16::convert<endian>(
          in + pos, size - pos, utf16_output);
      if (howmany == 0) {
        return 0;
      }
      utf16_output += howmany;
    }
    return utf16_output - start;
  }
|
|
|
|
template <endianness endian>
|
|
simdutf_really_inline result convert_with_errors(const char *in, size_t size,
|
|
char16_t *utf16_output) {
|
|
size_t pos = 0;
|
|
char16_t *start{utf16_output};
|
|
// In the worst case, we have the haswell kernel which can cause an overflow
|
|
// of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the
|
|
// last 16 bytes, and if the data is valid, then it is entirely safe because
|
|
// 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
|
|
// generally assume that you have valid UTF-8 input, so we are going to go
|
|
// back from the end counting 8 leading bytes, to give us a good margin.
|
|
size_t leading_byte = 0;
|
|
size_t margin = size;
|
|
for (; margin > 0 && leading_byte < 8; margin--) {
|
|
leading_byte += (int8_t(in[margin - 1]) > -65);
|
|
}
|
|
// If the input is long enough, then we have that margin-1 is the eight last
|
|
// leading byte.
|
|
const size_t safety_margin = size - margin + 1; // to avoid overruns!
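    // Illustrative note (an observation added here, not in the original
    // source): the loop condition pos + 64 + safety_margin <= size means each
    // 64-byte block ends at or before the eighth-last leading byte, so at
    // least eight code points begin after the block and the masked conversion
    // below can read a little ahead without overrunning the buffer.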
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf16<endian>(utf16_output);
        utf16_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (errors() || (utf8_continuation_mask & 1)) {
          // rewind_and_convert_with_errors will seek a potential error from
          // in+pos onward, with the ability to go back up to pos bytes, and
          // read size-pos bytes forward.
          result res =
              scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
                  pos, in + pos, size - pos, utf16_output);
          res.count += pos;
          return res;
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 to 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf16<endian>(
              in + pos, utf8_end_of_code_point_mask, utf16_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      // rewind_and_convert_with_errors will seek a potential error from in+pos
      // onward, with the ability to go back up to pos bytes, and read size-pos
      // bytes forward.
      result res =
          scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
              pos, in + pos, size - pos, utf16_output);
      res.count += pos;
      return res;
    }
    if (pos < size) {
      // rewind_and_convert_with_errors will seek a potential error from in+pos
      // onward, with the ability to go back up to pos bytes, and read size-pos
      // bytes forward.
      result res =
          scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
              pos, in + pos, size - pos, utf16_output);
      if (res.error) { // In case of error, we want the error position
        res.count += pos;
        return res;
      } else { // In case of success, we want the number of words written
        utf16_output += res.count;
      }
    }
    return result(error_code::SUCCESS, utf16_output - start);
  }

  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct utf8_checker
} // namespace utf8_to_utf16
} // unnamed namespace
} // namespace lasx
} // namespace simdutf
/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
/* begin file src/generic/utf8/utf16_length_from_utf8_bytemask.h */
namespace simdutf {
namespace lasx {
namespace {
namespace utf8 {

using namespace simd;

simdutf_really_inline size_t utf16_length_from_utf8_bytemask(const char *in,
                                                             size_t size) {
  using vector_i8 = simd8<int8_t>;
  using vector_u8 = simd8<uint8_t>;
  using vector_u64 = simd64<uint64_t>;

  constexpr size_t N = vector_i8::SIZE;
  constexpr size_t max_iterations = 255 / 2;
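  // Illustrative bound (an added note, not in the original source): each lane
  // of `local` grows by at most 2 per iteration (one for a non-continuation
  // byte, one for a 4-byte lead), so flushing every 127 iterations keeps every
  // 8-bit counter at 254 or less; it can never wrap around.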

  auto counters = vector_u64::zero();
  auto local = vector_u8::zero();

  size_t iterations = 0;
  size_t pos = 0;
  size_t count = 0;
  for (; pos + N <= size; pos += N) {
    const auto input =
        vector_i8::load(reinterpret_cast<const int8_t *>(in + pos));

    // lanes that are not continuation bytes (lead bytes and ASCII)
    const auto not_continuation = input > int8_t(-65);
    const auto utf_4bytes = vector_u8(input.value) >= uint8_t(240);

    local -= vector_u8(not_continuation);
    local -= vector_u8(utf_4bytes);

    iterations += 1;
    if (iterations == max_iterations) {
      counters += sum_8bytes(local);
      local = vector_u8::zero();
      iterations = 0;
    }
  }

  if (iterations > 0) {
    count += local.sum_bytes();
  }

  count += counters.sum();

  return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
}

} // namespace utf8
} // unnamed namespace
} // namespace lasx
} // namespace simdutf
/* end file src/generic/utf8/utf16_length_from_utf8_bytemask.h */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
// transcoding from UTF-8 to UTF-32
/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
namespace simdutf {
namespace lasx {
namespace {
namespace utf8_to_utf32 {

using namespace simd;

simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
                                         char32_t *utf32_output) noexcept {
  size_t pos = 0;
  char32_t *start{utf32_output};
  const size_t safety_margin = 16; // to avoid overruns!
  while (pos + 64 + safety_margin <= size) {
    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
    if (in.is_ascii()) {
      in.store_ascii_as_utf32(utf32_output);
      utf32_output += 64;
      pos += 64;
    } else {
      // -65 is 0b10111111 in two's complement, so it is the largest possible
      // continuation byte
      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
      size_t max_starting_point = (pos + 64) - 12;
      while (pos < max_starting_point) {
        size_t consumed = convert_masked_utf8_to_utf32(
            input + pos, utf8_end_of_code_point_mask, utf32_output);
        pos += consumed;
        utf8_end_of_code_point_mask >>= consumed;
      }
    }
  }
  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos,
                                                       utf32_output);
  return utf32_output - start;
}

} // namespace utf8_to_utf32
} // unnamed namespace
} // namespace lasx
} // namespace simdutf
/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
namespace simdutf {
namespace lasx {
namespace {
namespace utf8_to_utf32 {
using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
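  // Worked example (illustrative, not in the original source): for the
  // overlong pair 0xE0 0x80, byte_1_high (high nibble 0xE) yields
  // TOO_SHORT | OVERLONG_3 | SURROGATE, byte_1_low (low nibble 0x0) yields
  // CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, and byte_2_high (second
  // byte 0x80) yields TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 |
  // TOO_LARGE_1000 | OVERLONG_4. Their intersection is exactly OVERLONG_3,
  // correctly flagging the overlong 3-byte encoding.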

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              CARRY | TOO_LARGE,
              // ____0101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____011_ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,

              // ____1___ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____1101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
simdutf_really_inline simd8<uint8_t>
check_multibyte_lengths(const simd8<uint8_t> input,
                        const simd8<uint8_t> prev_input,
                        const simd8<uint8_t> sc) {
  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
  simd8<uint8_t> must23 =
      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
  return must23_80 ^ sc;
}

struct validating_transcoder {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;

  validating_transcoder() : error(uint8_t(0)) {}
  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    simd8<uint8_t> sc = check_special_cases(input, prev1);
    this->error |= check_multibyte_lengths(input, prev_input, sc);
  }

  simdutf_really_inline size_t convert(const char *in, size_t size,
                                       char32_t *utf32_output) {
    size_t pos = 0;
    char32_t *start{utf32_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then we have that margin-1 is the eighth
    // last leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf32(utf32_output);
        utf32_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (utf8_continuation_mask & 1) {
          return 0; // we have an error
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 to 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf32(
              in + pos, utf8_end_of_code_point_mask, utf32_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      return 0;
    }
    if (pos < size) {
      size_t howmany =
          scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
      if (howmany == 0) {
        return 0;
      }
      utf32_output += howmany;
    }
    return utf32_output - start;
  }

  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
                                                   char32_t *utf32_output) {
    size_t pos = 0;
    char32_t *start{utf32_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then we have that margin-1 is the eighth
    // last leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf32(utf32_output);
        utf32_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (errors() || (utf8_continuation_mask & 1)) {
          result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
              pos, in + pos, size - pos, utf32_output);
          res.count += pos;
          return res;
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 to 12 input bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf32(
              in + pos, utf8_end_of_code_point_mask, utf32_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, utf32_output);
      res.count += pos;
      return res;
    }
    if (pos < size) {
      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, utf32_output);
      if (res.error) { // In case of error, we want the error position
        res.count += pos;
        return res;
      } else { // In case of success, we want the number of words written
        utf32_output += res.count;
      }
    }
    return result(error_code::SUCCESS, utf32_output - start);
  }

  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct utf8_checker
} // namespace utf8_to_utf32
} // unnamed namespace
} // namespace lasx
} // namespace simdutf
/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8
/* begin file src/generic/utf8.h */
namespace simdutf {
namespace lasx {
namespace {
namespace utf8 {

using namespace simd;

simdutf_really_inline size_t count_code_points(const char *in, size_t size) {
  size_t pos = 0;
  size_t count = 0;
  for (; pos + 64 <= size; pos += 64) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    // gt(-65) selects bytes that are not continuations, i.e., the bytes that
    // begin a new code point.
    uint64_t utf8_leading_mask = input.gt(-65);
    count += count_ones(utf8_leading_mask);
  }
  return count + scalar::utf8::count_code_points(in + pos, size - pos);
}

#ifdef SIMDUTF_SIMD_HAS_BYTEMASK
simdutf_really_inline size_t count_code_points_bytemask(const char *in,
                                                        size_t size) {
  using vector_i8 = simd8<int8_t>;
  using vector_u8 = simd8<uint8_t>;
  using vector_u64 = simd64<uint64_t>;

  constexpr size_t N = vector_i8::SIZE;
  constexpr size_t max_iterations = 255 / 4;
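  // Illustrative bound (an added note, not in the original source): each lane
  // of `local` grows by at most 4 per iteration (one per input vector), so
  // flushing every 63 iterations keeps every 8-bit counter at 252 or less,
  // safely below overflow.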

  size_t pos = 0;
  size_t count = 0;

  auto counters = vector_u64::zero();
  auto local = vector_u8::zero();
  size_t iterations = 0;
  for (; pos + 4 * N <= size; pos += 4 * N) {
    const auto input0 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 0 * N));
    const auto input1 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 1 * N));
    const auto input2 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 2 * N));
    const auto input3 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 3 * N));
    // each mask selects the lanes that start a code point (non-continuations)
    const auto mask0 = input0 > int8_t(-65);
    const auto mask1 = input1 > int8_t(-65);
    const auto mask2 = input2 > int8_t(-65);
    const auto mask3 = input3 > int8_t(-65);

    local -= vector_u8(mask0);
    local -= vector_u8(mask1);
    local -= vector_u8(mask2);
    local -= vector_u8(mask3);

    iterations += 1;
    if (iterations == max_iterations) {
      counters += sum_8bytes(local);
      local = vector_u8::zero();
      iterations = 0;
    }
  }

  if (iterations > 0) {
    count += local.sum_bytes();
  }

  count += counters.sum();

  return count + scalar::utf8::count_code_points(in + pos, size - pos);
}
#endif // SIMDUTF_SIMD_HAS_BYTEMASK

simdutf_really_inline size_t utf16_length_from_utf8(const char *in,
                                                    size_t size) {
  size_t pos = 0;
  size_t count = 0;
  // This algorithm could no doubt be improved!
  for (; pos + 64 <= size; pos += 64) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    uint64_t utf8_continuation_mask = input.lt(-65 + 1);
    // We count one word for anything that is not a continuation (so
    // leading bytes).
    count += 64 - count_ones(utf8_continuation_mask);
    int64_t utf8_4byte = input.gteq_unsigned(240);
    count += count_ones(utf8_4byte);
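    // Worked example (illustrative, not in the original source): U+1F600 is
    // encoded as F0 9F 98 80. Only 0xF0 is a non-continuation byte (+1) and it
    // is also >= 240 (+1), which yields the two UTF-16 code units of the
    // surrogate pair.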
  }
  return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
}

} // namespace utf8
} // unnamed namespace
} // namespace lasx
} // namespace simdutf
/* end file src/generic/utf8.h */
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF16
/* begin file src/generic/utf16/count_code_points_bytemask.h */
namespace simdutf {
namespace lasx {
namespace {
namespace utf16 {

using namespace simd;

template <endianness big_endian>
simdutf_really_inline size_t count_code_points(const char16_t *in,
                                               size_t size) {
  using vector_u16 = simd16<uint16_t>;
  constexpr size_t N = vector_u16::ELEMENTS;

  size_t pos = 0;
  size_t count = 0;

  constexpr size_t max_iterations = 65535;
  const auto one = vector_u16::splat(1);
  const auto zero = vector_u16::zero();

  size_t iteration = 0;

  auto counters = zero;
  for (; pos < size / N * N; pos += N) {
    auto input = vector_u16::load(in + pos);
    if (!match_system(big_endian)) {
      input = input.swap_bytes();
    }

    const auto t0 = input & uint16_t(0xfc00);
    const auto t1 = t0 ^ uint16_t(0xdc00);

    // t2[0] == 1 iff input[0] is outside the range 0xdc00..0xdfff (the word is
    // not a high surrogate)
    const auto t2 = min(t1, one);
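    // Worked example (illustrative, not in the original source): for 'A'
    // (0x0041), t0 = 0x0000 and t1 = 0xdc00, so t2 = 1; for the low surrogate
    // 0xD801, t1 = 0x0400 and t2 = 1; for the high surrogate 0xDC01, t1 =
    // 0x0000 and t2 = 0. Each surrogate pair is therefore counted exactly
    // once, via its low half.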

    counters += t2;

    iteration += 1;
    if (iteration == max_iterations) {
      count += counters.sum();
      counters = zero;
      iteration = 0;
    }
  }

  if (iteration > 0) {
    count += counters.sum();
  }

  return count +
         scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
}

} // namespace utf16
} // unnamed namespace
} // namespace lasx
} // namespace simdutf
/* end file src/generic/utf16/count_code_points_bytemask.h */
/* begin file src/generic/utf16/change_endianness.h */
namespace simdutf {
namespace lasx {
namespace {
namespace utf16 {

simdutf_really_inline void
change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) {
  size_t pos = 0;

  while (pos < size / 32 * 32) {
    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
    input.swap_bytes();
    input.store(reinterpret_cast<uint16_t *>(output));
    pos += 32;
    output += 32;
  }

  scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
}

} // namespace utf16
} // unnamed namespace
} // namespace lasx
} // namespace simdutf
/* end file src/generic/utf16/change_endianness.h */
/* begin file src/generic/utf16/utf8_length_from_utf16_bytemask.h */
namespace simdutf {
namespace lasx {
namespace {
namespace utf16 {

using namespace simd;

template <endianness big_endian>
simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in,
                                                             size_t size) {
  size_t pos = 0;

  using vector_u16 = simd16<uint16_t>;
  constexpr size_t N = vector_u16::ELEMENTS;

  const auto one = vector_u16::splat(1);

  auto v_count = vector_u16::zero();

  // each char16 yields at least one byte
  size_t count = size / N * N;

  // in a single iteration the increment is 0, 1 or 2, even though we perform
  // three additions
  constexpr size_t max_iterations = 65535 / 2;
  size_t iteration = max_iterations;

  for (; pos < size / N * N; pos += N) {
    auto input = vector_u16::load(reinterpret_cast<const uint16_t *>(in + pos));
    if (!match_system(big_endian)) {
      input = input.swap_bytes();
    }
    // 0xd800 .. 0xdbff - low surrogate
    // 0xdc00 .. 0xdfff - high surrogate
    const auto is_surrogate = ((input & uint16_t(0xf800)) == uint16_t(0xd800));

    // c0 - chars that yield 2- or 3-byte UTF-8 codes
    const auto c0 = min(input & uint16_t(0xff80), one);

    // c1 - chars that yield 3-byte UTF-8 codes (including surrogates)
    const auto c1 = min(input & uint16_t(0xf800), one);

    /*
        Explanation of how the counting works.

        In the case of a non-surrogate character we count:
        * always 1 -- see how `count` is initialized above;
        * c0 = 1 if the current char yields 2 or 3 bytes;
        * c1 = 1 if the current char yields 3 bytes.

        Thus, we always have the correct count for the current char:
        1, 2 or 3 bytes.

        A trickier part is how we count surrogate pairs. Whenever we
        encounter a surrogate (low or high), we count it as 3 bytes and then
        subtract 1 (`is_surrogate` is -1 or 0). Each surrogate char thus
        yields 2. A surrogate pair, that is a low surrogate followed by a
        high one, yields the expected 4 bytes.

        It also correctly handles cases when a low surrogate is processed by
        this loop, but the high surrogate is counted by the scalar procedure.
        The scalar procedure uses exactly the same approach, thanks to which
        it always counts correctly for valid UTF-16 strings.
    */
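    // Worked example (illustrative, not in the original source): U+1F600 is
    // the pair 0xD83D 0xDE00. Each half contributes 1 (base) + c0 (1) +
    // c1 (1) - 1 (is_surrogate) = 2, so the pair adds up to the expected 4
    // UTF-8 bytes.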
    v_count += c0;
    v_count += c1;
    v_count += vector_u16(is_surrogate);

    iteration -= 1;
    if (iteration == 0) {
      count += v_count.sum();
      v_count = vector_u16::zero();
      iteration = max_iterations;
    }
  }

  if (iteration > 0) {
    count += v_count.sum();
  }

  return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos,
                                                                   size - pos);
}

} // namespace utf16
} // unnamed namespace
} // namespace lasx
} // namespace simdutf
/* end file src/generic/utf16/utf8_length_from_utf16_bytemask.h */
/* begin file src/generic/utf16/utf32_length_from_utf16.h */
namespace simdutf {
namespace lasx {
namespace {
namespace utf16 {

template <endianness big_endian>
simdutf_really_inline size_t utf32_length_from_utf16(const char16_t *in,
                                                     size_t size) {
  return count_code_points<big_endian>(in, size);
}

} // namespace utf16
} // unnamed namespace
} // namespace lasx
} // namespace simdutf
/* end file src/generic/utf16/utf32_length_from_utf16.h */
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
/* begin file src/generic/validate_utf16.h */
namespace simdutf {
namespace lasx {
namespace {
namespace utf16 {
/*
    UTF-16 validation
    --------------------------------------------------

    In UTF-16, code units in the range 0xD800 to 0xDFFF have special meaning.

    In a vectorized algorithm we want to examine the most significant
    nibble in order to select a fast path. If none of the highest nibbles
    are 0xD (13), then we are sure that the UTF-16 chunk in a vector
    register is valid.

    Let us analyze what we need to check if the nibble is 0xD. The
    value of the following nibble determines what we have:

    0xd000 .. 0xd7ff - a valid word
    0xd800 .. 0xdbff - low surrogate
    0xdc00 .. 0xdfff - high surrogate

    Other constraints we have to consider:
    - there must not be two consecutive low surrogates (0xd800 .. 0xdbff)
    - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff)
    - there must not be a sole low surrogate nor a sole high surrogate

    We are going to build three bitmasks based on the 3rd nibble:
    - V = valid word,
    - L = low surrogate (0xd800 .. 0xdbff)
    - H = high surrogate (0xdc00 .. 0xdfff)

      0 1 2 3 4 5 6 7   <--- word index
    [ V | L | H | L | H | V | V | L ]
      1 0 0 0 0 1 1 0   - V = valid masks
      0 1 0 1 0 0 0 1   - L = low surrogate
      0 0 1 0 1 0 0 0   - H = high surrogate

      1 0 0 0 0 1 1 0   V = valid masks
      0 1 0 1 0 0 0 0   a = L & (H >> 1)
      0 0 1 0 1 0 0 0   b = a << 1
      1 1 1 1 1 1 1 0   c = V | a | b
                    ^
                    the last bit can be zero, we just consume 7
                    code units and recheck this word in the next iteration
*/
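// A minimal scalar sketch (illustrative only, not called anywhere in the
// library) of how the V/L/H bitmasks described above combine for one 16-bit
// mask triple: it returns how many code units may be consumed (16 or 15), or
// 0 when the block contains an invalid surrogate arrangement.
simdutf_really_inline int valid_utf16_prefix_from_masks(uint16_t V, uint16_t L,
                                                        uint16_t H) {
  const uint16_t a =
      static_cast<uint16_t>(L & (H >> 1)); // low surrogate followed by a high
  const uint16_t b = static_cast<uint16_t>(a << 1); // mark the high halves too
  const uint16_t c = static_cast<uint16_t>(V | a | b); // all accounted for?
  if (c == 0xffff) {
    return 16; // the whole register is valid
  }
  if (c == 0x7fff) {
    return 15; // the last unit is rechecked in the next iteration
  }
  return 0; // invalid surrogate arrangement
}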
template <endianness big_endian>
const result validate_utf16_with_errors(const char16_t *input, size_t size) {
  if (simdutf_unlikely(size == 0)) {
    return result(error_code::SUCCESS, 0);
  }

  const char16_t *start = input;
  const char16_t *end = input + size;

  const auto v_d8 = simd8<uint8_t>::splat(0xd8);
  const auto v_f8 = simd8<uint8_t>::splat(0xf8);
  const auto v_fc = simd8<uint8_t>::splat(0xfc);
  const auto v_dc = simd8<uint8_t>::splat(0xdc);

  while (input + simd16<uint16_t>::SIZE * 2 < end) {
    // 0. Load data: since the validation takes into account only the higher
    // byte of each word, we compress the two vectors into one which
    // consists only of the higher bytes.
    auto in0 = simd16<uint16_t>(input);
    auto in1 =
        simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));

    // Function `utf16_gather_high_bytes` consumes two vectors of UTF-16
    // and yields a single vector having only higher bytes of characters.
    const auto in = utf16_gather_high_bytes<big_endian>(in0, in1);

    // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
    const auto surrogates_wordmask = (in & v_f8) == v_d8;
    const uint16_t surrogates_bitmask =
        static_cast<uint16_t>(surrogates_wordmask.to_bitmask());
    if (surrogates_bitmask == 0x0000) {
      input += 16;
    } else {
      // 2. We have some surrogates that have to be distinguished:
      //    - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
      //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
      //
      // Fact: high surrogate has 11th bit set (3rd bit in the higher byte)

      // V - non-surrogate code units
      //     V = not surrogates_wordmask
      const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);

      // H - word-mask for high surrogates: the six highest bits are 0b1101'11
      const auto vH = (in & v_fc) == v_dc;
      const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());

      // L - word mask for low surrogates
      //     L = not H and surrogates_wordmask
      const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);

      const uint16_t a = static_cast<uint16_t>(
          L & (H >> 1)); // A low surrogate must be followed by a high one.
                         // (A low surrogate placed in the 7th register's word
                         // is an exception we handle.)
      const uint16_t b = static_cast<uint16_t>(
          a << 1); // Just mark that the opposite fact holds; thanks to that
                   // we have only two masks for the valid case.
      const uint16_t c = static_cast<uint16_t>(
          V | a | b); // Combine all the masks into the final one.

      if (c == 0xffff) {
        // The whole input register contains valid UTF-16, i.e.,
        // either single code units or proper surrogate pairs.
        input += 16;
      } else if (c == 0x7fff) {
        // The 15 lower code units of the input register contain valid UTF-16.
        // The 15th word may be either a low or high surrogate. In the next
        // iteration we 1) check if the low surrogate is followed by a high
        // one, 2) reject a sole high surrogate.
        input += 15;
      } else {
        return result(error_code::SURROGATE, input - start);
      }
    }
  }

  return result(error_code::SUCCESS, input - start);
}

} // namespace utf16
} // unnamed namespace
} // namespace lasx
} // namespace simdutf
/* end file src/generic/validate_utf16.h */
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF32
/* begin file src/generic/utf32.h */
#include <limits>

namespace simdutf {
namespace lasx {
namespace {
namespace utf32 {

template <typename T> T min(T a, T b) { return a <= b ? a : b; }

simdutf_really_inline size_t utf8_length_from_utf32(const char32_t *input,
                                                    size_t length) {
  using vector_u32 = simd32<uint32_t>;

  const char32_t *start = input;

  // we add up to three ones in a single iteration (see the vectorized loop in
  // section #2 below)
  const size_t max_increment = 3;
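  // Illustrative example (not in the original source): U+0041 needs 1 UTF-8
  // byte (0 increments), U+00E9 needs 2 (1 increment), U+4E2D needs 3 (2
  // increments) and U+1F600 needs 4 (3 increments); hence max_increment is 3.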

  const size_t N = vector_u32::ELEMENTS;

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
  const auto v_0000007f = vector_u32::splat(0x0000007f);
  const auto v_000007ff = vector_u32::splat(0x000007ff);
  const auto v_0000ffff = vector_u32::splat(0x0000ffff);
#else
  const auto v_ffffff80 = vector_u32::splat(0xffffff80);
  const auto v_fffff800 = vector_u32::splat(0xfffff800);
  const auto v_ffff0000 = vector_u32::splat(0xffff0000);
  const auto one = vector_u32::splat(1);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

  size_t counter = 0;

  // 1. vectorized loop unrolled 4 times
  {
    // we use a vector of uint32 counters, this is why this limit is used
    const size_t max_iterations =
        std::numeric_limits<uint32_t>::max() / (max_increment * 4);
    size_t blocks = length / (N * 4);
    length -= blocks * (N * 4);
    while (blocks != 0) {
      const size_t iterations = min(blocks, max_iterations);
      blocks -= iterations;

      simd32<uint32_t> acc = vector_u32::zero();
      for (size_t i = 0; i < iterations; i++) {
        const auto in0 = vector_u32(input + 0 * N);
        const auto in1 = vector_u32(input + 1 * N);
        const auto in2 = vector_u32(input + 2 * N);
        const auto in3 = vector_u32(input + 3 * N);

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
        acc -= as_vector_u32(in0 > v_0000007f);
        acc -= as_vector_u32(in1 > v_0000007f);
        acc -= as_vector_u32(in2 > v_0000007f);
        acc -= as_vector_u32(in3 > v_0000007f);

        acc -= as_vector_u32(in0 > v_000007ff);
        acc -= as_vector_u32(in1 > v_000007ff);
        acc -= as_vector_u32(in2 > v_000007ff);
        acc -= as_vector_u32(in3 > v_000007ff);

        acc -= as_vector_u32(in0 > v_0000ffff);
        acc -= as_vector_u32(in1 > v_0000ffff);
        acc -= as_vector_u32(in2 > v_0000ffff);
        acc -= as_vector_u32(in3 > v_0000ffff);
#else
        acc += min(one, in0 & v_ffffff80);
        acc += min(one, in1 & v_ffffff80);
        acc += min(one, in2 & v_ffffff80);
        acc += min(one, in3 & v_ffffff80);

        acc += min(one, in0 & v_fffff800);
        acc += min(one, in1 & v_fffff800);
        acc += min(one, in2 & v_fffff800);
        acc += min(one, in3 & v_fffff800);

        acc += min(one, in0 & v_ffff0000);
        acc += min(one, in1 & v_ffff0000);
        acc += min(one, in2 & v_ffff0000);
        acc += min(one, in3 & v_ffff0000);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

        input += 4 * N;
      }

      counter += acc.sum();
    }
  }

  // 2. vectorized loop for tail
  {
    const size_t max_iterations =
        std::numeric_limits<uint32_t>::max() / max_increment;
    size_t blocks = length / N;
    length -= blocks * N;
    while (blocks != 0) {
      const size_t iterations = min(blocks, max_iterations);
      blocks -= iterations;

      auto acc = vector_u32::zero();
      for (size_t i = 0; i < iterations; i++) {
        const auto in = vector_u32(input);

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
        acc -= as_vector_u32(in > v_0000007f);
        acc -= as_vector_u32(in > v_000007ff);
        acc -= as_vector_u32(in > v_0000ffff);
#else
        acc += min(one, in & v_ffffff80);
        acc += min(one, in & v_fffff800);
        acc += min(one, in & v_ffff0000);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

        input += N;
      }

      counter += acc.sum();
    }
  }

  const size_t consumed = input - start;
  if (consumed != 0) {
    // The vectorized loops above count only the extra bytes beyond the first
    // byte of each code point, which is why we add one per consumed unit here.
    counter += consumed;
  }

  return counter + scalar::utf32::utf8_length_from_utf32(input, length);
}

} // namespace utf32
} // unnamed namespace
} // namespace lasx
} // namespace simdutf
/* end file src/generic/utf32.h */
#endif // SIMDUTF_FEATURE_UTF32

//
// Implementation-specific overrides
//
namespace simdutf {
namespace lasx {

#if SIMDUTF_FEATURE_DETECT_ENCODING
simdutf_warn_unused int
implementation::detect_encodings(const char *input,
                                 size_t length) const noexcept {
  // If there is a BOM, then we trust it.
  auto bom_encoding = simdutf::BOM::check_bom(input, length);
  // todo: reimplement as a one-pass algorithm.
  if (bom_encoding != encoding_type::unspecified) {
    return bom_encoding;
  }
  int out = 0;
  if (validate_utf8(input, length)) {
    out |= encoding_type::UTF8;
  }
  if ((length % 2) == 0) {
    if (validate_utf16le(reinterpret_cast<const char16_t *>(input),
                         length / 2)) {
      out |= encoding_type::UTF16_LE;
    }
  }
  if ((length % 4) == 0) {
    if (validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4)) {
      out |= encoding_type::UTF32_LE;
    }
  }
  return out;
}
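// Usage sketch (illustrative, not in the original source): the two bytes
// { 0x68, 0x69 } ("hi") validate both as UTF-8 and as UTF-16LE (the single
// code unit 0x6968), so detect_encodings would report
// encoding_type::UTF8 | encoding_type::UTF16_LE; since the length is not a
// multiple of four, UTF-32LE is never considered.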
#endif // SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
simdutf_warn_unused bool
implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  return lasx::utf8_validation::generic_validate_utf8(buf, len);
}
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF8
simdutf_warn_unused result implementation::validate_utf8_with_errors(
    const char *buf, size_t len) const noexcept {
  return lasx::utf8_validation::generic_validate_utf8_with_errors(buf, len);
}
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_ASCII
simdutf_warn_unused bool
implementation::validate_ascii(const char *buf, size_t len) const noexcept {
  return lasx::ascii_validation::generic_validate_ascii(buf, len);
}

simdutf_warn_unused result implementation::validate_ascii_with_errors(
    const char *buf, size_t len) const noexcept {
  return lasx::ascii_validation::generic_validate_ascii_with_errors(buf, len);
}
#endif // SIMDUTF_FEATURE_ASCII

#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
simdutf_warn_unused bool
implementation::validate_utf16le(const char16_t *buf,
                                 size_t len) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    // Empty input is valid; this also protects the implementation from a
    // nullptr.
    return true;
  }
  const auto res =
      lasx::utf16::validate_utf16_with_errors<endianness::LITTLE>(buf, len);
  if (res.is_err()) {
    return false;
  }

  if (res.count != len) {
    return scalar::utf16::validate<endianness::LITTLE>(buf + res.count,
                                                       len - res.count);
  }

  return true;
}
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF16
simdutf_warn_unused bool
implementation::validate_utf16be(const char16_t *buf,
                                 size_t len) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    // Empty input is valid; this also protects the implementation from a
    // nullptr.
    return true;
  }

  const auto res =
      lasx::utf16::validate_utf16_with_errors<endianness::BIG>(buf, len);
  if (res.is_err()) {
    return false;
  }

  if (res.count != len) {
    return scalar::utf16::validate<endianness::BIG>(buf + res.count,
                                                    len - res.count);
  }

  return true;
}

simdutf_warn_unused result implementation::validate_utf16le_with_errors(
    const char16_t *buf, size_t len) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    return result(error_code::SUCCESS, 0);
  }
  const result res =
      lasx::utf16::validate_utf16_with_errors<endianness::LITTLE>(buf, len);
  if (res.count != len) {
    const result scalar_res =
        scalar::utf16::validate_with_errors<endianness::LITTLE>(
            buf + res.count, len - res.count);
    return result(scalar_res.error, res.count + scalar_res.count);
  } else {
    return res;
  }
}

simdutf_warn_unused result implementation::validate_utf16be_with_errors(
    const char16_t *buf, size_t len) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    return result(error_code::SUCCESS, 0);
  }
  const result res =
      lasx::utf16::validate_utf16_with_errors<endianness::BIG>(buf, len);
  if (res.count != len) {
    const result scalar_res =
        scalar::utf16::validate_with_errors<endianness::BIG>(buf + res.count,
                                                             len - res.count);
    return result(scalar_res.error, res.count + scalar_res.count);
  } else {
    return res;
  }
}

void implementation::to_well_formed_utf16le(const char16_t *input, size_t len,
                                            char16_t *output) const noexcept {
  return scalar::utf16::to_well_formed_utf16<endianness::LITTLE>(input, len,
                                                                 output);
}

void implementation::to_well_formed_utf16be(const char16_t *input, size_t len,
                                            char16_t *output) const noexcept {
  return scalar::utf16::to_well_formed_utf16<endianness::BIG>(input, len,
                                                              output);
}
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
simdutf_warn_unused bool
implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    // Empty input is valid; this also protects the implementation from a
    // nullptr.
    return true;
  }
  const char32_t *tail = lasx_validate_utf32le(buf, len);
  if (tail) {
    return scalar::utf32::validate(tail, len - (tail - buf));
  } else {
    return false;
  }
}
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING

#if SIMDUTF_FEATURE_UTF32
simdutf_warn_unused result implementation::validate_utf32_with_errors(
    const char32_t *buf, size_t len) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    return result(error_code::SUCCESS, 0);
  }
  result res = lasx_validate_utf32le_with_errors(buf, len);
  if (res.count != len) {
    result scalar_res =
        scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
    return result(scalar_res.error, res.count + scalar_res.count);
  } else {
    return res;
  }
}
#endif // SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
    const char *buf, size_t len, char *utf8_output) const noexcept {
  std::pair<const char *, char *> ret =
      lasx_convert_latin1_to_utf8(buf, len, utf8_output);
  size_t converted_chars = ret.second - utf8_output;

  if (ret.first != buf + len) {
    const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert(
        ret.first, len - (ret.first - buf), ret.second);
    converted_chars += scalar_converted_chars;
  }
  return converted_chars;
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
  std::pair<const char *, char16_t *> ret =
      lasx_convert_latin1_to_utf16le(buf, len, utf16_output);
  size_t converted_chars = ret.second - utf16_output;
  if (ret.first != buf + len) {
    const size_t scalar_converted_chars =
        scalar::latin1_to_utf16::convert<endianness::LITTLE>(
            ret.first, len - (ret.first - buf), ret.second);
    converted_chars += scalar_converted_chars;
  }
  return converted_chars;
}

simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
  std::pair<const char *, char16_t *> ret =
      lasx_convert_latin1_to_utf16be(buf, len, utf16_output);
  size_t converted_chars = ret.second - utf16_output;
  if (ret.first != buf + len) {
    const size_t scalar_converted_chars =
        scalar::latin1_to_utf16::convert<endianness::BIG>(
            ret.first, len - (ret.first - buf), ret.second);
    converted_chars += scalar_converted_chars;
  }
  return converted_chars;
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  std::pair<const char *, char32_t *> ret =
      lasx_convert_latin1_to_utf32(buf, len, utf32_output);
  size_t converted_chars = ret.second - utf32_output;
  if (ret.first != buf + len) {
    const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert(
        ret.first, len - (ret.first - buf), ret.second);
    converted_chars += scalar_converted_chars;
  }
  return converted_chars;
}
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  size_t pos = 0;
  char *output_start{latin1_output};
  // Performance degrades when the output address is not 32-byte aligned, so
  // we convert with scalar code until latin1_output reaches a 32-byte
  // boundary.
  while (((uint64_t)latin1_output & 0x1F) && pos < len) {
    if (buf[pos] & 0x80) {
      if (pos + 1 >= len)
        return 0;
      if ((buf[pos] & 0b11100000) == 0b11000000) {
        if ((buf[pos + 1] & 0b11000000) != 0b10000000)
          return 0;
        uint32_t code_point =
            (buf[pos] & 0b00011111) << 6 | (buf[pos + 1] & 0b00111111);
        if (code_point < 0x80 || 0xFF < code_point) {
          return 0;
        }
        *latin1_output++ = char(code_point);
        pos += 2;
      } else {
        return 0;
      }
    } else {
      *latin1_output++ = char(buf[pos]);
      pos++;
    }
  }
  size_t convert_size = latin1_output - output_start;
  if (pos == len)
    return convert_size;
  utf8_to_latin1::validating_transcoder converter;
  size_t convert_result =
      converter.convert(buf + pos, len - pos, latin1_output);
  return convert_result ? convert_size + convert_result : 0;
}

simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  size_t pos = 0;
  char *output_start{latin1_output};
  // Performance degrades when the output address is not 32-byte aligned, so
  // we convert with scalar code until latin1_output reaches a 32-byte
  // boundary.
  while (((uint64_t)latin1_output & 0x1F) && pos < len) {
    if (buf[pos] & 0x80) {
      if ((buf[pos] & 0b11100000) == 0b11000000) {
        if (pos + 1 >= len)
          return result(error_code::TOO_SHORT, pos);
        if ((buf[pos + 1] & 0b11000000) != 0b10000000)
          return result(error_code::TOO_SHORT, pos);
        uint32_t code_point =
            (buf[pos] & 0b00011111) << 6 | (buf[pos + 1] & 0b00111111);
        if (code_point < 0x80)
          return result(error_code::OVERLONG, pos);
        if (0xFF < code_point)
          return result(error_code::TOO_LARGE, pos);
        *latin1_output++ = char(code_point);
        pos += 2;
      } else if ((buf[pos] & 0b11110000) == 0b11100000) {
        return result(error_code::TOO_LARGE, pos);
      } else if ((buf[pos] & 0b11111000) == 0b11110000) {
        return result(error_code::TOO_LARGE, pos);
      } else {
        if ((buf[pos] & 0b11000000) == 0b10000000) {
          return result(error_code::TOO_LONG, pos);
        }
        return result(error_code::HEADER_BITS, pos);
      }
    } else {
      *latin1_output++ = char(buf[pos]);
      pos++;
    }
  }
  size_t convert_size = latin1_output - output_start;
  if (pos == len)
    return result(error_code::SUCCESS, convert_size);

  utf8_to_latin1::validating_transcoder converter;
  result res =
      converter.convert_with_errors(buf + pos, len - pos, latin1_output);
  return res.error ? result(res.error, res.count + pos)
                   : result(res.error, res.count + convert_size);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  size_t pos = 0;
  char *output_start{latin1_output};
  // Performance degrades when the output address is not 32-byte aligned, so
  // we convert with scalar code until latin1_output reaches a 32-byte
  // boundary.
  while (((uint64_t)latin1_output & 0x1F) && pos < len) {
    if (buf[pos] & 0x80) {
      if (pos + 1 >= len)
        break;
      if ((buf[pos] & 0b11100000) == 0b11000000) {
        if ((buf[pos + 1] & 0b11000000) != 0b10000000)
          return 0;
        uint32_t code_point =
            (buf[pos] & 0b00011111) << 6 | (buf[pos + 1] & 0b00111111);
        *latin1_output++ = char(code_point);
        pos += 2;
      } else {
        return 0;
      }
    } else {
      *latin1_output++ = char(buf[pos]);
      pos++;
    }
  }
  size_t convert_size = latin1_output - output_start;
  if (pos == len)
    return convert_size;

  size_t convert_result =
      lasx::utf8_to_latin1::convert_valid(buf + pos, len - pos, latin1_output);
  return convert_result ? convert_size + convert_result : 0;
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
|
|
simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
|
|
const char *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
utf8_to_utf16::validating_transcoder converter;
|
|
return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
|
|
const char *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
utf8_to_utf16::validating_transcoder converter;
|
|
return converter.convert<endianness::BIG>(buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
|
|
const char *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
utf8_to_utf16::validating_transcoder converter;
|
|
return converter.convert_with_errors<endianness::LITTLE>(buf, len,
|
|
utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
|
|
const char *buf, size_t len, char16_t *utf16_output) const noexcept {
|
|
utf8_to_utf16::validating_transcoder converter;
|
|
return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
|
|
const char *input, size_t size, char16_t *utf16_output) const noexcept {
|
|
return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size,
|
|
utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
|
|
const char *input, size_t size, char16_t *utf16_output) const noexcept {
|
|
return utf8_to_utf16::convert_valid<endianness::BIG>(input, size,
|
|
utf16_output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
|
|
|
|
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
|
|
simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
|
|
const char *buf, size_t len, char32_t *utf32_output) const noexcept {
|
|
utf8_to_utf32::validating_transcoder converter;
|
|
return converter.convert(buf, len, utf32_output);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
|
|
const char *buf, size_t len, char32_t *utf32_output) const noexcept {
|
|
utf8_to_utf32::validating_transcoder converter;
|
|
return converter.convert_with_errors(buf, len, utf32_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
|
|
const char *input, size_t size, char32_t *utf32_output) const noexcept {
|
|
return utf8_to_utf32::convert_valid(input, size, utf32_output);
|
|
}
|
|
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
|
|
|
|
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
|
|
simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
|
|
const char16_t *buf, size_t len, char *latin1_output) const noexcept {
|
|
std::pair<const char16_t *, char *> ret =
|
|
lasx_convert_utf16_to_latin1<endianness::LITTLE>(buf, len, latin1_output);
|
|
if (ret.first == nullptr) {
|
|
return 0;
|
|
}
|
|
size_t saved_bytes = ret.second - latin1_output;
|
|
|
|
if (ret.first != buf + len) {
|
|
const size_t scalar_saved_bytes =
|
|
scalar::utf16_to_latin1::convert<endianness::LITTLE>(
|
|
ret.first, len - (ret.first - buf), ret.second);
|
|
if (scalar_saved_bytes == 0) {
|
|
return 0;
|
|
}
|
|
saved_bytes += scalar_saved_bytes;
|
|
}
|
|
return saved_bytes;
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(
|
|
const char16_t *buf, size_t len, char *latin1_output) const noexcept {
|
|
std::pair<const char16_t *, char *> ret =
|
|
lasx_convert_utf16_to_latin1<endianness::BIG>(buf, len, latin1_output);
|
|
if (ret.first == nullptr) {
|
|
return 0;
|
|
}
|
|
size_t saved_bytes = ret.second - latin1_output;
|
|
|
|
if (ret.first != buf + len) {
|
|
const size_t scalar_saved_bytes =
|
|
scalar::utf16_to_latin1::convert<endianness::BIG>(
|
|
ret.first, len - (ret.first - buf), ret.second);
|
|
if (scalar_saved_bytes == 0) {
|
|
return 0;
|
|
}
|
|
saved_bytes += scalar_saved_bytes;
|
|
}
|
|
return saved_bytes;
|
|
}
|
|
|
|
simdutf_warn_unused result
implementation::convert_utf16le_to_latin1_with_errors(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<result, char *> ret =
      lasx_convert_utf16_to_latin1_with_errors<endianness::LITTLE>(
          buf, len, latin1_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly since scalar fallback already found correct
    // ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res =
        scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      latin1_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused result
implementation::convert_utf16be_to_latin1_with_errors(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<result, char *> ret =
      lasx_convert_utf16_to_latin1_with_errors<endianness::BIG>(buf, len,
                                                                latin1_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly since scalar fallback already found correct
    // ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res =
        scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      latin1_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
  // optimization opportunity: implement a custom function.
  return convert_utf16be_to_latin1(buf, len, latin1_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(
    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
  // optimization opportunity: implement a custom function.
  return convert_utf16le_to_latin1(buf, len, latin1_output);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  std::pair<const char16_t *, char *> ret =
      lasx_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf8_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf16_to_utf8::convert<endianness::LITTLE>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  std::pair<const char16_t *, char *> ret =
      lasx_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf8_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf16_to_utf8::convert<endianness::BIG>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char *> ret =
      lasx_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len,
                                                                 utf8_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly since scalar fallback already found correct
    // ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res =
        scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf8_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char *> ret =
      lasx_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len,
                                                              utf8_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly since scalar fallback already found correct
    // ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res =
        scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf8_output; // Set count to the number of 8-bit code units written
  return ret.first;
}
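// The *valid* variants reuse the checked converters; a dedicated kernel that
// skips validation could be faster (the same optimization opportunity noted
// for the Latin1 paths above).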
simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  return convert_utf16le_to_utf8(buf, len, utf8_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
  return convert_utf16be_to_utf8(buf, len, utf8_output);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    return 0;
  }
  std::pair<const char32_t *, char *> ret =
      lasx_convert_utf32_to_utf8(buf, len, utf8_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf8_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
        ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}
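// Note: no early return on error in the next function; if the kernel stopped
// before the end of the input, the scalar pass re-examines the unconsumed
// tail and reports the precise error position.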
simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    return result(error_code::SUCCESS, 0);
  }
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char *> ret =
      lasx_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
  if (ret.first.count != len) {
    result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
        buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf8_output; // Set count to the number of 8-bit code units written
  return ret.first;
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
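// A surrogate pair (a code point above U+FFFF) collapses to a single
// char32_t, so the output never exceeds len code units.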
simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  std::pair<const char16_t *, char32_t *> ret =
      lasx_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf32_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf16_to_utf32::convert<endianness::LITTLE>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  std::pair<const char16_t *, char32_t *> ret =
      lasx_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf32_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf16_to_utf32::convert<endianness::BIG>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char32_t *> ret =
      lasx_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len,
                                                                  utf32_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly since scalar fallback already found correct
    // ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res =
        scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf32_output; // Set count to the number of 32-bit code units written
  return ret.first;
}

simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char32_t *> ret =
      lasx_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len,
                                                               utf32_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly since scalar fallback already found correct
    // ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res =
        scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf32_output; // Set count to the number of 32-bit code units written
  return ret.first;
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<const char32_t *, char *> ret =
      lasx_convert_utf32_to_latin1(buf, len, latin1_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - latin1_output;

  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert(
        ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<result, char *> ret =
      lasx_convert_utf32_to_latin1_with_errors(buf, len, latin1_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly since scalar fallback already found correct
    // ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res = scalar::utf32_to_latin1::convert_with_errors(
        buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      latin1_output; // Set count to the number of 8-bit code units written
  return ret.first;
}
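// The input is assumed valid here, so the scalar tail cannot fail and no
// zero check is needed.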
simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<const char32_t *, char *> ret =
      lasx_convert_utf32_to_latin1(buf, len, latin1_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - latin1_output;

  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert_valid(
        ret.first, len - (ret.first - buf), ret.second);
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  // optimization opportunity: implement a custom function.
  return convert_utf32_to_utf8(buf, len, utf8_output);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  std::pair<const char32_t *, char16_t *> ret =
      lasx_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf16_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf32_to_utf16::convert<endianness::LITTLE>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }

  return saved_bytes;
}

simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  std::pair<const char32_t *, char16_t *> ret =
      lasx_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf16_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes =
        scalar::utf32_to_utf16::convert<endianness::BIG>(
            ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char16_t *> ret =
      lasx_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len,
                                                                  utf16_output);
  if (ret.first.count != len) {
    result scalar_res =
        scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf16_output; // Set count to the number of 16-bit code units written
  return ret.first;
}

simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char16_t *> ret =
      lasx_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len,
                                                               utf16_output);
  if (ret.first.count != len) {
    result scalar_res =
        scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
            buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf16_output; // Set count to the number of 16-bit code units written
  return ret.first;
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  return convert_utf32_to_utf16le(buf, len, utf16_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
  return convert_utf32_to_utf16be(buf, len, utf16_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  return convert_utf16le_to_utf32(buf, len, utf32_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
  return convert_utf16be_to_utf32(buf, len, utf32_output);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16
void implementation::change_endianness_utf16(const char16_t *input,
                                             size_t length,
                                             char16_t *output) const noexcept {
  utf16::change_endianness_utf16(input, length, output);
}
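// Counts code points, not code units: a surrogate pair counts once.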
simdutf_warn_unused size_t implementation::count_utf16le(
    const char16_t *input, size_t length) const noexcept {
  return utf16::count_code_points<endianness::LITTLE>(input, length);
}

simdutf_warn_unused size_t implementation::count_utf16be(
    const char16_t *input, size_t length) const noexcept {
  return utf16::count_code_points<endianness::BIG>(input, length);
}
#endif // SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8
simdutf_warn_unused size_t
implementation::count_utf8(const char *input, size_t length) const noexcept {
  size_t pos = 0;
  size_t count = 0;
  // Unaligned accesses degrade performance, so consume input byte by byte
  // until the address is 32-byte aligned.
  while ((((uint64_t)input + pos) & 0x1F && pos < length)) {
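    // A UTF-8 continuation byte has the form 0b10xxxxxx, i.e. a signed value
    // in [-128, -65]; anything greater starts a new code point.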
    if (input[pos++] > -65) {
      count++;
    }
  }
  __m256i v_bf = __lasx_xvldi(0xBF); // 0b10111111
  for (; pos + 32 <= length; pos += 32) {
    __m256i in = __lasx_xvld(reinterpret_cast<const int8_t *>(input + pos), 0);
    __m256i utf8_count =
        __lasx_xvpcnt_h(__lasx_xvmskltz_b(__lasx_xvslt_b(v_bf, in)));
    count = count + __lasx_xvpickve2gr_wu(utf8_count, 0) +
            __lasx_xvpickve2gr_wu(utf8_count, 4);
  }
  return count + scalar::utf8::count_code_points(input + pos, length - pos);
}
#endif // SIMDUTF_FEATURE_UTF8

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
    const char *buf, size_t len) const noexcept {
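  // Every code point representable in Latin-1 shrinks to exactly one byte,
  // so the Latin-1 length equals the code-point count.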
  return count_utf8(buf, len);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
    const char *input, size_t length) const noexcept {
  const uint8_t *data = reinterpret_cast<const uint8_t *>(input);
  const uint8_t *data_end = data + length;
  uint64_t result = 0;
  while (data_end - data > 16) {
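    // Bytes >= 0x80 grow to two UTF-8 bytes: vmskltz_b gathers the sign bit
    // of each of the 16 bytes and vpcnt_h counts how many are set.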
    uint64_t two_bytes = 0;
    __m128i input_vec = __lsx_vld(data, 0);
    two_bytes =
        __lsx_vpickve2gr_hu(__lsx_vpcnt_h(__lsx_vmskltz_b(input_vec)), 0);
    result += 16 + two_bytes;
    data += 16;
  }
  return result + scalar::latin1::utf8_length_from_latin1((const char *)data,
                                                          data_end - data);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
    const char16_t *input, size_t length) const noexcept {
  return utf16::utf8_length_from_utf16_bytemask<endianness::LITTLE>(input,
                                                                    length);
}

simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
    const char16_t *input, size_t length) const noexcept {
  return utf16::utf8_length_from_utf16_bytemask<endianness::BIG>(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
    const char16_t *input, size_t length) const noexcept {
  return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
}

simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
    const char16_t *input, size_t length) const noexcept {
  return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
    const char *input, size_t length) const noexcept {
  return utf8::utf16_length_from_utf8_bytemask(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
    const char32_t *input, size_t length) const noexcept {
  return utf32::utf8_length_from_utf32(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
    const char32_t *input, size_t length) const noexcept {
  __m128i v_ffff = lsx_splat_u32(0x0000ffff);
  size_t pos = 0;
  size_t count = 0;
  for (; pos + 4 <= length; pos += 4) {
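    // Code points above U+FFFF need a surrogate pair, i.e. one extra UTF-16
    // code unit each; vslt_wu flags them and vpcnt_b counts the flags.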
    __m128i in = __lsx_vld(reinterpret_cast<const uint32_t *>(input + pos), 0);
    __m128i surrogate_bytemask = __lsx_vslt_wu(v_ffff, in);
    size_t surrogate_count = __lsx_vpickve2gr_bu(
        __lsx_vpcnt_b(__lsx_vmskltz_w(surrogate_bytemask)), 0);
    count += 4 + surrogate_count;
  }
  return count +
         scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
}
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
    const char *input, size_t length) const noexcept {
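  // UTF-32 uses exactly one code unit per code point, so counting UTF-8
  // code points gives the answer directly.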
  return utf8::count_code_points(input, length);
}
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32

#if SIMDUTF_FEATURE_BASE64
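// The two template flags of compress_decode_base64 select the alphabet
// (standard vs. URL-safe) and whether non-base64 characters are ignored
// rather than rejected.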
simdutf_warn_unused result implementation::base64_to_binary(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true>(output, input, length, options,
                                                last_chunk_options);
    } else {
      return compress_decode_base64<true, false>(output, input, length,
                                                 options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true>(output, input, length,
                                                 options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false>(output, input, length,
                                                  options, last_chunk_options);
    }
  }
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true>(output, input, length, options,
                                                last_chunk_options);
    } else {
      return compress_decode_base64<true, false>(output, input, length,
                                                 options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true>(output, input, length,
                                                 options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false>(output, input, length,
                                                  options, last_chunk_options);
    }
  }
}

simdutf_warn_unused result implementation::base64_to_binary(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true>(output, input, length, options,
                                                last_chunk_options);
    } else {
      return compress_decode_base64<true, false>(output, input, length,
                                                 options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true>(output, input, length,
                                                 options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false>(output, input, length,
                                                  options, last_chunk_options);
    }
  }
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true>(output, input, length, options,
                                                last_chunk_options);
    } else {
      return compress_decode_base64<true, false>(output, input, length,
                                                 options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true>(output, input, length,
                                                 options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false>(output, input, length,
                                                  options, last_chunk_options);
    }
  }
}
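// Encoding cannot fail, so only the alphabet choice matters here.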
size_t implementation::binary_to_base64(const char *input, size_t length,
                                        char *output,
                                        base64_options options) const noexcept {
  if (options & base64_url) {
    return encode_base64<true>(output, input, length, options);
  } else {
    return encode_base64<false>(output, input, length, options);
  }
}
#endif // SIMDUTF_FEATURE_BASE64

} // namespace lasx
} // namespace simdutf

/* begin file src/simdutf/lasx/end.h */
#undef SIMDUTF_SIMD_HAS_UNSIGNED_CMP
/* end file src/simdutf/lasx/end.h */
/* end file src/lasx/implementation.cpp */
#endif

SIMDUTF_POP_DISABLE_WARNINGS
/* end file src/simdutf.cpp */