VentoyPlugson: Fix the garbled preview json for unicode characters.

2026-07-22 17:58:13 +00:00 · 2022-11-23 14:27:58 +08:00
parent 9da5a6ce65
commit 44ff3dd8d4
12 changed files with 421 additions and 12 deletions
--- a/Plugson/build.sh
+++ b/Plugson/build.sh
@@ -45,6 +45,7 @@ build_func() {
        src/Core/ventoy_json.c \
        src/Core/ventoy_log.c \
        src/Core/ventoy_md5.c \
+        src/Core/ventoy_utf.c \
        src/Core/ventoy_util.c \
        src/Core/ventoy_util_linux.c \
        src/Web/*.c \
--- a/Plugson/src/Core/ventoy_utf.c
+++ b/Plugson/src/Core/ventoy_utf.c
@@ -0,0 +1,367 @@
+/******************************************************************************
+ * ventoy_utf.c  ---- ventoy utf
+ * Copyright (c) 2022, Davipb https://github.com/Davipb/utf8-utf16-converter
+ * Copyright (c) 2022, longpanda <admin@ventoy.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 3 of the
+ * License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stddef.h>
+#include <ventoy_define.h>
+#include <ventoy_util.h>
+
+typedef uint8_t utf8_t; // The type of a single UTF-8 character
+typedef uint16_t utf16_t; // The type of a single UTF-16 character
+
+
+// The type of a single Unicode codepoint
+typedef uint32_t codepoint_t;
+
+// The last codepoint of the Basic Multilingual Plane, which is the part of Unicode that
+// UTF-16 can encode without surrogates
+#define BMP_END 0xFFFF
+
+// The highest valid Unicode codepoint
+#define UNICODE_MAX 0x10FFFF
+
+// The codepoint that is used to replace invalid encodings
+#define INVALID_CODEPOINT 0xFFFD
+
+// If a character, masked with GENERIC_SURROGATE_MASK, matches this value, it is a surrogate.
+#define GENERIC_SURROGATE_VALUE 0xD800
+// The mask to apply to a character before testing it against GENERIC_SURROGATE_VALUE
+#define GENERIC_SURROGATE_MASK 0xF800
+
+// If a character, masked with SURROGATE_MASK, matches this value, it is a high surrogate.
+#define HIGH_SURROGATE_VALUE 0xD800
+// If a character, masked with SURROGATE_MASK, matches this value, it is a low surrogate.
+#define LOW_SURROGATE_VALUE 0xDC00
+// The mask to apply to a character before testing it against HIGH_SURROGATE_VALUE or LOW_SURROGATE_VALUE
+#define SURROGATE_MASK 0xFC00
+
+// The value that is subtracted from a codepoint before encoding it in a surrogate pair
+#define SURROGATE_CODEPOINT_OFFSET 0x10000
+// A mask that can be applied to a surrogate to extract the codepoint value contained in it
+#define SURROGATE_CODEPOINT_MASK 0x03FF
+// The number of bits of SURROGATE_CODEPOINT_MASK
+#define SURROGATE_CODEPOINT_BITS 10
+
+
+// The highest codepoint that can be encoded with 1 byte in UTF-8
+#define UTF8_1_MAX 0x7F
+// The highest codepoint that can be encoded with 2 bytes in UTF-8
+#define UTF8_2_MAX 0x7FF
+// The highest codepoint that can be encoded with 3 bytes in UTF-8
+#define UTF8_3_MAX 0xFFFF
+// The highest codepoint that can be encoded with 4 bytes in UTF-8
+#define UTF8_4_MAX 0x10FFFF
+
+// If a character, masked with UTF8_CONTINUATION_MASK, matches this value, it is a UTF-8 continuation byte
+#define UTF8_CONTINUATION_VALUE 0x80
+// The mask to a apply to a character before testing it against UTF8_CONTINUATION_VALUE
+#define UTF8_CONTINUATION_MASK 0xC0
+// The number of bits of a codepoint that are contained in a UTF-8 continuation byte
+#define UTF8_CONTINUATION_CODEPOINT_BITS 6
+
+// Represents a UTF-8 bit pattern that can be set or verified
+typedef struct
+{
+    // The mask that should be applied to the character before testing it
+    utf8_t mask;
+    // The value that the character should be tested against after applying the mask
+    utf8_t value;
+} utf8_pattern;
+
+// The patterns for leading bytes of a UTF-8 codepoint encoding
+// Each pattern represents the leading byte for a character encoded with N UTF-8 bytes,
+// where N is the index + 1
+static const utf8_pattern utf8_leading_bytes[] =
+{
+    { 0x80, 0x00 }, // 0xxxxxxx
+    { 0xE0, 0xC0 }, // 110xxxxx
+    { 0xF0, 0xE0 }, // 1110xxxx
+    { 0xF8, 0xF0 }  // 11110xxx
+};
+
+// The number of elements in utf8_leading_bytes
+#define UTF8_LEADING_BYTES_LEN 4
+
+
+// Gets a codepoint from a UTF-16 string
+// utf16: The UTF-16 string
+// len: The length of the UTF-16 string, in UTF-16 characters
+// index:
+// A pointer to the current index on the string.
+// When the function returns, this will be left at the index of the last character
+// that composes the returned codepoint.
+// For surrogate pairs, this means the index will be left at the low surrogate.
+static codepoint_t decode_utf16(utf16_t const* utf16, size_t len, size_t* index)
+{
+    utf16_t high = utf16[*index];
+
+    // BMP character
+    if ((high & GENERIC_SURROGATE_MASK) != GENERIC_SURROGATE_VALUE)
+        return high; 
+
+    // Unmatched low surrogate, invalid
+    if ((high & SURROGATE_MASK) != HIGH_SURROGATE_VALUE)
+        return INVALID_CODEPOINT;
+
+    // String ended with an unmatched high surrogate, invalid
+    if (*index == len - 1)
+        return INVALID_CODEPOINT;
+    
+    utf16_t low = utf16[*index + 1];
+
+    // Unmatched high surrogate, invalid
+    if ((low & SURROGATE_MASK) != LOW_SURROGATE_VALUE)
+        return INVALID_CODEPOINT;
+
+    // Two correctly matched surrogates, increase index to indicate we've consumed
+    // two characters
+    (*index)++;
+
+    // The high bits of the codepoint are the value bits of the high surrogate
+    // The low bits of the codepoint are the value bits of the low surrogate
+    codepoint_t result = high & SURROGATE_CODEPOINT_MASK;
+    result <<= SURROGATE_CODEPOINT_BITS;
+    result |= low & SURROGATE_CODEPOINT_MASK;
+    result += SURROGATE_CODEPOINT_OFFSET;
+    
+    // And if all else fails, it's valid
+    return result;
+}
+
+// Calculates the number of UTF-8 characters it would take to encode a codepoint
+// The codepoint won't be checked for validity, that should be done beforehand.
+static int calculate_utf8_len(codepoint_t codepoint)
+{
+    // An array with the max values would be more elegant, but a bit too heavy
+    // for this common function
+
+    if (codepoint <= UTF8_1_MAX)
+        return 1;
+
+    if (codepoint <= UTF8_2_MAX)
+        return 2;
+
+    if (codepoint <= UTF8_3_MAX)
+        return 3;
+
+    return 4;
+}
+
+// Encodes a codepoint in a UTF-8 string.
+// The codepoint won't be checked for validity, that should be done beforehand.
+//
+// codepoint: The codepoint to be encoded.
+// utf8: The UTF-8 string
+// len: The length of the UTF-8 string, in UTF-8 characters
+// index: The first empty index on the string.
+//
+// return: The number of characters written to the string.
+static size_t encode_utf8(codepoint_t codepoint, utf8_t* utf8, size_t len, size_t index)
+{
+    int size = calculate_utf8_len(codepoint);
+
+    // Not enough space left on the string
+    if (index + size > len)
+        return 0;
+
+    // Write the continuation bytes in reverse order first
+    for (int cont_index = size - 1; cont_index > 0; cont_index--)
+    {
+        utf8_t cont = codepoint & ~UTF8_CONTINUATION_MASK;
+        cont |= UTF8_CONTINUATION_VALUE;
+
+        utf8[index + cont_index] = cont;
+        codepoint >>= UTF8_CONTINUATION_CODEPOINT_BITS;
+    }
+
+    // Write the leading byte
+    utf8_pattern pattern = utf8_leading_bytes[size - 1];
+
+    utf8_t lead = codepoint & ~(pattern.mask);
+    lead |= pattern.value;
+
+    utf8[index] = lead;
+
+    return size;
+}
+
+size_t utf16_to_utf8(utf16_t const* utf16, size_t utf16_len, utf8_t* utf8, size_t utf8_len)
+{
+    // The next codepoint that will be written in the UTF-8 string
+    // or the size of the required buffer if utf8 is NULL
+    size_t utf8_index = 0;
+
+    for (size_t utf16_index = 0; utf16_index < utf16_len; utf16_index++)
+    {
+        codepoint_t codepoint = decode_utf16(utf16, utf16_len, &utf16_index);
+
+        if (utf8 == NULL)
+            utf8_index += calculate_utf8_len(codepoint);
+        else
+            utf8_index += encode_utf8(codepoint, utf8, utf8_len, utf8_index);
+    }
+
+    return utf8_index;
+}
+
+// Gets a codepoint from a UTF-8 string
+// utf8: The UTF-8 string
+// len: The length of the UTF-8 string, in UTF-8 characters
+// index:
+// A pointer to the current index on the string.
+// When the function returns, this will be left at the index of the last character
+// that composes the returned codepoint.
+// For example, for a 3-byte codepoint, the index will be left at the third character.
+static codepoint_t decode_utf8(utf8_t const* utf8, size_t len, size_t* index)
+{
+    utf8_t leading = utf8[*index];
+
+    // The number of bytes that are used to encode the codepoint
+    int encoding_len = 0;
+    // The pattern of the leading byte
+    utf8_pattern leading_pattern;
+    // If the leading byte matches the current leading pattern
+    int matches = 0;
+    
+    do
+    {
+        encoding_len++;
+        leading_pattern = utf8_leading_bytes[encoding_len - 1];
+
+        matches = ((leading & leading_pattern.mask) == leading_pattern.value);
+
+    } while (!matches && encoding_len < UTF8_LEADING_BYTES_LEN);
+
+    // Leading byte doesn't match any known pattern, consider it invalid
+    if (!matches)
+        return INVALID_CODEPOINT;
+
+    codepoint_t codepoint = leading & ~leading_pattern.mask;
+
+    for (int i = 0; i < encoding_len - 1; i++)
+    {
+        // String ended before all continuation bytes were found
+        // Invalid encoding
+        if (*index + 1 >= len)
+            return INVALID_CODEPOINT;
+
+        utf8_t continuation = utf8[*index + 1];
+
+        // Number of continuation bytes not the same as advertised on the leading byte
+        // Invalid encoding
+        if ((continuation & UTF8_CONTINUATION_MASK) != UTF8_CONTINUATION_VALUE)
+            return INVALID_CODEPOINT;
+
+        codepoint <<= UTF8_CONTINUATION_CODEPOINT_BITS;
+        codepoint |= continuation & ~UTF8_CONTINUATION_MASK;
+
+        (*index)++;
+    }
+
+    int proper_len = calculate_utf8_len(codepoint);
+
+    // Overlong encoding: too many bytes were used to encode a short codepoint
+    // Invalid encoding
+    if (proper_len != encoding_len)
+        return INVALID_CODEPOINT;
+
+    // Surrogates are invalid Unicode codepoints, and should only be used in UTF-16
+    // Invalid encoding
+    if (codepoint < BMP_END && (codepoint & GENERIC_SURROGATE_MASK) == GENERIC_SURROGATE_VALUE)
+        return INVALID_CODEPOINT;
+
+    // UTF-8 can encode codepoints larger than the Unicode standard allows
+    // Invalid encoding
+    if (codepoint > UNICODE_MAX)
+        return INVALID_CODEPOINT;
+
+    return codepoint;
+}
+
+// Calculates the number of UTF-16 characters it would take to encode a codepoint
+// The codepoint won't be checked for validity, that should be done beforehand.
+static int calculate_utf16_len(codepoint_t codepoint)
+{
+    if (codepoint <= BMP_END)
+        return 1;
+
+    return 2;
+}
+
+// Encodes a codepoint in a UTF-16 string.
+// The codepoint won't be checked for validity, that should be done beforehand.
+//
+// codepoint: The codepoint to be encoded.
+// utf16: The UTF-16 string
+// len: The length of the UTF-16 string, in UTF-16 characters
+// index: The first empty index on the string.
+//
+// return: The number of characters written to the string.
+static size_t encode_utf16(codepoint_t codepoint, utf16_t* utf16, size_t len, size_t index)
+{
+    // Not enough space on the string
+    if (index >= len)
+        return 0;
+
+    if (codepoint <= BMP_END)
+    {
+        utf16[index] = codepoint;
+        return 1;
+    }
+
+    // Not enough space on the string for two surrogates
+    if (index + 1 >= len)
+        return 0;
+
+    codepoint -= SURROGATE_CODEPOINT_OFFSET;
+
+    utf16_t low = LOW_SURROGATE_VALUE;
+    low |= codepoint & SURROGATE_CODEPOINT_MASK;
+
+    codepoint >>= SURROGATE_CODEPOINT_BITS;
+
+    utf16_t high = HIGH_SURROGATE_VALUE;
+    high |= codepoint & SURROGATE_CODEPOINT_MASK;
+
+    utf16[index] = high;
+    utf16[index + 1] = low;
+
+    return 2;
+}
+
+size_t utf8_to_utf16(const unsigned char * utf8, size_t utf8_len, unsigned short* utf16, size_t utf16_len)
+{
+    // The next codepoint that will be written in the UTF-16 string
+    // or the size of the required buffer if utf16 is NULL
+    size_t utf16_index = 0;
+
+    for (size_t utf8_index = 0; utf8_index < utf8_len; utf8_index++)
+    {
+        codepoint_t codepoint = decode_utf8(utf8, utf8_len, &utf8_index);
+
+        if (utf16 == NULL)
+            utf16_index += calculate_utf16_len(codepoint);
+        else
+            utf16_index += encode_utf16(codepoint, utf16, utf16_len, utf16_index);
+    }
+
+    return utf16_index;
+}
--- a/Plugson/src/Core/ventoy_util.c
+++ b/Plugson/src/Core/ventoy_util.c
@@ -22,7 +22,6 @@
 #include <ventoy_define.h>
 #include <ventoy_util.h>

-
 static int g_tar_filenum = 0;
 static char *g_tar_buffer = NULL;
 static ventoy_file *g_tar_filelist = NULL;
--- a/Plugson/src/Core/ventoy_util.h
+++ b/Plugson/src/Core/ventoy_util.h
@@ -200,6 +200,7 @@ extern int g_unxz_len;
 void unxz_error(char *x);
 int unxz_flush(void *src, unsigned int size);
 char * ventoy_base64_encode(const char *data, int input_length, int *output_length);
+size_t utf8_to_utf16(const unsigned char * utf8, size_t utf8_len, unsigned short* utf16, size_t utf16_len);

 #endif /* __VENTOY_UTIL_H__ */

--- a/Plugson/src/Web/ventoy_http.c
+++ b/Plugson/src/Web/ventoy_http.c
@@ -3673,24 +3673,47 @@ static int ventoy_api_injection_del(struct mg_connection *conn, VTOY_JSON *json)

 static int ventoy_api_preview_json(struct mg_connection *conn, VTOY_JSON *json)
 {
+    int i = 0;
    int pos = 0;
    int len = 0;
-    int encodelen = 0;
+    int utf16enclen = 0;
    char *encodebuf = NULL;
+    unsigned short *utf16buf = NULL;
    
    (void)json;

-    len = ventoy_data_real_save_all(0);
-    encodebuf = ventoy_base64_encode(JSON_SAVE_BUFFER, len, &encodelen);
-    encodebuf[encodelen] = 0;
+    /* We can not use json directly, because it will be formated in the JS. */

+    len = ventoy_data_real_save_all(0);
+
+    utf16buf = (unsigned short *)malloc(2 * len + 16);    
+    if (!utf16buf)
+    {
+        goto json;
+    }
+
+    utf16enclen = utf8_to_utf16((unsigned char *)JSON_SAVE_BUFFER, len, utf16buf, len + 2);
+
+    encodebuf = (char *)malloc(utf16enclen * 4 + 16);
+    if (!encodebuf)
+    {
+        goto json;
+    }
+
+    for (i = 0; i < utf16enclen; i++)
+    {
+        scnprintf(encodebuf + i * 4, 5, "%04X", utf16buf[i]);
+    }
+
+json:
    VTOY_JSON_FMT_BEGIN(pos, JSON_BUFFER, JSON_BUF_MAX);
    VTOY_JSON_FMT_OBJ_BEGIN();
-    VTOY_JSON_FMT_STRN("json", encodebuf);
+    VTOY_JSON_FMT_STRN("json", (encodebuf ? encodebuf : ""));
    VTOY_JSON_FMT_OBJ_END();
    VTOY_JSON_FMT_END(pos);

-    free(encodebuf);
+    CHECK_FREE(encodebuf);
+    CHECK_FREE(utf16buf);

    ventoy_json_buffer(conn, JSON_BUFFER, pos);
    return 0;
@@ -3983,6 +4006,11 @@ static int ventoy_parse_control(VTOY_JSON *json, void *p)
        if (node->enDataType == JSON_TYPE_OBJECT)
        {
            child = node->pstChild;
+            
+            if (child->enDataType != JSON_TYPE_STRING)
+            {
+                continue;
+            }

            if (strcmp(child->pcName, "VTOY_DEFAULT_MENU_MODE") == 0)
            {
--- a/Plugson/vs/VentoyPlugson/Release/VentoyPlugson.exe
+++ b/Plugson/vs/VentoyPlugson/Release/VentoyPlugson.exe
--- a/Plugson/vs/VentoyPlugson/VentoyPlugson/VentoyPlugson.vcxproj
+++ b/Plugson/vs/VentoyPlugson/VentoyPlugson/VentoyPlugson.vcxproj
@@ -99,6 +99,7 @@
    <ClCompile Include="..\..\..\src\Core\ventoy_json.c" />
    <ClCompile Include="..\..\..\src\Core\ventoy_log.c" />
    <ClCompile Include="..\..\..\src\Core\ventoy_md5.c" />
+    <ClCompile Include="..\..\..\src\Core\ventoy_utf.c" />
    <ClCompile Include="..\..\..\src\Core\ventoy_util.c" />
    <ClCompile Include="..\..\..\src\Core\ventoy_util_windows.c" />
    <ClCompile Include="..\..\..\src\Lib\fat_io_lib\fat_access.c" />
--- a/Plugson/vs/VentoyPlugson/VentoyPlugson/VentoyPlugson.vcxproj.filters
+++ b/Plugson/vs/VentoyPlugson/VentoyPlugson/VentoyPlugson.vcxproj.filters
@@ -75,6 +75,9 @@
    <ClCompile Include="..\..\..\src\main_windows.c">
      <Filter>源文件</Filter>
    </ClCompile>
+    <ClCompile Include="..\..\..\src\Core\ventoy_utf.c">
+      <Filter>源文件</Filter>
+    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="..\..\..\src\Core\ventoy_define.h">
--- a/Plugson/www/buildtime
+++ b/Plugson/www/buildtime
@@ -1 +1 @@
-20221021 14:42:35
+20221117 18:12:12
--- a/Plugson/www/index.html
+++ b/Plugson/www/index.html
@@ -735,7 +735,7 @@
                            
                        </div>
                        <div class="modal-body">
-                            <textarea id="pre_json_preview" class="form-control" rows="30" style="font-family:Menlo,Monaco,Consolas,'Courier New',monospace"></textarea>
+                            <textarea id="pre_json_preview" class="form-control" rows="30" style="font-family:Menlo,Monaco,Consolas,'Courier New',monospace" spellcheck="false"></textarea>
                        </div>
                        <div class="modal-footer">
                            <button id="PrewForm_lang_2" type="button" class="btn btn-primary btn-flat" data-dismiss="modal">确定</button>
@@ -757,7 +757,7 @@

        <footer class="main-footer">
            <div class="pull-right hidden-xs">
-                <b id="plugson_build_date">20221021 14:42:35</b>
+                <b id="plugson_build_date">20221117 18:12:12</b>
            </div>
            <strong><a href="https://www.ventoy.net" target="_blank">https://www.ventoy.net</a></strong>
        </footer>
@@ -819,7 +819,7 @@
            $('#JsonPeviewForm #JsonPreviewForm_lang_1').text(g_vtoy_cur_language.STR_JSON_PREVIEW);
            $('#JsonPeviewForm #PrewForm_lang_2').text(g_vtoy_cur_language.STR_BTN_OK);
            
-            $('#pre_json_preview').text(atob(data.json));
+            $('#pre_json_preview').text(VtoyUTF16HexToAscii(data.json));            
            $("#JsonPreviewModal").modal();
        });
    }
--- a/Plugson/www/static/js/vtoy.js
+++ b/Plugson/www/static/js/vtoy.js
@@ -1,4 +1,13 @@

+function VtoyUTF16HexToAscii(hex) {
+    var str = "";        
+    for (var i = 0; i < hex.length; i += 4) {
+        str += String.fromCharCode(parseInt(hex.substring(i, i + 4), 16));
+    }
+    
+    return str;
+}
+
 function ventoy_replace_slash(str) {
    var str1 = str.replace(/\\/g, '/');
    var str2 = str1.replace(/\/\//g, '/');