/**********************************************************************************************/
/* The MIT License */
/* */
/* Copyright 2016-2017 Twitch Interactive, Inc. or its affiliates. All Rights Reserved. */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining a copy */
/* of this software and associated documentation files (the "Software"), to deal */
/* in the Software without restriction, including without limitation the rights */
/* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell */
/* copies of the Software, and to permit persons to whom the Software is */
/* furnished to do so, subject to the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be included in */
/* all copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR */
/* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, */
/* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE */
/* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */
/* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, */
/* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN */
/* THE SOFTWARE. */
/**********************************************************************************************/
#include "utf8.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// returns a pointer to the character that follows c
// returns NULL when c is NULL, when c points at the null terminator,
// or when c does not start a valid character (length of zero)
const utf8_char_t * utf8_char_next (const utf8_char_t * c)
{
    size_t length = utf8_char_length (c);

    // test the length first so no pointer arithmetic is done on a NULL pointer
    return 0 == length ? NULL : c + length;
}

// returns the length of the char in bytes, derived from the lead byte only
// returns 0 for NULL, for the null terminator and for bytes that can not start a character
size_t utf8_char_length (const utf8_char_t * c)
{
    // indexed by the top five bits of the lead byte:
    //   0xxxx -> 1 byte
    //   10xxx -> 0 (continuation byte, not a valid start)
    //   110xx -> 2 bytes, 1110x -> 3 bytes, 11110 -> 4 bytes
    //   11111 -> 0 (invalid)
    static const size_t _utf8_char_length[] = {
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0
    };

    // count null term as zero size
    if (!c || 0x00 == c[0]) {
        return 0;
    }

    // shift an unsigned value: right shifting a negative char is implementation defined
    return _utf8_char_length[((unsigned char) c[0] >> 3) & 0x1F];
}

// returns 1 if c is NULL or points at a whitespace character, 0 otherwise
// every byte from 0x00 (the null terminator) through ' ' counts as whitespace, as does DEL
int utf8_char_whitespace (const utf8_char_t * c)
{
    unsigned char lead;

    if (!c) {
        return 1;
    }

    // compare as unsigned so the result does not depend on the signedness of char
    lead = (unsigned char) c[0];

    // 0x7F is DEL
    if (lead <= ' ' || 0x7F == lead) {
        return 1;
    }

    // EIA608_CHAR_NO_BREAK_SPACE TODO other utf8 spaces
    // c[1] is safe to read here: c[0] is not the terminator
    if (0xC2 == lead && 0xA0 == (unsigned char) c[1]) {
        return 1;
    }

    return 0;
}

// returns length of the string in bytes
// size is number of characters to count (0 to count until NULL term)
// counting stops early at the terminator or at a byte that can not start a character
size_t utf8_string_length (const utf8_char_t * data, utf8_size_t size)
{
    size_t char_length, bytes = 0;

    if (0 == size) {
        size = utf8_char_count (data, 0);
    }

    for (; 0 < size; --size) {
        if (0 == (char_length = utf8_char_length (data))) {
            break;
        }

        data += char_length;
        bytes += char_length;
    }

    return bytes;
}

// copies the single character at src to dst and null terminates dst
// dst must have room for the character plus the terminator
// returns the number of bytes in the character (terminator not included)
// nothing is written when dst is NULL or the character length is zero
size_t utf8_char_copy (utf8_char_t * dst, const utf8_char_t * src)
{
    size_t bytes = utf8_char_length (src);

    if (bytes && dst) {
        memcpy (dst, src, bytes);
        dst[bytes] = '\0';
    }

    return bytes;
}

// returns the number of utf8 characters in a string given the number of bytes
// to count until the null terminator, pass 0 for size
// returns 0 when data is NULL
utf8_size_t utf8_char_count (const char *data, size_t size)
{
    size_t i, bytes = 0;
    utf8_size_t count = 0;

    if (!data) {
        return 0;
    }

    if (0 == size) {
        size = strlen (data);
    }

    for (i = 0; i < size; ++count, i += bytes) {
        if (0 == (bytes = utf8_char_length (&data[i]))) {
            break;
        }
    }

    return count;
}

// returns the length of the line in bytes, trimming whitespace characters at the end
// at most charcters characters are examined
size_t utf8_trimmed_length (const utf8_char_t * data, utf8_size_t charcters)
{
    size_t l, t = 0, split_at = 0;

    for (size_t c = 0; data && (*data) && c < charcters; ++c) {
        l = utf8_char_length (data);

        if (!utf8_char_whitespace (data)) {
            split_at = t + l;
        }

        // a byte that can not start a character does not advance the cursor;
        // every further iteration would recompute the same values, so stop here
        if (0 == l) {
            break;
        }

        t += l, data += l;
    }

    return split_at;
}

// returns the length in bytes of the line ending found at data, or 0 if there is none
size_t _utf8_newline (const utf8_char_t * data)
{
    if ('\r' == data[0]) {
        return '\n' == data[1] ? 2 : 1; // windows/unix
    } else if ('\n' == data[0]) {
        return '\r' == data[1] ? 2 : 1; // riscos/macos
    } else {
        return 0;
    }
}

// returns the length in bytes of the line including the new line character(s)
// auto detects between windows(CRLF), unix(LF), mac(CR) and riscos (LFCR) line endings
// when no line ending is found, returns the number of bytes up to the null terminator
size_t utf8_line_length (const utf8_char_t * data)
{
    size_t n, len = 0;

    // scan byte by byte using len as the only cursor
    // advancing data as well would make data[len] skip bytes and run past the terminator
    for (len = 0; 0 != data[len]; ++len) {
        if (0 < (n = _utf8_newline (&data[len]))) {
            return len + n;
        }
    }

    return len;
}

// returns number of chars to include before split
// returns early with the character count when a line ending is found
// otherwise returns the position of the last whitespace character seen within the first
// size + 1 characters; the null terminator counts as whitespace and does not advance
// the cursor, so a string shorter than size yields size
utf8_size_t utf8_wrap_length (const utf8_char_t * data, utf8_size_t size)
{
    // Set split_at to size, so if a split point can not be found, returns the size passed in
    size_t char_length, char_count, split_at = size;

    for (char_count = 0; char_count <= size; ++char_count) {
        if (_utf8_newline (data)) {
            return char_count;
        } else if (utf8_char_whitespace (data)) {
            split_at = char_count;
        }

        char_length = utf8_char_length (data);
        data += char_length;
    }

    return split_at;
}

// returns the number of lines in data
// a final line without a line ending is counted as a line
int utf8_line_count (const utf8_char_t * data)
{
    size_t len = 0;
    int count = 0;

    do {
        len = utf8_line_length (data);
        data += len;
        ++count;
    } while (0 < len);

    // the last iteration saw a zero length line (the terminator) and must not be counted
    return count - 1;
}

// loads a whole text file into a newly allocated, null terminated buffer
// on input (*size) is the largest file size to accept, 0 for no limit
// on output (*size) is the number of bytes read
// returns NULL if path or size is NULL, the file can not be opened or measured,
// the file is larger than the limit, or the allocation fails
// (*size) is left untouched unless an allocation was attempted
// the caller must free() the returned buffer
utf8_char_t * utf8_load_text_file (const char *path, size_t * size)
{
    utf8_char_t *data = NULL;
    FILE *file;
    long end;
    size_t file_size;

    if (!path || !size) {
        return NULL;
    }

    file = fopen (path, "r");

    if (!file) {
        return NULL;
    }

    // ftell reports failure as a negative value; do not let it turn into a huge size_t
    if (0 != fseek (file, 0, SEEK_END) || 0 > (end = ftell (file)) || 0 != fseek (file, 0, SEEK_SET)) {
        fclose (file);
        return NULL;
    }

    file_size = (size_t) end;

    if (0 == (*size) || file_size <= (*size)) {
        (*size) = 0;
        // zero filled, with one extra byte so the buffer is always null terminated
        data = calloc (1 + file_size, 1);

        if (data) {
            size_t bytes_read = 0;

            while ((*size) < file_size && 0 < (bytes_read = fread (data + (*size), 1, file_size - (*size), file))) {
                (*size) += bytes_read;
            }

            data[*size] = '\0';
        }
    }

    // close on every path, including when the file was too large to load
    fclose (file);
    return data;
}

#ifndef strnstr
// locates the first occurrence of string2 within the first len bytes of string1
// bytes after the null terminator of string1 are not searched
// returns string1 when string2 is empty, NULL when string2 is not found
char * strnstr (const char *string1, const char *string2, size_t len)
{
    size_t length2 = strlen (string2);

    if (!length2) {
        return (char *) string1;
    }

    while (len >= length2 && '\0' != *string1) {
        // strncmp stops at the terminator of string1, so a short string1 is never over-read
        if (0 == strncmp (string1, string2, length2)) {
            return (char *) string1;
        }

        ++string1;
        --len;
    }

    return NULL;
}
#endif