Код (Text):
// unicode.c #include <stdio.h> #include <string.h> #include <stdint.h> #include <stdlib.h> #include <io.h> #include "unicode.h" int isUTF8(const char *data, size_t size) { const uint8_t *str = (uint8_t*)data; const uint8_t *end = str + size; uint8_t byte; uint32_t code_length, i; uint32_t ch; while (str != end) { byte = *str; if (byte <= 0x7F) { /* 1 byte sequence: U+0000..U+007F */ str += 1; continue; } if (0xC2 <= byte && byte <= 0xDF) /* 0b110xxxxx: 2 bytes sequence */ code_length = 2; else if (0xE0 <= byte && byte <= 0xEF) /* 0b1110xxxx: 3 bytes sequence */ code_length = 3; else if (0xF0 <= byte && byte <= 0xF4) /* 0b11110xxx: 4 bytes sequence */ code_length = 4; else { /* invalid first byte of a multibyte character */ return 0; } if (str + (code_length - 1) >= end) { /* truncated string or invalid byte sequence */ return 0; } /* Check continuation bytes: bit 7 should be set, bit 6 should be * unset (b10xxxxxx). */ for (i=1; i < code_length; i++) { if ((str[i] & 0xC0) != 0x80) return 0; } if (code_length == 2) { /* 2 bytes sequence: U+0080..U+07FF */ ch = ((str[0] & 0x1f) << 6) + (str[1] & 0x3f); /* str[0] >= 0xC2, so ch >= 0x0080. str[0] <= 0xDF, (str[1] & 0x3f) <= 0x3f, so ch <= 0x07ff */ } else if (code_length == 3) { /* 3 bytes sequence: U+0800..U+FFFF */ ch = ((str[0] & 0x0f) << 12) + ((str[1] & 0x3f) << 6) + (str[2] & 0x3f); /* (0xff & 0x0f) << 12 | (0xff & 0x3f) << 6 | (0xff & 0x3f) = 0xffff, so ch <= 0xffff */ if (ch < 0x0800) return 0; /* surrogates (U+D800-U+DFFF) are invalid in UTF-8: test if (0xD800 <= ch && ch <= 0xDFFF) */ if ((ch >> 11) == 0x1b) return 0; } else if (code_length == 4) { /* 4 bytes sequence: U+10000..U+10FFFF */ ch = ((str[0] & 0x07) << 18) + ((str[1] & 0x3f) << 12) + ((str[2] & 0x3f) << 6) + (str[3] & 0x3f); if ((ch < 0x10000) || (0x10FFFF < ch)) return 0; } str += code_length; } return 1; } uint32_t Utf32Get(int bigEndian, const char *src) { uint32_t codePoint; if (bigEndian) { codePoint = ((uint8_t)src[0] << 24) | ((uint8_t)src[1] << 16) | ((uint8_t)src[2] << 8) | (uint8_t)src[3]; } else { codePoint = ((uint8_t)src[3] << 24) | ((uint8_t)src[2] << 16) | ((uint8_t)src[1] << 8) | (uint8_t)src[0]; } return codePoint; } int Utf8Put(int32_t codePoint, char *dst, size_t *numUnits) { if (numUnits != NULL) *numUnits = 0; if ((codePoint < 0) || (0x10FFFF < codePoint)) return -1; if (dst == NULL) return -1; // One-octet code point (Unicode range U+000000 to U+00007F)? if (codePoint < 0x80) { dst[0] = codePoint & 0x7F; if (numUnits != NULL) *numUnits = 1; } // Two-octet code point (Unicode range U+000080 to U+0007FF)? else if (codePoint < 0x800) { dst[0] = 0xC0 | ((codePoint & 0x07C0) >> 6); dst[1] = 0x80 | (codePoint & 0x003F); if (numUnits != NULL) *numUnits = 2; } // Three-octet code point (Unicode range U+000800 to U+00FFFF)? else if (codePoint < 0x10000) { dst[0] = 0xE0 | ((codePoint & 0x0F000) >> 12); dst[1] = 0x80 | ((codePoint & 0x00FC0) >> 6); dst[2] = 0x80 | (codePoint & 0x0003F); if (numUnits != NULL) *numUnits = 3; } // Four-octet code point (Unicode range U+010000 to U+10FFFF)? else { dst[0] = 0xF0 | ((codePoint & 0x1C0000) >> 18); dst[1] = 0x80 | ((codePoint & 0x03F000) >> 12); dst[2] = 0x80 | ((codePoint & 0x000FC0) >> 6); dst[3] = 0x80 | (codePoint & 0x00003F); if (numUnits != NULL) *numUnits = 4; } return 0; } int Utf32Bom(const char *src) { if (((uint8_t)src[0] == 0) && ((uint8_t)src[1] == 0) && ((uint8_t)src[2] == 0xFE) && ((uint8_t)src[3] == 0xFF)) { return 1; // BOM, big-endian } else if (((uint8_t)src[0] == 0xFF) && ((uint8_t)src[1] == 0xFE) && ((uint8_t)src[2] == 0) && ((uint8_t)src[3] == 0)) { return -1; // BOM, little-endian } return 0; // No BOM } size_t Utf32Utf8(const char *src, int srclen, int bom, char *dst, int dstlen) { int bigEndian = 0; int32_t codePoint; size_t length, numUnits; length = 0; // Determine the byte order of the UTF-32 source string. if(bom == UTF32BENB){ // UTF-32BE NoBOM bigEndian = 1; } if(bom == UTF32LENB){ // UTF-32LE NoBOM bigEndian = 0; } if(bom != UTF32LENB && bom != UTF32BENB){ // if the bom has not been supplied bigEndian = (Utf32Bom(src) >= 0); if (bigEndian) { src += sizeof(uint32_t); srclen -= sizeof(uint32_t); } } // Encode the UTF-32 code points as UTF-8. while (srclen > 0) { codePoint = Utf32Get(bigEndian, src); if (codePoint < 0) { return -1; } src += sizeof(uint32_t); srclen -= sizeof(uint32_t); if ((dstlen < 4) && ((codePoint > 0x0FFFF) || ((dstlen < 3) && ((codePoint > 0x00800) || ((dstlen < 2) && ((codePoint > 0x0007F) || (dstlen < 1))))))) { return -1; } if (Utf8Put(codePoint, dst, &numUnits)) { return -1; } dst += numUnits; dstlen -= numUnits; length += numUnits; } if (dstlen >= 1) *dst = '\0'; return length; } int IsAnsi(const char *src, int len) { int ret = 0; char c; while (--len) { c = *src; // in ANSI range if (c > 0x19 && c < 0x7F || c == 0x0A || c == 0x0D || c == 0x9){ ret = 1; src++; } else{ ret = 0; break; // not in range of ANSI chars } } return ret; } int IsAscii(const char *data, size_t size) { const uint8_t *str = (const uint8_t *)data; const uint8_t *end = str + size; for (; str != end; str++) { if (*str >= 0x80) return 0; } return 1; } void CheckUtf16LE(const wchar_t * wstr, int * cntLE, int * cntHigh) { while(*wstr){ if(*wstr == 0x000D) (*cntLE)++; if((char)*wstr > (char)((*wstr)>>8)) (*cntHigh)++; wstr++; } } void CheckUtf32LE(const uint32_t * wstr, int * cntLE, int * cntHigh) { while(*wstr){ if(*wstr == 0x0000000D) (*cntLE)++; if((wchar_t)*wstr > (wchar_t)((*wstr)>>16)) (*cntHigh)++; wstr++; } } void CheckUtf16BE(const wchar_t * wstr, int * cntBE, int * cntHigh) { while(*wstr){ if(*wstr == 0x0D00) (*cntBE)++; if((char)((*wstr)>>8) > (char)*wstr) (*cntHigh)++; wstr++; } } void CheckUtf32BE(const uint32_t * wstr, int * cntBE, int * cntHigh) { while(*wstr){ if(*wstr == 0x0D000000) (*cntBE)++; if((wchar_t)((*wstr)>>16) > (wchar_t)*wstr) (*cntHigh)++; wstr++; } } int AdjustUnix(char * src, int len) { int cnt = 0; char tmp[len*2]; char * ptmp = tmp, * psrc = src; // copy src to tmp while inserting CR before LF while(*psrc) { if(*psrc == 0x0A){ *ptmp++ = 0x0D; cnt++; } *ptmp++ = *psrc++; } // copy back to src memcpy(src, tmp, len + cnt); return len + cnt; } int CheckUnix(const char * str) { int i = 0; char c; while(*str) { c = *str++; if(*str == 0x0A) { i++; if(c == 0x0D) i--; } } return i; } int IsvalidUtf8(const char *str, int len, int *oneseq, int *twoseq, int *threeseq, int *fourseq, int *err) { uint8_t c, c1, c2; while(*str) { c = *str++; //********************************************************* // check for one byte ascii //********************************************************* // 1 7F if(c <= 0x7F){ (*oneseq)++; continue; } //********************************************************* // check for two byte sequences //********************************************************* // 2 C2..DF 80..BF if(c >= 0xC2 && c <= 0xDF){ c = *str++; if((c >= 0x80 && c <= 0xBF)){ (*twoseq)++; continue; } } //********************************************************* // check for three byte sequences //********************************************************* // 3 E0 A0..BF 80..BF if(c == 0xE0){ c = *str++; c1 = *str++; if(c >= 0xA0 && c <= 0xBF && c1 >= 0x80 && c1 <= 0xBF){ (*threeseq)++; continue; } } // 3 E1..EC 80..BF 80..BF if(c >= 0xE1 && c <= 0xEC){ c = *str++; c1 = *str++; if(c >= 0x80 && c <= 0xBF && c1 >= 0x80 && c1 <= 0xBF){ (*threeseq)++; continue; } } // 3 ED 80..9F 80..BF if(c == 0xED){ c = *str++; c1 = *str++; if(c >= 0x80 && c <= 0x9F && c1 >= 0x80 && c1 <= 0xBF){ (*threeseq)++; continue; } } // 3 EE..EF 80..BF 80..BF if(c >= 0xEE && c <= 0xEF) { c = *str++; c1 = *str++; if(c >= 0x80 && c <= 0xBF && c1 >= 0x80 || c1 <= 0xBF){ (*threeseq)++; continue; } } //********************************************************* // check for four byte sequences //********************************************************* // 4 F0 90..BF 80..BF 80..BF if(c == 0xF0){ c = *str++; c1 = *str++; c2 = *str++; if(c >= 0x90 && c <= 0xBF && c1 >= 0x80 && c1 <= 0xBF && c2 >= 0x80 && c2 <= 0xBF){ (*fourseq)++; continue; } } // 4 F1..F3 80..BF 80..BF 80..BF if(c >= 0xF1 && c <= 0xF3){ c = *str++; c1 = *str++; c2 = *str++; if(c >= 0x80 && c <= 0xBF && c1 >= 0x80 && c1 <= 0xBF && c2 >= 0x80 && c2 <= 0xBF){ (*fourseq)++; continue; } } // 4 F4 80..8F 80..BF 80..BF if(c >= 0xF4){ c = *str++; c1 = *str++; c2 = *str++; if(c >= 0x80 && c <= 0x8F && c1 >= 0x80 && c1 <= 0xBF && c2 >= 0x80 && c2 <= 0xBF){ (*fourseq)++; continue; } } } return 0; } int CheckBom(const wchar_t *str) { uint32_t uc = *(uint32_t*)str; if(uc == 0x0000FEFF) return UTF32LEB; // Unicode-32LE BOM if(uc == 0xFFFE0000) return UTF32BEB; // Unicode-32LE BOM if (str[0] == 0xFEFF) return UTF16LE; // Unicode-16LE BOM else if (str[0] == 0xFFFE) return UTF16BE; // Unicode-16BE BOM else if (str[0] == 0xBBEF) { if ((uint8_t)str[1] == 0xBF) return UTF8B; // Unicode-UTF8 BOM else return NOBOM; // Unknown - return NOBOM } return NOBOM; } int CheckUtf8(FILE * f) { //************************************************************** // Special case in that a UTF8 file without BOM could be normal // ascii for 20k then have a few lines of unicode so need to // check the whole file. //************************************************************** long len; int err = 0, oneseq = 0, twoseq = 0, threeseq = 0, fourseq = 0; rewind(f); len = _filelength(_fileno(f)); len = (len > 1000000) ? 1000000 : len; // one meg is enough though. char * buffer = malloc(len+10); fread(buffer, len, 1, f); // file is closed in DisplayFileHead() //************************************************************** // Add a few zero's so that the while loop in isvalid_utf8() // can if necessary go pass the end of the text without crashing, // before exiting the loop. The buffer is 10 bytes larger. //************************************************************** memset(&buffer[len], 0, 9); IsvalidUtf8(buffer, len, &oneseq, &twoseq, &threeseq, &fourseq, &err); free(buffer); // guessing here if(len > 1000){ if(twoseq >= 8 || threeseq >= 8 || fourseq >= 8) return 1; }else if(len > 100){ if(twoseq >= 4 || threeseq >= 4 || fourseq >= 4) return 1; }else if(len > 10){ if(twoseq >= 2 || threeseq >= 2 || fourseq >= 2) return 1; }else if(len > 2){ if(twoseq >= 1 || threeseq >= 1 || fourseq >= 1) return 1; } return 0; }
Проблемы с кодировкой и как можно их решить
Дата публикации 7 сен 2024
| Редактировалось 9 сен 2024