1 /**
2 * unistr.c - Unicode string handling. Originated from the Linux-NTFS project.
3 *
4 * Copyright (c) 2000-2004 Anton Altaparmakov
5 * Copyright (c) 2002-2009 Szabolcs Szakacsits
6 * Copyright (c) 2008-2015 Jean-Pierre Andre
7 * Copyright (c) 2008 Bernhard Kaindl
8 *
9 * This program/include file is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License as published
11 * by the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program/include file is distributed in the hope that it will be
15 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
16 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program (in the main directory of the NTFS-3G
21 * distribution in the file COPYING); if not, write to the Free Software
22 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 */
24
25 #ifdef HAVE_CONFIG_H
26 #include "config.h"
27 #endif
28
29 #ifdef HAVE_STDIO_H
30 #include <stdio.h>
31 #endif
32 #ifdef HAVE_STDLIB_H
33 #include <stdlib.h>
34 #endif
35 #ifdef HAVE_WCHAR_H
36 #include <wchar.h>
37 #endif
38 #ifdef HAVE_STRING_H
39 #include <string.h>
40 #endif
41 #ifdef HAVE_ERRNO_H
42 #include <errno.h>
43 #endif
44 #ifdef HAVE_LOCALE_H
45 #include <locale.h>
46 #endif
47
48 #if defined(__APPLE__) || defined(__DARWIN__)
49 #ifdef ENABLE_NFCONV
50 #include <CoreFoundation/CoreFoundation.h>
51 #endif /* ENABLE_NFCONV */
52 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
53
54 #include "compat.h"
55 #include "attrib.h"
56 #include "types.h"
57 #include "unistr.h"
58 #include "debug.h"
59 #include "logging.h"
60 #include "misc.h"
61
#if !defined(ALLOW_BROKEN_UNICODE)
/*
 * Default policy when the build does not define ALLOW_BROKEN_UNICODE itself:
 * tolerate broken UTF-16 surrogate pairs as well as U+FFFE and U+FFFF.
 * This default was introduced by Erik and is open to debate.
 */
#define ALLOW_BROKEN_UNICODE 1
#endif /* !defined(ALLOW_BROKEN_UNICODE) */

/*
 * IMPORTANT
 * =========
 *
 * Every routine in this file assumes that the Unicode characters stored in
 * the strings it handles are in little endian encoding!
 */

/* Non-zero (the default): file names are encoded as UTF-8. */
static int use_utf8 = 1;

#if (defined(__APPLE__) || defined(__DARWIN__)) && defined(ENABLE_NFCONV)
/**
 * Non-zero when automatic normalization form conversion is performed while
 * translating NTFS unicode file names to UTF-8. It is enabled by default and
 * can be controlled from the outside using the function
 *	int ntfs_macosx_normalize_filenames(int normalize);
 */
static int nfconvert_utf8 = 1;
#endif /* (__APPLE__ || __DARWIN__) && ENABLE_NFCONV */
89
/*
 * Lookup table intended for the name collation functions, to quickly
 * determine which characters are (in)valid. It is currently compiled out
 * (see the surrounding "#if 0").
 */
94 #if 0
95 static const u8 legal_ansi_char_array[0x40] = {
96 0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
97 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
98
99 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
100 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
101
102 0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17,
103 0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00,
104
105 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17,
106 0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18,
107 };
108 #endif
109
110 /**
111 * ntfs_names_are_equal - compare two Unicode names for equality
112 * @s1: name to compare to @s2
113 * @s1_len: length in Unicode characters of @s1
114 * @s2: name to compare to @s1
115 * @s2_len: length in Unicode characters of @s2
116 * @ic: ignore case bool
117 * @upcase: upcase table (only if @ic == IGNORE_CASE)
118 * @upcase_size: length in Unicode characters of @upcase (if present)
119 *
120 * Compare the names @s1 and @s2 and return TRUE (1) if the names are
121 * identical, or FALSE (0) if they are not identical. If @ic is IGNORE_CASE,
122 * the @upcase table is used to perform a case insensitive comparison.
123 */
ntfs_names_are_equal(const ntfschar * s1,size_t s1_len,const ntfschar * s2,size_t s2_len,const IGNORE_CASE_BOOL ic,const ntfschar * upcase,const u32 upcase_size)124 BOOL ntfs_names_are_equal(const ntfschar *s1, size_t s1_len,
125 const ntfschar *s2, size_t s2_len,
126 const IGNORE_CASE_BOOL ic,
127 const ntfschar *upcase, const u32 upcase_size)
128 {
129 if (s1_len != s2_len)
130 return FALSE;
131 if (!s1_len)
132 return TRUE;
133 if (ic == CASE_SENSITIVE)
134 return ntfs_ucsncmp(s1, s2, s1_len) ? FALSE: TRUE;
135 return ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size) ? FALSE:
136 TRUE;
137 }
138
139 /*
140 * ntfs_names_full_collate() fully collate two Unicode names
141 *
142 * @name1: first Unicode name to compare
143 * @name1_len: length of first Unicode name to compare
144 * @name2: second Unicode name to compare
145 * @name2_len: length of second Unicode name to compare
146 * @ic: either CASE_SENSITIVE or IGNORE_CASE (see below)
147 * @upcase: upcase table
148 * @upcase_len: upcase table size
149 *
150 * If @ic is CASE_SENSITIVE, then the names are compared primarily ignoring
151 * case, but if the names are equal ignoring case, then they are compared
152 * case-sensitively. As an example, "abc" would collate before "BCD" (since
153 * "abc" and "BCD" differ ignoring case and 'A' < 'B') but after "ABC" (since
154 * "ABC" and "abc" are equal ignoring case and 'A' < 'a'). This matches the
155 * collation order of filenames as indexed in NTFS directories.
156 *
157 * If @ic is IGNORE_CASE, then the names are only compared case-insensitively
158 * and are considered to match if and only if they are equal ignoring case.
159 *
160 * Returns:
161 * -1 if the first name collates before the second one,
162 * 0 if the names match, or
163 * 1 if the second name collates before the first one
164 */
ntfs_names_full_collate(const ntfschar * name1,const u32 name1_len,const ntfschar * name2,const u32 name2_len,const IGNORE_CASE_BOOL ic,const ntfschar * upcase,const u32 upcase_len)165 int ntfs_names_full_collate(const ntfschar *name1, const u32 name1_len,
166 const ntfschar *name2, const u32 name2_len,
167 const IGNORE_CASE_BOOL ic, const ntfschar *upcase,
168 const u32 upcase_len)
169 {
170 u32 cnt;
171 u16 c1, c2;
172 u16 u1, u2;
173
174 #ifdef DEBUG
175 if (!name1 || !name2 || !upcase || !upcase_len) {
176 ntfs_log_debug("ntfs_names_collate received NULL pointer!\n");
177 exit(1);
178 }
179 #endif
180 cnt = min(name1_len, name2_len);
181 if (cnt > 0) {
182 if (ic == CASE_SENSITIVE) {
183 while (--cnt && (*name1 == *name2)) {
184 name1++;
185 name2++;
186 }
187 u1 = c1 = le16_to_cpu(*name1);
188 u2 = c2 = le16_to_cpu(*name2);
189 if (u1 < upcase_len)
190 u1 = le16_to_cpu(upcase[u1]);
191 if (u2 < upcase_len)
192 u2 = le16_to_cpu(upcase[u2]);
193 if ((u1 == u2) && cnt)
194 do {
195 name1++;
196 u1 = le16_to_cpu(*name1);
197 name2++;
198 u2 = le16_to_cpu(*name2);
199 if (u1 < upcase_len)
200 u1 = le16_to_cpu(upcase[u1]);
201 if (u2 < upcase_len)
202 u2 = le16_to_cpu(upcase[u2]);
203 } while ((u1 == u2) && --cnt);
204 if (u1 < u2)
205 return -1;
206 if (u1 > u2)
207 return 1;
208 if (name1_len < name2_len)
209 return -1;
210 if (name1_len > name2_len)
211 return 1;
212 if (c1 < c2)
213 return -1;
214 if (c1 > c2)
215 return 1;
216 } else {
217 do {
218 u1 = le16_to_cpu(*name1);
219 name1++;
220 u2 = le16_to_cpu(*name2);
221 name2++;
222 if (u1 < upcase_len)
223 u1 = le16_to_cpu(upcase[u1]);
224 if (u2 < upcase_len)
225 u2 = le16_to_cpu(upcase[u2]);
226 } while ((u1 == u2) && --cnt);
227 if (u1 < u2)
228 return -1;
229 if (u1 > u2)
230 return 1;
231 if (name1_len < name2_len)
232 return -1;
233 if (name1_len > name2_len)
234 return 1;
235 }
236 } else {
237 if (name1_len < name2_len)
238 return -1;
239 if (name1_len > name2_len)
240 return 1;
241 }
242 return 0;
243 }
244
245 /**
246 * ntfs_ucsncmp - compare two little endian Unicode strings
247 * @s1: first string
248 * @s2: second string
249 * @n: maximum unicode characters to compare
250 *
251 * Compare the first @n characters of the Unicode strings @s1 and @s2,
252 * The strings in little endian format and appropriate le16_to_cpu()
253 * conversion is performed on non-little endian machines.
254 *
255 * The function returns an integer less than, equal to, or greater than zero
256 * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
257 * to be less than, to match, or be greater than @s2.
258 */
ntfs_ucsncmp(const ntfschar * s1,const ntfschar * s2,size_t n)259 int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n)
260 {
261 u16 c1, c2;
262 size_t i;
263
264 #ifdef DEBUG
265 if (!s1 || !s2) {
266 ntfs_log_debug("ntfs_wcsncmp() received NULL pointer!\n");
267 exit(1);
268 }
269 #endif
270 for (i = 0; i < n; ++i) {
271 c1 = le16_to_cpu(s1[i]);
272 c2 = le16_to_cpu(s2[i]);
273 if (c1 < c2)
274 return -1;
275 if (c1 > c2)
276 return 1;
277 if (!c1)
278 break;
279 }
280 return 0;
281 }
282
283 /**
284 * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case
285 * @s1: first string
286 * @s2: second string
287 * @n: maximum unicode characters to compare
288 * @upcase: upcase table
289 * @upcase_size: upcase table size in Unicode characters
290 *
291 * Compare the first @n characters of the Unicode strings @s1 and @s2,
292 * ignoring case. The strings in little endian format and appropriate
293 * le16_to_cpu() conversion is performed on non-little endian machines.
294 *
295 * Each character is uppercased using the @upcase table before the comparison.
296 *
297 * The function returns an integer less than, equal to, or greater than zero
298 * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
299 * to be less than, to match, or be greater than @s2.
300 */
ntfs_ucsncasecmp(const ntfschar * s1,const ntfschar * s2,size_t n,const ntfschar * upcase,const u32 upcase_size)301 int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n,
302 const ntfschar *upcase, const u32 upcase_size)
303 {
304 u16 c1, c2;
305 size_t i;
306
307 #ifdef DEBUG
308 if (!s1 || !s2 || !upcase) {
309 ntfs_log_debug("ntfs_wcsncasecmp() received NULL pointer!\n");
310 exit(1);
311 }
312 #endif
313 for (i = 0; i < n; ++i) {
314 if ((c1 = le16_to_cpu(s1[i])) < upcase_size)
315 c1 = le16_to_cpu(upcase[c1]);
316 if ((c2 = le16_to_cpu(s2[i])) < upcase_size)
317 c2 = le16_to_cpu(upcase[c2]);
318 if (c1 < c2)
319 return -1;
320 if (c1 > c2)
321 return 1;
322 if (!c1)
323 break;
324 }
325 return 0;
326 }
327
328 /**
329 * ntfs_ucsnlen - determine the length of a little endian Unicode string
330 * @s: pointer to Unicode string
331 * @maxlen: maximum length of string @s
332 *
333 * Return the number of Unicode characters in the little endian Unicode
334 * string @s up to a maximum of maxlen Unicode characters, not including
335 * the terminating (ntfschar)'\0'. If there is no (ntfschar)'\0' between @s
336 * and @s + @maxlen, @maxlen is returned.
337 *
338 * This function never looks beyond @s + @maxlen.
339 */
ntfs_ucsnlen(const ntfschar * s,u32 maxlen)340 u32 ntfs_ucsnlen(const ntfschar *s, u32 maxlen)
341 {
342 u32 i;
343
344 for (i = 0; i < maxlen; i++) {
345 if (!le16_to_cpu(s[i]))
346 break;
347 }
348 return i;
349 }
350
351 /**
352 * ntfs_ucsndup - duplicate little endian Unicode string
353 * @s: pointer to Unicode string
354 * @maxlen: maximum length of string @s
355 *
356 * Return a pointer to a new little endian Unicode string which is a duplicate
357 * of the string s. Memory for the new string is obtained with ntfs_malloc(3),
358 * and can be freed with free(3).
359 *
360 * A maximum of @maxlen Unicode characters are copied and a terminating
361 * (ntfschar)'\0' little endian Unicode character is added.
362 *
363 * This function never looks beyond @s + @maxlen.
364 *
365 * Return a pointer to the new little endian Unicode string on success and NULL
366 * on failure with errno set to the error code.
367 */
ntfs_ucsndup(const ntfschar * s,u32 maxlen)368 ntfschar *ntfs_ucsndup(const ntfschar *s, u32 maxlen)
369 {
370 ntfschar *dst;
371 u32 len;
372
373 len = ntfs_ucsnlen(s, maxlen);
374 dst = ntfs_malloc((len + 1) * sizeof(ntfschar));
375 if (dst) {
376 memcpy(dst, s, len * sizeof(ntfschar));
377 dst[len] = const_cpu_to_le16(L'\0');
378 }
379 return dst;
380 }
381
382 /**
383 * ntfs_name_upcase - Map an Unicode name to its uppercase equivalent
384 * @name:
385 * @name_len:
386 * @upcase:
387 * @upcase_len:
388 *
389 * Description...
390 *
391 * Returns:
392 */
ntfs_name_upcase(ntfschar * name,u32 name_len,const ntfschar * upcase,const u32 upcase_len)393 void ntfs_name_upcase(ntfschar *name, u32 name_len, const ntfschar *upcase,
394 const u32 upcase_len)
395 {
396 u32 i;
397 u16 u;
398
399 for (i = 0; i < name_len; i++)
400 if ((u = le16_to_cpu(name[i])) < upcase_len)
401 name[i] = upcase[u];
402 }
403
404 /**
405 * ntfs_name_locase - Map a Unicode name to its lowercase equivalent
406 */
ntfs_name_locase(ntfschar * name,u32 name_len,const ntfschar * locase,const u32 locase_len)407 void ntfs_name_locase(ntfschar *name, u32 name_len, const ntfschar *locase,
408 const u32 locase_len)
409 {
410 u32 i;
411 u16 u;
412
413 if (locase)
414 for (i = 0; i < name_len; i++)
415 if ((u = le16_to_cpu(name[i])) < locase_len)
416 name[i] = locase[u];
417 }
418
419 /**
420 * ntfs_file_value_upcase - Convert a filename to upper case
421 * @file_name_attr:
422 * @upcase:
423 * @upcase_len:
424 *
425 * Description...
426 *
427 * Returns:
428 */
ntfs_file_value_upcase(FILE_NAME_ATTR * file_name_attr,const ntfschar * upcase,const u32 upcase_len)429 void ntfs_file_value_upcase(FILE_NAME_ATTR *file_name_attr,
430 const ntfschar *upcase, const u32 upcase_len)
431 {
432 ntfs_name_upcase((ntfschar*)&file_name_attr->file_name,
433 file_name_attr->file_name_length, upcase, upcase_len);
434 }
435
/*
   NTFS uses Unicode (UTF-16LE [NTFS-3G uses UCS-2LE, which is enough
   for now]) for path names, but the Unicode code points need to be
   converted before a path can be accessed under NTFS. For 7 bit ASCII/ANSI,
   glibc does this even without a locale in a hard-coded fashion, as that
   appears to be easy because the low 7-bit ASCII range appears to be
   available in all charsets, but it does not convert anything if
   there was some error with the locale setup or if none was set up, like
   when mount is called during early boot where it (by policy) does
   not use locales (and they may not be available if /usr is not yet mounted),
   so this patch fixes the resulting issues for systems which use
   UTF-8 and for others, specifying the locale in fstab brings them
   the encoding which they want.

   If no locale is defined or there was a problem with setting one
   up and whenever nl_langinfo(CODESET) returns a string starting with
   "ANSI", use an internal UCS-2LE <-> UTF-8 codeset converter to fix
   the bug where NTFS-3G does not show any path names which include
   international characters (and also fails on creating them) as a result.

   Author: Bernhard Kaindl <bk@suse.de>
   Jean-Pierre Andre made it compliant with RFC3629/RFC2781.
*/
459
460 /*
461 * Return the number of bytes in UTF-8 needed (without the terminating null) to
462 * store the given UTF-16LE string.
463 *
464 * On error, -1 is returned, and errno is set to the error code. The following
465 * error codes can be expected:
466 * EILSEQ The input string is not valid UTF-16LE (only possible
467 * if compiled without ALLOW_BROKEN_UNICODE).
468 * ENAMETOOLONG The length of the UTF-8 string in bytes (without the
469 * terminating null) would exceed @outs_len.
470 */
utf16_to_utf8_size(const ntfschar * ins,const int ins_len,int outs_len)471 static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_len)
472 {
473 int i, ret = -1;
474 int count = 0;
475 BOOL surrog;
476
477 surrog = FALSE;
478 for (i = 0; i < ins_len && ins[i] && count <= outs_len; i++) {
479 unsigned short c = le16_to_cpu(ins[i]);
480 if (surrog) {
481 if ((c >= 0xdc00) && (c < 0xe000)) {
482 surrog = FALSE;
483 count += 4;
484 } else {
485 #if ALLOW_BROKEN_UNICODE
486 /* The first UTF-16 unit of a surrogate pair has
487 * a value between 0xd800 and 0xdc00. It can be
488 * encoded as an individual UTF-8 sequence if we
489 * cannot combine it with the next UTF-16 unit
490 * unit as a surrogate pair. */
491 surrog = FALSE;
492 count += 3;
493
494 --i;
495 continue;
496 #else
497 goto fail;
498 #endif /* ALLOW_BROKEN_UNICODE */
499 }
500 } else
501 if (c < 0x80)
502 count++;
503 else if (c < 0x800)
504 count += 2;
505 else if (c < 0xd800)
506 count += 3;
507 else if (c < 0xdc00)
508 surrog = TRUE;
509 #if ALLOW_BROKEN_UNICODE
510 else if (c < 0xe000)
511 count += 3;
512 else if (c >= 0xe000)
513 #else
514 else if ((c >= 0xe000) && (c < 0xfffe))
515 #endif /* ALLOW_BROKEN_UNICODE */
516 count += 3;
517 else
518 goto fail;
519 }
520
521 if (surrog && count <= outs_len) {
522 #if ALLOW_BROKEN_UNICODE
523 count += 3; /* ending with a single surrogate */
524 #else
525 goto fail;
526 #endif /* ALLOW_BROKEN_UNICODE */
527 }
528
529 if (count > outs_len) {
530 errno = ENAMETOOLONG;
531 goto out;
532 }
533
534 ret = count;
535 out:
536 return ret;
537 fail:
538 errno = EILSEQ;
539 goto out;
540 }
541
542 /*
543 * ntfs_utf16_to_utf8 - convert a little endian UTF16LE string to an UTF-8 string
544 * @ins: input utf16 string buffer
545 * @ins_len: length of input string in utf16 characters
546 * @outs: on return contains the (allocated) output multibyte string
547 * @outs_len: length of output buffer in bytes (ignored if *@outs is NULL)
548 *
549 * Return -1 with errno set if string has invalid byte sequence or too long.
550 */
ntfs_utf16_to_utf8(const ntfschar * ins,const int ins_len,char ** outs,int outs_len)551 static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len,
552 char **outs, int outs_len)
553 {
554 #if defined(__APPLE__) || defined(__DARWIN__)
555 #ifdef ENABLE_NFCONV
556 char *original_outs_value = *outs;
557 int original_outs_len = outs_len;
558 #endif /* ENABLE_NFCONV */
559 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
560
561 char *t;
562 int i, size, ret = -1;
563 int halfpair;
564
565 halfpair = 0;
566 if (!*outs) {
567 /* If no output buffer was provided, we will allocate one and
568 * limit its length to PATH_MAX. Note: we follow the standard
569 * convention of PATH_MAX including the terminating null. */
570 outs_len = PATH_MAX;
571 }
572
573 /* The size *with* the terminating null is limited to @outs_len,
574 * so the size *without* the terminating null is limited to one less. */
575 size = utf16_to_utf8_size(ins, ins_len, outs_len - 1);
576
577 if (size < 0)
578 goto out;
579
580 if (!*outs) {
581 outs_len = size + 1;
582 *outs = ntfs_malloc(outs_len);
583 if (!*outs)
584 goto out;
585 }
586
587 t = *outs;
588
589 for (i = 0; i < ins_len && ins[i]; i++) {
590 unsigned short c = le16_to_cpu(ins[i]);
591 /* size not double-checked */
592 if (halfpair) {
593 if ((c >= 0xdc00) && (c < 0xe000)) {
594 *t++ = 0xf0 + (((halfpair + 64) >> 8) & 7);
595 *t++ = 0x80 + (((halfpair + 64) >> 2) & 63);
596 *t++ = 0x80 + ((c >> 6) & 15) + ((halfpair & 3) << 4);
597 *t++ = 0x80 + (c & 63);
598 halfpair = 0;
599 } else {
600 #if ALLOW_BROKEN_UNICODE
601 /* The first UTF-16 unit of a surrogate pair has
602 * a value between 0xd800 and 0xdc00. It can be
603 * encoded as an individual UTF-8 sequence if we
604 * cannot combine it with the next UTF-16 unit
605 * unit as a surrogate pair. */
606 *t++ = 0xe0 | (halfpair >> 12);
607 *t++ = 0x80 | ((halfpair >> 6) & 0x3f);
608 *t++ = 0x80 | (halfpair & 0x3f);
609 halfpair = 0;
610
611 --i;
612 continue;
613 #else
614 goto fail;
615 #endif /* ALLOW_BROKEN_UNICODE */
616 }
617 } else if (c < 0x80) {
618 *t++ = c;
619 } else {
620 if (c < 0x800) {
621 *t++ = (0xc0 | ((c >> 6) & 0x3f));
622 *t++ = 0x80 | (c & 0x3f);
623 } else if (c < 0xd800) {
624 *t++ = 0xe0 | (c >> 12);
625 *t++ = 0x80 | ((c >> 6) & 0x3f);
626 *t++ = 0x80 | (c & 0x3f);
627 } else if (c < 0xdc00)
628 halfpair = c;
629 #if ALLOW_BROKEN_UNICODE
630 else if (c < 0xe000) {
631 *t++ = 0xe0 | (c >> 12);
632 *t++ = 0x80 | ((c >> 6) & 0x3f);
633 *t++ = 0x80 | (c & 0x3f);
634 }
635 #endif /* ALLOW_BROKEN_UNICODE */
636 else if (c >= 0xe000) {
637 *t++ = 0xe0 | (c >> 12);
638 *t++ = 0x80 | ((c >> 6) & 0x3f);
639 *t++ = 0x80 | (c & 0x3f);
640 } else
641 goto fail;
642 }
643 }
644 #if ALLOW_BROKEN_UNICODE
645 if (halfpair) { /* ending with a single surrogate */
646 *t++ = 0xe0 | (halfpair >> 12);
647 *t++ = 0x80 | ((halfpair >> 6) & 0x3f);
648 *t++ = 0x80 | (halfpair & 0x3f);
649 }
650 #endif /* ALLOW_BROKEN_UNICODE */
651 *t = '\0';
652
653 #if defined(__APPLE__) || defined(__DARWIN__)
654 #ifdef ENABLE_NFCONV
655 if(nfconvert_utf8 && (t - *outs) > 0) {
656 char *new_outs = NULL;
657 int new_outs_len = ntfs_macosx_normalize_utf8(*outs, &new_outs, 0); // Normalize to decomposed form
658 if(new_outs_len >= 0 && new_outs != NULL) {
659 if(original_outs_value != *outs) {
660 // We have allocated outs ourselves.
661 free(*outs);
662 *outs = new_outs;
663 t = *outs + new_outs_len;
664 }
665 else {
666 // We need to copy new_outs into the fixed outs buffer.
667 memset(*outs, 0, original_outs_len);
668 strncpy(*outs, new_outs, original_outs_len-1);
669 t = *outs + original_outs_len;
670 free(new_outs);
671 }
672 }
673 else {
674 ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFD: %s\n", *outs);
675 ntfs_log_error(" new_outs=0x%p\n", new_outs);
676 ntfs_log_error(" new_outs_len=%d\n", new_outs_len);
677 }
678 }
679 #endif /* ENABLE_NFCONV */
680 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
681
682 ret = t - *outs;
683 out:
684 return ret;
685 fail:
686 errno = EILSEQ;
687 goto out;
688 }
689
/*
 * Return the amount of 16-bit elements in UTF-16LE needed
 * (without the terminating null) to store given UTF-8 string.
 *
 * Every byte below 0xC0 counts for one element. A lead byte from 0xC0 to
 * 0xF4 counts for one element, or two when it starts a four-byte sequence,
 * and the bytes following it are skipped without being examined, the scan
 * ending early if the string terminates within the sequence.
 *
 * Return -1 with errno set to ENAMETOOLONG if the count reaches PATH_MAX,
 * or to EILSEQ if a byte greater than 0xF4 is found.
 *
 * Note: This does not check whether the input sequence is a valid utf8 string,
 *	 and should be used only in context where such check is made!
 */
static int utf8_to_utf16_size(const char *s)
{
	const unsigned char *p = (const unsigned char *)s;
	size_t units = 0;
	unsigned int lead;

	for (;;) {
		lead = *p++;
		if (!lead)
			break;
		if (++units >= PATH_MAX) {
			errno = ENAMETOOLONG;
			return -1;
		}
		if (lead < 0xc0)
			continue;	/* single byte, nothing to skip */
		if (lead >= 0xF5) {
			errno = EILSEQ;
			return -1;
		}
		/* Skip the first trailing byte, present in every sequence. */
		if (!*p)
			break;
		p++;
		if (!*p)
			break;
		/* Second trailing byte of three and four-byte sequences. */
		if (lead >= 0xE0)
			p++;
		if (!*p)
			break;
		/* A four-byte sequence needs a surrogate pair. */
		if (lead >= 0xF0) {
			p++;
			if (++units >= PATH_MAX) {
				errno = ENAMETOOLONG;
				return -1;
			}
		}
	}
	return units;
}
737 /*
738 * This converts one UTF-8 sequence to cpu-endian Unicode value
739 * within range U+0 .. U+10ffff and excluding U+D800 .. U+DFFF
740 *
741 * Return the number of used utf8 bytes or -1 with errno set
742 * if sequence is invalid.
743 */
utf8_to_unicode(u32 * wc,const char * s)744 static int utf8_to_unicode(u32 *wc, const char *s)
745 {
746 unsigned int byte = *((const unsigned char *)s);
747
748 /* single byte */
749 if (byte == 0) {
750 *wc = (u32) 0;
751 return 0;
752 } else if (byte < 0x80) {
753 *wc = (u32) byte;
754 return 1;
755 /* double byte */
756 } else if (byte < 0xc2) {
757 goto fail;
758 } else if (byte < 0xE0) {
759 if ((s[1] & 0xC0) == 0x80) {
760 *wc = ((u32)(byte & 0x1F) << 6)
761 | ((u32)(s[1] & 0x3F));
762 return 2;
763 } else
764 goto fail;
765 /* three-byte */
766 } else if (byte < 0xF0) {
767 if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)) {
768 *wc = ((u32)(byte & 0x0F) << 12)
769 | ((u32)(s[1] & 0x3F) << 6)
770 | ((u32)(s[2] & 0x3F));
771 /* Check valid ranges */
772 #if ALLOW_BROKEN_UNICODE
773 if (((*wc >= 0x800) && (*wc <= 0xD7FF))
774 || ((*wc >= 0xD800) && (*wc <= 0xDFFF))
775 || ((*wc >= 0xe000) && (*wc <= 0xFFFF)))
776 return 3;
777 #else
778 if (((*wc >= 0x800) && (*wc <= 0xD7FF))
779 || ((*wc >= 0xe000) && (*wc <= 0xFFFD)))
780 return 3;
781 #endif /* ALLOW_BROKEN_UNICODE */
782 }
783 goto fail;
784 /* four-byte */
785 } else if (byte < 0xF5) {
786 if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)
787 && ((s[3] & 0xC0) == 0x80)) {
788 *wc = ((u32)(byte & 0x07) << 18)
789 | ((u32)(s[1] & 0x3F) << 12)
790 | ((u32)(s[2] & 0x3F) << 6)
791 | ((u32)(s[3] & 0x3F));
792 /* Check valid ranges */
793 if ((*wc <= 0x10ffff) && (*wc >= 0x10000))
794 return 4;
795 }
796 goto fail;
797 }
798 fail:
799 errno = EILSEQ;
800 return -1;
801 }
802
803 /**
804 * ntfs_utf8_to_utf16 - convert a UTF-8 string to a UTF-16LE string
805 * @ins: input multibyte string buffer
806 * @outs: on return contains the (allocated) output utf16 string
807 * @outs_len: length of output buffer in utf16 characters
808 *
809 * Return -1 with errno set.
810 */
ntfs_utf8_to_utf16(const char * ins,ntfschar ** outs)811 static int ntfs_utf8_to_utf16(const char *ins, ntfschar **outs)
812 {
813 #if defined(__APPLE__) || defined(__DARWIN__)
814 #ifdef ENABLE_NFCONV
815 char *new_ins = NULL;
816 if(nfconvert_utf8) {
817 int new_ins_len;
818 new_ins_len = ntfs_macosx_normalize_utf8(ins, &new_ins, 1); // Normalize to composed form
819 if(new_ins_len >= 0)
820 ins = new_ins;
821 else
822 ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFC: %s\n", ins);
823 }
824 #endif /* ENABLE_NFCONV */
825 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
826 const char *t = ins;
827 u32 wc;
828 BOOL allocated;
829 ntfschar *outpos;
830 int shorts, ret = -1;
831
832 shorts = utf8_to_utf16_size(ins);
833 if (shorts < 0)
834 goto fail;
835
836 allocated = FALSE;
837 if (!*outs) {
838 *outs = ntfs_malloc((shorts + 1) * sizeof(ntfschar));
839 if (!*outs)
840 goto fail;
841 allocated = TRUE;
842 }
843
844 outpos = *outs;
845
846 while(1) {
847 int m = utf8_to_unicode(&wc, t);
848 if (m <= 0) {
849 if (m < 0) {
850 /* do not leave space allocated if failed */
851 if (allocated) {
852 free(*outs);
853 *outs = (ntfschar*)NULL;
854 }
855 goto fail;
856 }
857 *outpos++ = const_cpu_to_le16(0);
858 break;
859 }
860 if (wc < 0x10000)
861 *outpos++ = cpu_to_le16(wc);
862 else {
863 wc -= 0x10000;
864 *outpos++ = cpu_to_le16((wc >> 10) + 0xd800);
865 *outpos++ = cpu_to_le16((wc & 0x3ff) + 0xdc00);
866 }
867 t += m;
868 }
869
870 ret = --outpos - *outs;
871 fail:
872 #if defined(__APPLE__) || defined(__DARWIN__)
873 #ifdef ENABLE_NFCONV
874 if(new_ins != NULL)
875 free(new_ins);
876 #endif /* ENABLE_NFCONV */
877 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
878 return ret;
879 }
880
881 /**
882 * ntfs_ucstombs - convert a little endian Unicode string to a multibyte string
883 * @ins: input Unicode string buffer
884 * @ins_len: length of input string in Unicode characters
885 * @outs: on return contains the (allocated) output multibyte string
886 * @outs_len: length of output buffer in bytes (ignored if *@outs is NULL)
887 *
888 * Convert the input little endian, 2-byte Unicode string @ins, of length
889 * @ins_len into the multibyte string format dictated by the current locale.
890 *
891 * If *@outs is NULL, the function allocates the string and the caller is
892 * responsible for calling free(*@outs); when finished with it.
893 *
894 * On success the function returns the number of bytes written to the output
895 * string *@outs (>= 0), not counting the terminating NULL byte. If the output
896 * string buffer was allocated, *@outs is set to it.
897 *
898 * On error, -1 is returned, and errno is set to the error code. The following
899 * error codes can be expected:
900 * EINVAL Invalid arguments (e.g. @ins or @outs is NULL).
901 * EILSEQ The input string cannot be represented as a multibyte
902 * sequence according to the current locale.
903 * ENAMETOOLONG Destination buffer is too small for input string.
904 * ENOMEM Not enough memory to allocate destination buffer.
905 */
ntfs_ucstombs(const ntfschar * ins,const int ins_len,char ** outs,int outs_len)906 int ntfs_ucstombs(const ntfschar *ins, const int ins_len, char **outs,
907 int outs_len)
908 {
909 char *mbs;
910 int mbs_len;
911 #ifdef MB_CUR_MAX
912 wchar_t wc;
913 int i, o;
914 int cnt = 0;
915 #ifdef HAVE_MBSINIT
916 mbstate_t mbstate;
917 #endif
918 #endif /* MB_CUR_MAX */
919
920 if (!ins || !outs) {
921 errno = EINVAL;
922 return -1;
923 }
924 mbs = *outs;
925 mbs_len = outs_len;
926 if (mbs && !mbs_len) {
927 errno = ENAMETOOLONG;
928 return -1;
929 }
930 if (use_utf8)
931 return ntfs_utf16_to_utf8(ins, ins_len, outs, outs_len);
932 #ifdef MB_CUR_MAX
933 if (!mbs) {
934 mbs_len = (ins_len + 1) * MB_CUR_MAX;
935 mbs = ntfs_malloc(mbs_len);
936 if (!mbs)
937 return -1;
938 }
939 #ifdef HAVE_MBSINIT
940 memset(&mbstate, 0, sizeof(mbstate));
941 #else
942 wctomb(NULL, 0);
943 #endif
944 for (i = o = 0; i < ins_len; i++) {
945 /* Reallocate memory if necessary or abort. */
946 if ((int)(o + MB_CUR_MAX) > mbs_len) {
947 char *tc;
948 if (mbs == *outs) {
949 errno = ENAMETOOLONG;
950 return -1;
951 }
952 tc = ntfs_malloc((mbs_len + 64) & ~63);
953 if (!tc)
954 goto err_out;
955 memcpy(tc, mbs, mbs_len);
956 mbs_len = (mbs_len + 64) & ~63;
957 free(mbs);
958 mbs = tc;
959 }
960 /* Convert the LE Unicode character to a CPU wide character. */
961 wc = (wchar_t)le16_to_cpu(ins[i]);
962 if (!wc)
963 break;
964 /* Convert the CPU endian wide character to multibyte. */
965 #ifdef HAVE_MBSINIT
966 cnt = wcrtomb(mbs + o, wc, &mbstate);
967 #else
968 cnt = wctomb(mbs + o, wc);
969 #endif
970 if (cnt == -1)
971 goto err_out;
972 if (cnt <= 0) {
973 ntfs_log_debug("Eeek. cnt <= 0, cnt = %i\n", cnt);
974 errno = EINVAL;
975 goto err_out;
976 }
977 o += cnt;
978 }
979 #ifdef HAVE_MBSINIT
980 /* Make sure we are back in the initial state. */
981 if (!mbsinit(&mbstate)) {
982 ntfs_log_debug("Eeek. mbstate not in initial state!\n");
983 errno = EILSEQ;
984 goto err_out;
985 }
986 #endif
987 /* Now write the NULL character. */
988 mbs[o] = '\0';
989 if (*outs != mbs)
990 *outs = mbs;
991 return o;
992 err_out:
993 if (mbs != *outs) {
994 int eo = errno;
995 free(mbs);
996 errno = eo;
997 }
998 #else /* MB_CUR_MAX */
999 errno = EILSEQ;
1000 #endif /* MB_CUR_MAX */
1001 return -1;
1002 }
1003
1004 /**
1005 * ntfs_mbstoucs - convert a multibyte string to a little endian Unicode string
1006 * @ins: input multibyte string buffer
1007 * @outs: on return contains the (allocated) output Unicode string
1008 *
1009 * Convert the input multibyte string @ins, from the current locale into the
1010 * corresponding little endian, 2-byte Unicode string.
1011 *
1012 * The function allocates the string and the caller is responsible for calling
1013 * free(*@outs); when finished with it.
1014 *
1015 * On success the function returns the number of Unicode characters written to
1016 * the output string *@outs (>= 0), not counting the terminating Unicode NULL
1017 * character.
1018 *
1019 * On error, -1 is returned, and errno is set to the error code. The following
1020 * error codes can be expected:
1021 * EINVAL Invalid arguments (e.g. @ins or @outs is NULL).
1022 * EILSEQ The input string cannot be represented as a Unicode
1023 * string according to the current locale.
1024 * ENAMETOOLONG Destination buffer is too small for input string.
1025 * ENOMEM Not enough memory to allocate destination buffer.
1026 */
ntfs_mbstoucs(const char * ins,ntfschar ** outs)1027 int ntfs_mbstoucs(const char *ins, ntfschar **outs)
1028 {
1029 #ifdef MB_CUR_MAX
1030 ntfschar *ucs;
1031 const char *s;
1032 wchar_t wc;
1033 int i, o, cnt, ins_len, ucs_len, ins_size;
1034 #ifdef HAVE_MBSINIT
1035 mbstate_t mbstate;
1036 #endif
1037 #endif /* MB_CUR_MAX */
1038
1039 if (!ins || !outs) {
1040 errno = EINVAL;
1041 return -1;
1042 }
1043
1044 if (use_utf8)
1045 return ntfs_utf8_to_utf16(ins, outs);
1046
1047 #ifdef MB_CUR_MAX
1048 /* Determine the size of the multi-byte string in bytes. */
1049 ins_size = strlen(ins);
1050 /* Determine the length of the multi-byte string. */
1051 s = ins;
1052 #if defined(HAVE_MBSINIT)
1053 memset(&mbstate, 0, sizeof(mbstate));
1054 ins_len = mbsrtowcs(NULL, (const char **)&s, 0, &mbstate);
1055 #ifdef __CYGWIN32__
1056 if (!ins_len && *ins) {
1057 /* Older Cygwin had broken mbsrtowcs() implementation. */
1058 ins_len = strlen(ins);
1059 }
1060 #endif
1061 #elif !defined(DJGPP)
1062 ins_len = mbstowcs(NULL, s, 0);
1063 #else
1064 /* Eeek!!! DJGPP has broken mbstowcs() implementation!!! */
1065 ins_len = strlen(ins);
1066 #endif
1067 if (ins_len == -1)
1068 return ins_len;
1069 #ifdef HAVE_MBSINIT
1070 if ((s != ins) || !mbsinit(&mbstate)) {
1071 #else
1072 if (s != ins) {
1073 #endif
1074 errno = EILSEQ;
1075 return -1;
1076 }
1077 /* Add the NULL terminator. */
1078 ins_len++;
1079 ucs_len = ins_len;
1080 ucs = ntfs_malloc(ucs_len * sizeof(ntfschar));
1081 if (!ucs)
1082 return -1;
1083 #ifdef HAVE_MBSINIT
1084 memset(&mbstate, 0, sizeof(mbstate));
1085 #else
1086 mbtowc(NULL, NULL, 0);
1087 #endif
1088 for (i = o = cnt = 0; i < ins_size; i += cnt, o++) {
1089 /* Reallocate memory if necessary. */
1090 if (o >= ucs_len) {
1091 ntfschar *tc;
1092 ucs_len = (ucs_len * sizeof(ntfschar) + 64) & ~63;
1093 tc = realloc(ucs, ucs_len);
1094 if (!tc)
1095 goto err_out;
1096 ucs = tc;
1097 ucs_len /= sizeof(ntfschar);
1098 }
1099 /* Convert the multibyte character to a wide character. */
1100 #ifdef HAVE_MBSINIT
1101 cnt = mbrtowc(&wc, ins + i, ins_size - i, &mbstate);
1102 #else
1103 cnt = mbtowc(&wc, ins + i, ins_size - i);
1104 #endif
1105 if (!cnt)
1106 break;
1107 if (cnt == -1)
1108 goto err_out;
1109 if (cnt < -1) {
1110 ntfs_log_trace("Eeek. cnt = %i\n", cnt);
1111 errno = EINVAL;
1112 goto err_out;
1113 }
1114 /* Make sure we are not overflowing the NTFS Unicode set. */
1115 if ((unsigned long)wc >= (unsigned long)(1 <<
1116 (8 * sizeof(ntfschar)))) {
1117 errno = EILSEQ;
1118 goto err_out;
1119 }
1120 /* Convert the CPU wide character to a LE Unicode character. */
1121 ucs[o] = cpu_to_le16(wc);
1122 }
1123 #ifdef HAVE_MBSINIT
1124 /* Make sure we are back in the initial state. */
1125 if (!mbsinit(&mbstate)) {
1126 ntfs_log_trace("Eeek. mbstate not in initial state!\n");
1127 errno = EILSEQ;
1128 goto err_out;
1129 }
1130 #endif
1131 /* Now write the NULL character. */
1132 ucs[o] = const_cpu_to_le16(L'\0');
1133 *outs = ucs;
1134 return o;
1135 err_out:
1136 free(ucs);
1137 #else /* MB_CUR_MAX */
1138 errno = EILSEQ;
1139 #endif /* MB_CUR_MAX */
1140 return -1;
1141 }
1142
1143 /*
1144 * Turn a UTF8 name uppercase
1145 *
1146 * Returns an allocated uppercase name which has to be freed by caller
1147 * or NULL if there is an error (described by errno)
1148 */
1149
1150 char *ntfs_uppercase_mbs(const char *low,
1151 const ntfschar *upcase, u32 upcase_size)
1152 {
1153 int size;
1154 char *upp;
1155 u32 wc;
1156 int n;
1157 const char *s;
1158 char *t;
1159
1160 size = strlen(low);
1161 upp = (char*)ntfs_malloc(3*size + 1);
1162 if (upp) {
1163 s = low;
1164 t = upp;
1165 do {
1166 n = utf8_to_unicode(&wc, s);
1167 if (n > 0) {
1168 if (wc < upcase_size)
1169 wc = le16_to_cpu(upcase[wc]);
1170 if (wc < 0x80)
1171 *t++ = wc;
1172 else if (wc < 0x800) {
1173 *t++ = (0xc0 | ((wc >> 6) & 0x3f));
1174 *t++ = 0x80 | (wc & 0x3f);
1175 } else if (wc < 0x10000) {
1176 *t++ = 0xe0 | (wc >> 12);
1177 *t++ = 0x80 | ((wc >> 6) & 0x3f);
1178 *t++ = 0x80 | (wc & 0x3f);
1179 } else {
1180 *t++ = 0xf0 | ((wc >> 18) & 7);
1181 *t++ = 0x80 | ((wc >> 12) & 63);
1182 *t++ = 0x80 | ((wc >> 6) & 0x3f);
1183 *t++ = 0x80 | (wc & 0x3f);
1184 }
1185 s += n;
1186 }
1187 } while (n > 0);
1188 if (n < 0) {
1189 free(upp);
1190 upp = (char*)NULL;
1191 errno = EILSEQ;
1192 }
1193 *t = 0;
1194 }
1195 return (upp);
1196 }
1197
1198 /**
1199 * ntfs_upcase_table_build - build the default upcase table for NTFS
1200 * @uc: destination buffer where to store the built table
1201 * @uc_len: size of destination buffer in bytes
1202 *
1203 * ntfs_upcase_table_build() builds the default upcase table for NTFS and
1204 * stores it in the caller supplied buffer @uc of size @uc_len.
1205 *
1206 * Note, @uc_len must be at least 128kiB in size or bad things will happen!
1207 */
1208 void ntfs_upcase_table_build(ntfschar *uc, u32 uc_len)
1209 {
1210 struct NEWUPPERCASE {
1211 unsigned short first;
1212 unsigned short last;
1213 short diff;
1214 unsigned char step;
1215 unsigned char osmajor;
1216 unsigned char osminor;
1217 } ;
1218
1219 /*
1220 * This is the table as defined by Windows XP
1221 */
1222 static int uc_run_table[][3] = { /* Start, End, Add */
1223 {0x0061, 0x007B, -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72, 74},
1224 {0x00E0, 0x00F7, -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76, 86},
1225 {0x00F8, 0x00FF, -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100},
1226 {0x0256, 0x0258, -205}, {0x1F00, 0x1F08, 8}, {0x1F78, 0x1F7A, 128},
1227 {0x028A, 0x028C, -217}, {0x1F10, 0x1F16, 8}, {0x1F7A, 0x1F7C, 112},
1228 {0x03AC, 0x03AD, -38}, {0x1F20, 0x1F28, 8}, {0x1F7C, 0x1F7E, 126},
1229 {0x03AD, 0x03B0, -37}, {0x1F30, 0x1F38, 8}, {0x1FB0, 0x1FB2, 8},
1230 {0x03B1, 0x03C2, -32}, {0x1F40, 0x1F46, 8}, {0x1FD0, 0x1FD2, 8},
1231 {0x03C2, 0x03C3, -31}, {0x1F51, 0x1F52, 8}, {0x1FE0, 0x1FE2, 8},
1232 {0x03C3, 0x03CC, -32}, {0x1F53, 0x1F54, 8}, {0x1FE5, 0x1FE6, 7},
1233 {0x03CC, 0x03CD, -64}, {0x1F55, 0x1F56, 8}, {0x2170, 0x2180, -16},
1234 {0x03CD, 0x03CF, -63}, {0x1F57, 0x1F58, 8}, {0x24D0, 0x24EA, -26},
1235 {0x0430, 0x0450, -32}, {0x1F60, 0x1F68, 8}, {0xFF41, 0xFF5B, -32},
1236 {0}
1237 };
1238 static int uc_dup_table[][2] = { /* Start, End */
1239 {0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC},
1240 {0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB},
1241 {0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5},
1242 {0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9},
1243 {0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95},
1244 {0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9},
1245 {0}
1246 };
1247 static int uc_byte_table[][2] = { /* Offset, Value */
1248 {0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196},
1249 {0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C},
1250 {0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D},
1251 {0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F},
1252 {0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9},
1253 {0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE},
1254 {0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7},
1255 {0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197},
1256 {0}
1257 };
1258
1259 /*
1260 * Changes which were applied to later Windows versions
1261 *
1262 * md5 for $UpCase from Winxp : 6fa3db2468275286210751e869d36373
1263 * Vista : 2f03b5a69d486ff3864cecbd07f24440
1264 * Win8 : 7ff498a44e45e77374cc7c962b1b92f2
1265 */
1266 static const struct NEWUPPERCASE newuppercase[] = {
1267 /* from Windows 6.0 (Vista) */
1268 { 0x37b, 0x37d, 0x82, 1, 6, 0 },
1269 { 0x1f80, 0x1f87, 0x8, 1, 6, 0 },
1270 { 0x1f90, 0x1f97, 0x8, 1, 6, 0 },
1271 { 0x1fa0, 0x1fa7, 0x8, 1, 6, 0 },
1272 { 0x2c30, 0x2c5e, -0x30, 1, 6, 0 },
1273 { 0x2d00, 0x2d25, -0x1c60, 1, 6, 0 },
1274 { 0x2c68, 0x2c6c, -0x1, 2, 6, 0 },
1275 { 0x219, 0x21f, -0x1, 2, 6, 0 },
1276 { 0x223, 0x233, -0x1, 2, 6, 0 },
1277 { 0x247, 0x24f, -0x1, 2, 6, 0 },
1278 { 0x3d9, 0x3e1, -0x1, 2, 6, 0 },
1279 { 0x48b, 0x48f, -0x1, 2, 6, 0 },
1280 { 0x4fb, 0x513, -0x1, 2, 6, 0 },
1281 { 0x2c81, 0x2ce3, -0x1, 2, 6, 0 },
1282 { 0x3f8, 0x3fb, -0x1, 3, 6, 0 },
1283 { 0x4c6, 0x4ce, -0x1, 4, 6, 0 },
1284 { 0x23c, 0x242, -0x1, 6, 6, 0 },
1285 { 0x4ed, 0x4f7, -0x1, 10, 6, 0 },
1286 { 0x450, 0x45d, -0x50, 13, 6, 0 },
1287 { 0x2c61, 0x2c76, -0x1, 21, 6, 0 },
1288 { 0x1fcc, 0x1ffc, -0x9, 48, 6, 0 },
1289 { 0x180, 0x180, 0xc3, 1, 6, 0 },
1290 { 0x195, 0x195, 0x61, 1, 6, 0 },
1291 { 0x19a, 0x19a, 0xa3, 1, 6, 0 },
1292 { 0x19e, 0x19e, 0x82, 1, 6, 0 },
1293 { 0x1bf, 0x1bf, 0x38, 1, 6, 0 },
1294 { 0x1f9, 0x1f9, -0x1, 1, 6, 0 },
1295 { 0x23a, 0x23a, 0x2a2b, 1, 6, 0 },
1296 { 0x23e, 0x23e, 0x2a28, 1, 6, 0 },
1297 { 0x26b, 0x26b, 0x29f7, 1, 6, 0 },
1298 { 0x27d, 0x27d, 0x29e7, 1, 6, 0 },
1299 { 0x280, 0x280, -0xda, 1, 6, 0 },
1300 { 0x289, 0x289, -0x45, 1, 6, 0 },
1301 { 0x28c, 0x28c, -0x47, 1, 6, 0 },
1302 { 0x3f2, 0x3f2, 0x7, 1, 6, 0 },
1303 { 0x4cf, 0x4cf, -0xf, 1, 6, 0 },
1304 { 0x1d7d, 0x1d7d, 0xee6, 1, 6, 0 },
1305 { 0x1fb3, 0x1fb3, 0x9, 1, 6, 0 },
1306 { 0x214e, 0x214e, -0x1c, 1, 6, 0 },
1307 { 0x2184, 0x2184, -0x1, 1, 6, 0 },
1308 /* from Windows 6.1 (Win7) */
1309 { 0x23a, 0x23e, 0x0, 4, 6, 1 },
1310 { 0x250, 0x250, 0x2a1f, 2, 6, 1 },
1311 { 0x251, 0x251, 0x2a1c, 2, 6, 1 },
1312 { 0x271, 0x271, 0x29fd, 2, 6, 1 },
1313 { 0x371, 0x373, -0x1, 2, 6, 1 },
1314 { 0x377, 0x377, -0x1, 2, 6, 1 },
1315 { 0x3c2, 0x3c2, 0x0, 2, 6, 1 },
1316 { 0x3d7, 0x3d7, -0x8, 2, 6, 1 },
1317 { 0x515, 0x523, -0x1, 2, 6, 1 },
1318 /* below, -0x75fc stands for 0x8a04 and truncation */
1319 { 0x1d79, 0x1d79, -0x75fc, 2, 6, 1 },
1320 { 0x1efb, 0x1eff, -0x1, 2, 6, 1 },
1321 { 0x1fc3, 0x1ff3, 0x9, 48, 6, 1 },
1322 { 0x1fcc, 0x1ffc, 0x0, 48, 6, 1 },
1323 { 0x2c65, 0x2c65, -0x2a2b, 2, 6, 1 },
1324 { 0x2c66, 0x2c66, -0x2a28, 2, 6, 1 },
1325 { 0x2c73, 0x2c73, -0x1, 2, 6, 1 },
1326 { 0xa641, 0xa65f, -0x1, 2, 6, 1 },
1327 { 0xa663, 0xa66d, -0x1, 2, 6, 1 },
1328 { 0xa681, 0xa697, -0x1, 2, 6, 1 },
1329 { 0xa723, 0xa72f, -0x1, 2, 6, 1 },
1330 { 0xa733, 0xa76f, -0x1, 2, 6, 1 },
1331 { 0xa77a, 0xa77c, -0x1, 2, 6, 1 },
1332 { 0xa77f, 0xa787, -0x1, 2, 6, 1 },
1333 { 0xa78c, 0xa78c, -0x1, 2, 6, 1 },
1334 /* end mark */
1335 { 0 }
1336 } ;
1337
1338 int i, r;
1339 int k, off;
1340 const struct NEWUPPERCASE *puc;
1341
1342 memset((char*)uc, 0, uc_len);
1343 uc_len >>= 1;
1344 if (uc_len > 65536)
1345 uc_len = 65536;
1346 for (i = 0; (u32)i < uc_len; i++)
1347 uc[i] = cpu_to_le16(i);
1348 for (r = 0; uc_run_table[r][0]; r++) {
1349 off = uc_run_table[r][2];
1350 for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++)
1351 uc[i] = cpu_to_le16(i + off);
1352 }
1353 for (r = 0; uc_dup_table[r][0]; r++)
1354 for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2)
1355 uc[i + 1] = cpu_to_le16(i);
1356 for (r = 0; uc_byte_table[r][0]; r++) {
1357 k = uc_byte_table[r][1];
1358 uc[uc_byte_table[r][0]] = cpu_to_le16(k);
1359 }
1360 for (r=0; newuppercase[r].first; r++) {
1361 puc = &newuppercase[r];
1362 if ((puc->osmajor < UPCASE_MAJOR)
1363 || ((puc->osmajor == UPCASE_MAJOR)
1364 && (puc->osminor <= UPCASE_MINOR))) {
1365 off = puc->diff;
1366 for (i = puc->first; i <= puc->last; i += puc->step)
1367 uc[i] = cpu_to_le16(i + off);
1368 }
1369 }
1370 }
1371
1372 /*
1373 * Allocate and build the default upcase table
1374 *
1375 * Returns the number of entries
1376 * 0 if failed
1377 */
1378
1379 #define UPCASE_LEN 65536 /* default number of entries in upcase */
1380
1381 u32 ntfs_upcase_build_default(ntfschar **upcase)
1382 {
1383 u32 upcase_len = 0;
1384
1385 *upcase = (ntfschar*)ntfs_malloc(UPCASE_LEN*2);
1386 if (*upcase) {
1387 ntfs_upcase_table_build(*upcase, UPCASE_LEN*2);
1388 upcase_len = UPCASE_LEN;
1389 }
1390 return (upcase_len);
1391 }
1392
1393 /*
1394 * Build a table for converting to lower case
1395 *
1396 * This is only meaningful when there is a single lower case
1397 * character leading to an upper case one, and currently the
1398 * only exception is the greek letter sigma which has a single
1399 * upper case glyph (code U+03A3), but two lower case glyphs
1400 * (code U+03C3 and U+03C2, the latter to be used at the end
1401 * of a word). In the following implementation the upper case
1402 * sigma will be lowercased as U+03C3.
1403 */
1404
1405 ntfschar *ntfs_locase_table_build(const ntfschar *uc, u32 uc_cnt)
1406 {
1407 ntfschar *lc;
1408 u32 upp;
1409 u32 i;
1410
1411 lc = (ntfschar*)ntfs_malloc(uc_cnt*sizeof(ntfschar));
1412 if (lc) {
1413 for (i=0; i<uc_cnt; i++)
1414 lc[i] = cpu_to_le16(i);
1415 for (i=0; i<uc_cnt; i++) {
1416 upp = le16_to_cpu(uc[i]);
1417 if ((upp != i) && (upp < uc_cnt))
1418 lc[upp] = cpu_to_le16(i);
1419 }
1420 } else
1421 ntfs_log_error("Could not build the locase table\n");
1422 return (lc);
1423 }
1424
1425 /**
1426 * ntfs_str2ucs - convert a string to a valid NTFS file name
1427 * @s: input string
1428 * @len: length of output buffer in Unicode characters
1429 *
1430 * Convert the input @s string into the corresponding little endian,
1431 * 2-byte Unicode string. The length of the converted string is less
1432 * or equal to the maximum length allowed by the NTFS format (255).
1433 *
1434 * If @s is NULL then return AT_UNNAMED.
1435 *
1436 * On success the function returns the Unicode string in an allocated
1437 * buffer and the caller is responsible to free it when it's not needed
1438 * anymore.
1439 *
1440 * On error NULL is returned and errno is set to the error code.
1441 */
1442 ntfschar *ntfs_str2ucs(const char *s, int *len)
1443 {
1444 ntfschar *ucs = NULL;
1445
1446 if (s && ((*len = ntfs_mbstoucs(s, &ucs)) == -1)) {
1447 ntfs_log_perror("Couldn't convert '%s' to Unicode", s);
1448 return NULL;
1449 }
1450 if (*len > NTFS_MAX_NAME_LEN) {
1451 free(ucs);
1452 errno = ENAMETOOLONG;
1453 return NULL;
1454 }
1455 if (!ucs || !*len) {
1456 ucs = AT_UNNAMED;
1457 *len = 0;
1458 }
1459 return ucs;
1460 }
1461
1462 /**
1463 * ntfs_ucsfree - free memory allocated by ntfs_str2ucs()
1464 * @ucs input string to be freed
1465 *
1466 * Free memory at @ucs and which was allocated by ntfs_str2ucs.
1467 *
1468 * Return value: none.
1469 */
1470 void ntfs_ucsfree(ntfschar *ucs)
1471 {
1472 if (ucs && (ucs != AT_UNNAMED))
1473 free(ucs);
1474 }
1475
1476 /*
1477 * Check whether a name contains no chars forbidden
1478 * for DOS or Win32 use
1479 *
1480 * If @strict is TRUE, then trailing dots and spaces are forbidden.
1481 * These names are technically allowed in the Win32 namespace, but
1482 * they can be problematic. See comment for FILE_NAME_WIN32.
1483 *
1484 * If there is a bad char, errno is set to EINVAL
1485 */
1486
1487 BOOL ntfs_forbidden_chars(const ntfschar *name, int len, BOOL strict)
1488 {
1489 BOOL forbidden;
1490 int ch;
1491 int i;
1492 static const u32 mainset = (1L << ('\"' - 0x20))
1493 | (1L << ('*' - 0x20))
1494 | (1L << ('/' - 0x20))
1495 | (1L << (':' - 0x20))
1496 | (1L << ('<' - 0x20))
1497 | (1L << ('>' - 0x20))
1498 | (1L << ('?' - 0x20));
1499
1500 forbidden = (len == 0) ||
1501 (strict && (name[len-1] == const_cpu_to_le16(' ') ||
1502 name[len-1] == const_cpu_to_le16('.')));
1503 for (i=0; i<len; i++) {
1504 ch = le16_to_cpu(name[i]);
1505 if ((ch < 0x20)
1506 || ((ch < 0x40)
1507 && ((1L << (ch - 0x20)) & mainset))
1508 || (ch == '\\')
1509 || (ch == '|'))
1510 forbidden = TRUE;
1511 }
1512 if (forbidden)
1513 errno = EINVAL;
1514 return (forbidden);
1515 }
1516
1517 /*
1518 * Check whether a name contains no forbidden chars and
1519 * is not a reserved name for DOS or Win32 use
1520 *
1521 * The reserved names are CON, PRN, AUX, NUL, COM1..COM9, LPT1..LPT9
1522 * with no suffix or any suffix.
1523 *
1524 * If @strict is TRUE, then trailing dots and spaces are forbidden.
1525 * These names are technically allowed in the Win32 namespace, but
1526 * they can be problematic. See comment for FILE_NAME_WIN32.
1527 *
1528 * If the name is forbidden, errno is set to EINVAL
1529 */
1530
1531 BOOL ntfs_forbidden_names(ntfs_volume *vol, const ntfschar *name, int len,
1532 BOOL strict)
1533 {
1534 BOOL forbidden;
1535 int h;
1536 static const ntfschar dot = const_cpu_to_le16('.');
1537 static const ntfschar con[] = { const_cpu_to_le16('c'),
1538 const_cpu_to_le16('o'), const_cpu_to_le16('n') };
1539 static const ntfschar prn[] = { const_cpu_to_le16('p'),
1540 const_cpu_to_le16('r'), const_cpu_to_le16('n') };
1541 static const ntfschar aux[] = { const_cpu_to_le16('a'),
1542 const_cpu_to_le16('u'), const_cpu_to_le16('x') };
1543 static const ntfschar nul[] = { const_cpu_to_le16('n'),
1544 const_cpu_to_le16('u'), const_cpu_to_le16('l') };
1545 static const ntfschar com[] = { const_cpu_to_le16('c'),
1546 const_cpu_to_le16('o'), const_cpu_to_le16('m') };
1547 static const ntfschar lpt[] = { const_cpu_to_le16('l'),
1548 const_cpu_to_le16('p'), const_cpu_to_le16('t') };
1549
1550 forbidden = ntfs_forbidden_chars(name, len, strict);
1551 if (!forbidden && (len >= 3)) {
1552 /*
1553 * Rough hash check to tell whether the first couple of chars
1554 * may be one of CO PR AU NU LP or lowercase variants.
1555 */
1556 h = ((le16_to_cpu(name[0]) & 31)*48)
1557 ^ ((le16_to_cpu(name[1]) & 31)*165);
1558 if ((h % 23) == 17) {
1559 /* do a full check, depending on the third char */
1560 switch (le16_to_cpu(name[2]) & ~0x20) {
1561 case 'N' :
1562 if (((len == 3) || (name[3] == dot))
1563 && (!ntfs_ucsncasecmp(name, con, 3,
1564 vol->upcase, vol->upcase_len)
1565 || !ntfs_ucsncasecmp(name, prn, 3,
1566 vol->upcase, vol->upcase_len)))
1567 forbidden = TRUE;
1568 break;
1569 case 'X' :
1570 if (((len == 3) || (name[3] == dot))
1571 && !ntfs_ucsncasecmp(name, aux, 3,
1572 vol->upcase, vol->upcase_len))
1573 forbidden = TRUE;
1574 break;
1575 case 'L' :
1576 if (((len == 3) || (name[3] == dot))
1577 && !ntfs_ucsncasecmp(name, nul, 3,
1578 vol->upcase, vol->upcase_len))
1579 forbidden = TRUE;
1580 break;
1581 case 'M' :
1582 if ((len > 3)
1583 && (le16_to_cpu(name[3]) >= '1')
1584 && (le16_to_cpu(name[3]) <= '9')
1585 && ((len == 4) || (name[4] == dot))
1586 && !ntfs_ucsncasecmp(name, com, 3,
1587 vol->upcase, vol->upcase_len))
1588 forbidden = TRUE;
1589 break;
1590 case 'T' :
1591 if ((len > 3)
1592 && (le16_to_cpu(name[3]) >= '1')
1593 && (le16_to_cpu(name[3]) <= '9')
1594 && ((len == 4) || (name[4] == dot))
1595 && !ntfs_ucsncasecmp(name, lpt, 3,
1596 vol->upcase, vol->upcase_len))
1597 forbidden = TRUE;
1598 break;
1599 }
1600 }
1601 }
1602
1603 if (forbidden)
1604 errno = EINVAL;
1605 return (forbidden);
1606 }
1607
1608 /*
1609 * Check whether the same name can be used as a DOS and
1610 * a Win32 name
1611 *
1612 * The names must be the same, or the short name the uppercase
1613 * variant of the long name
1614 */
1615
1616 BOOL ntfs_collapsible_chars(ntfs_volume *vol,
1617 const ntfschar *shortname, int shortlen,
1618 const ntfschar *longname, int longlen)
1619 {
1620 BOOL collapsible;
1621 unsigned int ch;
1622 unsigned int cs;
1623 int i;
1624
1625 collapsible = shortlen == longlen;
1626 for (i=0; collapsible && (i<shortlen); i++) {
1627 ch = le16_to_cpu(longname[i]);
1628 cs = le16_to_cpu(shortname[i]);
1629 if ((cs != ch)
1630 && ((ch >= vol->upcase_len)
1631 || (cs >= vol->upcase_len)
1632 || (vol->upcase[cs] != vol->upcase[ch])))
1633 collapsible = FALSE;
1634 }
1635 return (collapsible);
1636 }
1637
1638 /*
1639 * Define the character encoding to be used.
1640 * Use UTF-8 unless specified otherwise.
1641 */
1642
1643 int ntfs_set_char_encoding(const char *locale)
1644 {
1645 use_utf8 = 0;
1646 if (!locale || strstr(locale,"utf8") || strstr(locale,"UTF8")
1647 || strstr(locale,"utf-8") || strstr(locale,"UTF-8"))
1648 use_utf8 = 1;
1649 else
1650 if (setlocale(LC_ALL, locale))
1651 use_utf8 = 0;
1652 else {
1653 ntfs_log_error("Invalid locale, encoding to UTF-8\n");
1654 use_utf8 = 1;
1655 }
1656 return 0; /* always successful */
1657 }
1658
1659 #if defined(__APPLE__) || defined(__DARWIN__)
1660
/*
 *		Enable or disable the normalization of file names
 *
 *	@normalize must be 0 or 1, it is recorded into nfconvert_utf8.
 *
 *	Returns 0 if the setting was recorded,
 *		-1 if @normalize is neither 0 nor 1, or if the code
 *		was built without ENABLE_NFCONV
 */
int ntfs_macosx_normalize_filenames(int normalize) {
#ifdef ENABLE_NFCONV
	if ((normalize != 0) && (normalize != 1))
		return -1;
	nfconvert_utf8 = normalize;
	return 0;
#else
	return -1;
#endif /* ENABLE_NFCONV */
}
1674
/*
 *		Normalize an UTF-8 string through CoreFoundation
 *
 *	@utf8_string is converted to the composed form (NFC) if @composed
 *	is not zero, or to the decomposed form (NFD) otherwise. The result
 *	is a newly allocated '\0'-terminated string returned in *target,
 *	which is only written on success.
 *
 *	Returns the length in bytes of the normalized string (not
 *		counting the terminator) if successful,
 *		-2 if the CFString could not be created,
 *		-3 if its mutable copy could not be created,
 *		-1 for any other failure, or if the code was built
 *		without ENABLE_NFCONV
 */
int ntfs_macosx_normalize_utf8(const char *utf8_string, char **target,
		int composed)
{
#ifdef ENABLE_NFCONV
	/* For this code to compile, the CoreFoundation framework must be fed to
	 * the linker. */
	CFStringRef source;
	CFMutableStringRef work;
	CFRange whole;
	CFIndex needed;
	char *buf = NULL;
	int buflen = -1;

	/* Wrap the UTF-8 input into a CFString. */
	source = CFStringCreateWithCString(kCFAllocatorDefault,
			utf8_string, kCFStringEncodingUTF8);
	if (source == NULL) {
		ntfs_log_error("CFStringCreateWithCString failed!\n");
		return -2;
	}

	/* Normalizing is done in place, so a mutable copy is needed. */
	work = CFStringCreateMutableCopy(kCFAllocatorDefault, 0, source);
	CFRelease(source); /* No longer needed, whatever the outcome. */
	if (work == NULL) {
		ntfs_log_error("CFStringCreateMutableCopy failed!\n");
		return -3;
	}

	/* Bring the copy to the requested normalization form. */
	CFStringNormalize(work, (composed != 0 ?
		kCFStringNormalizationFormC : kCFStringNormalizationFormD));

	/* First call without a buffer, to get the size needed in UTF-8. */
	whole = CFRangeMake(0, CFStringGetLength(work));
	if (CFStringGetBytes(work, whole, kCFStringEncodingUTF8, 0, false,
			NULL, 0, &needed) <= 0) {
		ntfs_log_error("Could not perform check for required length of "
			"UTF-8 conversion of normalized CFMutableString.\n");
		goto release;
	}

	/* One more byte for the terminator, the buffer being zeroed. */
	buflen = sizeof(char) * (needed + 1);
	buf = ntfs_calloc(buflen);
	if (buf == NULL) {
		ntfs_log_error("Could not perform a ntfs_calloc of %d "
			"bytes for char *result.\n", buflen);
		goto release;
	}

	/* Second call, doing the actual conversion into the buffer. */
	if (CFStringGetBytes(work, whole, kCFStringEncodingUTF8, 0, false,
			(UInt8*) buf, buflen - 1, &needed) <= 0) {
		ntfs_log_error("Could not perform UTF-8 "
			"conversion of normalized "
			"CFMutableString.\n");
		free(buf);
		buf = NULL;
	}

release:
	CFRelease(work);

	if (buf == NULL)
		return -1;
	*target = buf;
	return buflen - 1;
#else
	return -1;
#endif /* ENABLE_NFCONV */
}
1756 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
1757