Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 581368f

Browse files
vmg authored and Edward Thomson committed
path: Use UTF8 iteration for HFS chars
1 parent 4320180 commit 581368f

File tree

3 files changed

+132
-81
lines changed

3 files changed

+132
-81
lines changed

src/path.c

Lines changed: 45 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -1199,93 +1199,57 @@ GIT_INLINE(bool) verify_dospath(
11991199
component[last] != ':');
12001200
}
12011201

1202-
GIT_INLINE(bool) verify_dotgit_hfs(const char *component, size_t len)
1202+
static int32_t next_hfs_char(const char **in, size_t *len)
12031203
{
1204-
const unsigned char *c;
1205-
int git = 0, ign = 0;
1206-
unsigned char one, two;
1207-
1208-
while (len) {
1209-
switch (*(c = (const unsigned char *)component++)) {
1210-
case '.':
1211-
if (ign || git++ != 0)
1212-
return true;
1213-
break;
1214-
case 'g':
1215-
case 'G':
1216-
if (ign || git++ != 1)
1217-
return true;
1218-
break;
1219-
case 'i':
1220-
case 'I':
1221-
if (ign || git++ != 2)
1222-
return true;
1223-
break;
1224-
case 't':
1225-
case 'T':
1226-
if (ign || git++ != 3)
1227-
return true;
1228-
break;
1229-
1230-
case 0xe2:
1231-
case 0xef:
1232-
if (ign++ != 0)
1233-
return true;
1234-
one = *c;
1235-
break;
1236-
1237-
case 0x80:
1238-
case 0x81:
1239-
if (ign++ != 1 || one != 0xe2)
1240-
return true;
1241-
two = *c;
1242-
break;
1243-
1244-
case 0xbb:
1245-
if (ign++ != 1 || one != 0xef)
1246-
return true;
1247-
two = *c;
1248-
break;
1249-
1250-
case 0x8c:
1251-
case 0x8d:
1252-
case 0x8e:
1253-
case 0x8f:
1254-
if (ign != 2 || two != 0x80)
1255-
return true;
1256-
ign = 0;
1257-
break;
1258-
1259-
case 0xaa:
1260-
case 0xab:
1261-
case 0xac:
1262-
case 0xad:
1263-
case 0xae:
1264-
if (ign != 2 || (two != 0x80 && two != 0x81))
1265-
return true;
1266-
ign = 0;
1267-
break;
1268-
1269-
case 0xaf:
1270-
if (ign != 2 || two != 0x81)
1271-
return true;
1272-
ign = 0;
1273-
break;
1274-
1275-
case 0xbf:
1276-
if (ign != 2 || two != 0xbb)
1277-
return true;
1278-
ign = 0;
1279-
break;
1204+
while (*len) {
1205+
int32_t codepoint;
1206+
int cp_len = git__utf8_iterate((const uint8_t *)(*in), (int)(*len), &codepoint);
1207+
if (cp_len < 0)
1208+
return -1;
12801209

1281-
default:
1282-
return true;
1210+
(*in) += cp_len;
1211+
(*len) -= cp_len;
1212+
1213+
/* these code points are ignored completely */
1214+
switch (codepoint) {
1215+
case 0x200c: /* ZERO WIDTH NON-JOINER */
1216+
case 0x200d: /* ZERO WIDTH JOINER */
1217+
case 0x200e: /* LEFT-TO-RIGHT MARK */
1218+
case 0x200f: /* RIGHT-TO-LEFT MARK */
1219+
case 0x202a: /* LEFT-TO-RIGHT EMBEDDING */
1220+
case 0x202b: /* RIGHT-TO-LEFT EMBEDDING */
1221+
case 0x202c: /* POP DIRECTIONAL FORMATTING */
1222+
case 0x202d: /* LEFT-TO-RIGHT OVERRIDE */
1223+
case 0x202e: /* RIGHT-TO-LEFT OVERRIDE */
1224+
case 0x206a: /* INHIBIT SYMMETRIC SWAPPING */
1225+
case 0x206b: /* ACTIVATE SYMMETRIC SWAPPING */
1226+
case 0x206c: /* INHIBIT ARABIC FORM SHAPING */
1227+
case 0x206d: /* ACTIVATE ARABIC FORM SHAPING */
1228+
case 0x206e: /* NATIONAL DIGIT SHAPES */
1229+
case 0x206f: /* NOMINAL DIGIT SHAPES */
1230+
case 0xfeff: /* ZERO WIDTH NO-BREAK SPACE */
1231+
continue;
12831232
}
12841233

1285-
len--;
1234+
/* fold into lowercase -- this will only fold characters in
1235+
* the ASCII range, which is perfectly fine, because the
1236+
* git folder name can only be composed of ascii characters
1237+
*/
1238+
return tolower(codepoint);
12861239
}
1240+
return 0; /* NULL byte -- end of string */
1241+
}
1242+
1243+
static bool verify_dotgit_hfs(const char *path, size_t len)
1244+
{
1245+
if (next_hfs_char(&path, &len) != '.' ||
1246+
next_hfs_char(&path, &len) != 'g' ||
1247+
next_hfs_char(&path, &len) != 'i' ||
1248+
next_hfs_char(&path, &len) != 't' ||
1249+
next_hfs_char(&path, &len) != 0)
1250+
return true;
12871251

1288-
return (ign || git != 4);
1252+
return false;
12891253
}
12901254

12911255
GIT_INLINE(bool) verify_char(unsigned char c, unsigned int flags)

src/util.c

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -663,3 +663,79 @@ void git__insertsort_r(
663663
if (freeswap)
664664
git__free(swapel);
665665
}
666+
667+
static const int8_t utf8proc_utf8class[256] = {
668+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
669+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
670+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
671+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
672+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
673+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
674+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
675+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
676+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
677+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
678+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
679+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
680+
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
681+
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
682+
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
683+
4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0
684+
};
685+
686+
int git__utf8_charlen(const uint8_t *str, int str_len)
687+
{
688+
int length, i;
689+
690+
length = utf8proc_utf8class[str[0]];
691+
if (!length)
692+
return -1;
693+
694+
if (str_len >= 0 && length > str_len)
695+
return -str_len;
696+
697+
for (i = 1; i < length; i++) {
698+
if ((str[i] & 0xC0) != 0x80)
699+
return -i;
700+
}
701+
702+
return length;
703+
}
704+
705+
int git__utf8_iterate(const uint8_t *str, int str_len, int32_t *dst)
706+
{
707+
int length;
708+
int32_t uc = -1;
709+
710+
*dst = -1;
711+
length = git__utf8_charlen(str, str_len);
712+
if (length < 0)
713+
return -1;
714+
715+
switch (length) {
716+
case 1:
717+
uc = str[0];
718+
break;
719+
case 2:
720+
uc = ((str[0] & 0x1F) << 6) + (str[1] & 0x3F);
721+
if (uc < 0x80) uc = -1;
722+
break;
723+
case 3:
724+
uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) << 6)
725+
+ (str[2] & 0x3F);
726+
if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) ||
727+
(uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1;
728+
break;
729+
case 4:
730+
uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12)
731+
+ ((str[2] & 0x3F) << 6) + (str[3] & 0x3F);
732+
if (uc < 0x10000 || uc >= 0x110000) uc = -1;
733+
break;
734+
}
735+
736+
if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE))
737+
return -1;
738+
739+
*dst = uc;
740+
return length;
741+
}

src/util.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,17 @@ extern int git__date_rfc2822_fmt(char *out, size_t len, const git_time *date);
367367
*/
368368
extern size_t git__unescape(char *str);
369369

370+
/*
371+
* Iterate through a UTF-8 string, yielding one
372+
* codepoint at a time.
373+
*
374+
* @param str current position in the string
375+
* @param str_len size left in the string; -1 if the string is NUL-terminated
376+
* @param dst pointer where to store the current codepoint
377+
* @return length in bytes of the read codepoint; -1 if the codepoint was invalid
378+
*/
379+
extern int git__utf8_iterate(const uint8_t *str, int str_len, int32_t *dst);
380+
370381
/*
371382
* Safely zero-out memory, making sure that the compiler
372383
* doesn't optimize away the operation.

0 commit comments

Comments
 (0)