fnmatch.c raw

   1  /*
   2   * An implementation of what I call the "Sea of Stars" algorithm for
   3   * POSIX fnmatch(). The basic idea is that we factor the pattern into
   4   * a head component (which we match first and can reject without ever
   5   * measuring the length of the string), an optional tail component
   6   * (which only exists if the pattern contains at least one star), and
   7   * an optional "sea of stars", a set of star-separated components
   8   * between the head and tail. After the head and tail matches have
   9   * been removed from the input string, the components in the "sea of
  10   * stars" are matched sequentially by searching for their first
  11   * occurrence past the end of the previous match.
  12   *
  13   * - Rich Felker, April 2012
  14   */
  15  
  16  #include <string.h>
  17  #include <fnmatch.h>
  18  #include <stdlib.h>
  19  #include <wchar.h>
  20  #include <wctype.h>
  21  #include "locale_impl.h"
  22  
  23  #define END 0
  24  #define UNMATCHABLE -2
  25  #define BRACKET -3
  26  #define QUESTION -4
  27  #define STAR -5
  28  
  29  static int str_next(const char *str, size_t n, size_t *step)
  30  {
  31  	if (!n) {
  32  		*step = 0;
  33  		return 0;
  34  	}
  35  	if (str[0] >= 128U) {
  36  		wchar_t wc;
  37  		int k = mbtowc(&wc, str, n);
  38  		if (k<0) {
  39  			*step = 1;
  40  			return -1;
  41  		}
  42  		*step = k;
  43  		return wc;
  44  	}
  45  	*step = 1;
  46  	return str[0];
  47  }
  48  
  49  static int pat_next(const char *pat, size_t m, size_t *step, int flags)
  50  {
  51  	int esc = 0;
  52  	if (!m || !*pat) {
  53  		*step = 0;
  54  		return END;
  55  	}
  56  	*step = 1;
  57  	if (pat[0]=='\\' && pat[1] && !(flags & FNM_NOESCAPE)) {
  58  		*step = 2;
  59  		pat++;
  60  		esc = 1;
  61  		goto escaped;
  62  	}
  63  	if (pat[0]=='[') {
  64  		size_t k = 1;
  65  		if (k<m) if (pat[k] == '^' || pat[k] == '!') k++;
  66  		if (k<m) if (pat[k] == ']') k++;
  67  		for (; k<m && pat[k] && pat[k]!=']'; k++) {
  68  			if (k+1<m && pat[k+1] && pat[k]=='[' && (pat[k+1]==':' || pat[k+1]=='.' || pat[k+1]=='=')) {
  69  				int z = pat[k+1];
  70  				k+=2;
  71  				if (k<m && pat[k]) k++;
  72  				while (k<m && pat[k] && (pat[k-1]!=z || pat[k]!=']')) k++;
  73  				if (k==m || !pat[k]) break;
  74  			}
  75  		}
  76  		if (k==m || !pat[k]) {
  77  			*step = 1;
  78  			return '[';
  79  		}
  80  		*step = k+1;
  81  		return BRACKET;
  82  	}
  83  	if (pat[0] == '*')
  84  		return STAR;
  85  	if (pat[0] == '?')
  86  		return QUESTION;
  87  escaped:
  88  	if (pat[0] >= 128U) {
  89  		wchar_t wc;
  90  		int k = mbtowc(&wc, pat, m);
  91  		if (k<0) {
  92  			*step = 0;
  93  			return UNMATCHABLE;
  94  		}
  95  		*step = k + esc;
  96  		return wc;
  97  	}
  98  	return pat[0];
  99  }
 100  
 101  static int casefold(int k)
 102  {
 103  	int c = towupper(k);
 104  	return c == k ? towlower(k) : c;
 105  }
 106  
 107  static int match_bracket(const char *p, int k, int kfold)
 108  {
 109  	wchar_t wc;
 110  	int inv = 0;
 111  	p++;
 112  	if (*p=='^' || *p=='!') {
 113  		inv = 1;
 114  		p++;
 115  	}
 116  	if (*p==']') {
 117  		if (k==']') return !inv;
 118  		p++;
 119  	} else if (*p=='-') {
 120  		if (k=='-') return !inv;
 121  		p++;
 122  	}
 123  	wc = p[-1];
 124  	for (; *p != ']'; p++) {
 125  		if (p[0]=='-' && p[1]!=']') {
 126  			wchar_t wc2;
 127  			int l = mbtowc(&wc2, p+1, 4);
 128  			if (l < 0) return 0;
 129  			if (wc <= wc2)
 130  				if ((unsigned)k-wc <= wc2-wc ||
 131  				    (unsigned)kfold-wc <= wc2-wc)
 132  					return !inv;
 133  			p += l-1;
 134  			continue;
 135  		}
 136  		if (p[0]=='[' && (p[1]==':' || p[1]=='.' || p[1]=='=')) {
 137  			const char *p0 = p+2;
 138  			int z = p[1];
 139  			p+=3;
 140  			while (p[-1]!=z || p[0]!=']') p++;
 141  			if (z == ':' && p-1-p0 < 16) {
 142  				char buf[16];
 143  				memcpy(buf, p0, p-1-p0);
 144  				buf[p-1-p0] = 0;
 145  				if (iswctype(k, wctype(buf)) ||
 146  				    iswctype(kfold, wctype(buf)))
 147  					return !inv;
 148  			}
 149  			continue;
 150  		}
 151  		if (*p < 128U) {
 152  			wc = (unsigned char)*p;
 153  		} else {
 154  			int l = mbtowc(&wc, p, 4);
 155  			if (l < 0) return 0;
 156  			p += l-1;
 157  		}
 158  		if (wc==k || wc==kfold) return !inv;
 159  	}
 160  	return inv;
 161  }
 162  
 163  static int fnmatch_internal(const char *pat, size_t m, const char *str, size_t n, int flags)
 164  {
 165  	const char *p, *ptail, *endpat;
 166  	const char *s, *stail, *endstr;
 167  	size_t pinc, sinc, tailcnt=0;
 168  	int c, k, kfold;
 169  
 170  	if (flags & FNM_PERIOD) {
 171  		if (*str == '.' && *pat != '.')
 172  			return FNM_NOMATCH;
 173  	}
 174  	for (;;) {
 175  		switch ((c = pat_next(pat, m, &pinc, flags))) {
 176  		case UNMATCHABLE:
 177  			return FNM_NOMATCH;
 178  		case STAR:
 179  			pat++;
 180  			m--;
 181  			break;
 182  		default:
 183  			k = str_next(str, n, &sinc);
 184  			if (k <= 0)
 185  				return (c==END) ? 0 : FNM_NOMATCH;
 186  			str += sinc;
 187  			n -= sinc;
 188  			kfold = flags & FNM_CASEFOLD ? casefold(k) : k;
 189  			if (c == BRACKET) {
 190  				if (!match_bracket(pat, k, kfold))
 191  					return FNM_NOMATCH;
 192  			} else if (c != QUESTION && k != c && kfold != c) {
 193  				return FNM_NOMATCH;
 194  			}
 195  			pat+=pinc;
 196  			m-=pinc;
 197  			continue;
 198  		}
 199  		break;
 200  	}
 201  
 202  	/* Compute real pat length if it was initially unknown/-1 */
 203  	m = strnlen(pat, m);
 204  	endpat = pat + m;
 205  
 206  	/* Find the last * in pat and count chars needed after it */
 207  	for (p=ptail=pat; p<endpat; p+=pinc) {
 208  		switch (pat_next(p, endpat-p, &pinc, flags)) {
 209  		case UNMATCHABLE:
 210  			return FNM_NOMATCH;
 211  		case STAR:
 212  			tailcnt=0;
 213  			ptail = p+1;
 214  			break;
 215  		default:
 216  			tailcnt++;
 217  			break;
 218  		}
 219  	}
 220  
 221  	/* Past this point we need not check for UNMATCHABLE in pat,
 222  	 * because all of pat has already been parsed once. */
 223  
 224  	/* Compute real str length if it was initially unknown/-1 */
 225  	n = strnlen(str, n);
 226  	endstr = str + n;
 227  	if (n < tailcnt) return FNM_NOMATCH;
 228  
 229  	/* Find the final tailcnt chars of str, accounting for UTF-8.
 230  	 * On illegal sequences we may get it wrong, but in that case
 231  	 * we necessarily have a matching failure anyway. */
 232  	for (s=endstr; s>str && tailcnt; tailcnt--) {
 233  		if (s[-1] < 128U || MB_CUR_MAX==1) s--;
 234  		else while ((unsigned char)*--s-0x80U<0x40 && s>str);
 235  	}
 236  	if (tailcnt) return FNM_NOMATCH;
 237  	stail = s;
 238  
 239  	/* Check that the pat and str tails match */
 240  	p = ptail;
 241  	for (;;) {
 242  		c = pat_next(p, endpat-p, &pinc, flags);
 243  		p += pinc;
 244  		if ((k = str_next(s, endstr-s, &sinc)) <= 0) {
 245  			if (c != END) return FNM_NOMATCH;
 246  			break;
 247  		}
 248  		s += sinc;
 249  		kfold = flags & FNM_CASEFOLD ? casefold(k) : k;
 250  		if (c == BRACKET) {
 251  			if (!match_bracket(p-pinc, k, kfold))
 252  				return FNM_NOMATCH;
 253  		} else if (c != QUESTION && k != c && kfold != c) {
 254  			return FNM_NOMATCH;
 255  		}
 256  	}
 257  
 258  	/* We're all done with the tails now, so throw them out */
 259  	endstr = stail;
 260  	endpat = ptail;
 261  
 262  	/* Match pattern components until there are none left */
 263  	while (pat<endpat) {
 264  		p = pat;
 265  		s = str;
 266  		for (;;) {
 267  			c = pat_next(p, endpat-p, &pinc, flags);
 268  			p += pinc;
 269  			/* Encountering * completes/commits a component */
 270  			if (c == STAR) {
 271  				pat = p;
 272  				str = s;
 273  				break;
 274  			}
 275  			k = str_next(s, endstr-s, &sinc);
 276  			if (!k)
 277  				return FNM_NOMATCH;
 278  			kfold = flags & FNM_CASEFOLD ? casefold(k) : k;
 279  			if (c == BRACKET) {
 280  				if (!match_bracket(p-pinc, k, kfold))
 281  					break;
 282  			} else if (c != QUESTION && k != c && kfold != c) {
 283  				break;
 284  			}
 285  			s += sinc;
 286  		}
 287  		if (c == STAR) continue;
 288  		/* If we failed, advance str, by 1 char if it's a valid
 289  		 * char, or past all invalid bytes otherwise. */
 290  		k = str_next(str, endstr-str, &sinc);
 291  		if (k > 0) str += sinc;
 292  		else for (str++; str_next(str, endstr-str, &sinc)<0; str++);
 293  	}
 294  
 295  	return 0;
 296  }
 297  
 298  int fnmatch(const char *pat, const char *str, int flags)
 299  {
 300  	const char *s, *p;
 301  	size_t inc;
 302  	int c;
 303  	if (flags & FNM_PATHNAME) for (;;) {
 304  		for (s=str; *s && *s!='/'; s++);
 305  		for (p=pat; (c=pat_next(p, -1, &inc, flags))!=END && c!='/'; p+=inc);
 306  		if (c!=*s && (!*s || !(flags & FNM_LEADING_DIR)))
 307  			return FNM_NOMATCH;
 308  		if (fnmatch_internal(pat, p-pat, str, s-str, flags))
 309  			return FNM_NOMATCH;
 310  		if (!c) return 0;
 311  		str = s+1;
 312  		pat = p+inc;
 313  	} else if (flags & FNM_LEADING_DIR) {
 314  		for (s=str; *s; s++) {
 315  			if (*s != '/') continue;
 316  			if (!fnmatch_internal(pat, -1, str, s-str, flags))
 317  				return 0;
 318  		}
 319  	}
 320  	return fnmatch_internal(pat, -1, str, -1, flags);
 321  }
 322