switch ${%foo} to wcswidth-like behaviour – slightly problematic, and

the “set +U” case isn’t even handled.

Committed to a branch because I’d like to get more input on this, for now.
2009-11-28 14:21:47 +00:00
parent 7063a9a921
commit 883d9d99b3
9 changed files with 37 additions and 21 deletions
--- a/expr.c
+++ b/expr.c
@@ -22,7 +22,7 @@

 #include "sh.h"

-__RCSID("$MirOS: src/bin/mksh/expr.c,v 1.37 2009/10/04 13:19:33 tg Exp $");
+__RCSID("$MirOS: src/bin/mksh/expr.c,v 1.38 2009/11/28 14:21:44 tg Exp $");

 /* The order of these enums is constrained by the order of opinfo[] */
 enum token {
@@ -683,8 +683,15 @@ utf_widthadj(const char *src, const char **dst)
 	return (width);
 }

+/**
+ * In lenient mode, characters of width -1 are handled as one column
+ * per octet (kind of as a strlen replacement). Users of lenient mo-
+ * de should reconsider the code.
+ * In strict mode, this behaves like wcswidth(3) and returns -1 upon
+ * encounter of a control multibyte character.
+ */
 int
-utf_mbswidth(const char *s)
+utf_mbswidth(const char *s, bool lenient)
 {
 	size_t len;
 	unsigned int wc;
@@ -694,10 +701,14 @@ utf_mbswidth(const char *s)
 		return (strlen(s));

 	while (*s)
-		if (((len = utf_mbtowc(&wc, s)) == (size_t)-1) ||
-		    ((cw = utf_wcwidth(wc)) == -1)) {
+		if ((len = utf_mbtowc(&wc, s)) == (size_t)-1) {
+ by_octet:
 			s++;
 			width += 1;
+		} else if ((cw = utf_wcwidth(wc)) == -1) {
+			if (lenient)
+				goto by_octet;
+			return (-1);
 		} else {
 			s += len;
 			width += cw;