new function unbksl doing "backslash expansion" independent of c_print();

also make a separate subsection about it in the manpage
2009-09-19 15:16:05 +00:00 · 2009-09-19 15:16:05 +00:00 · e0f000fb83
commit e0f000fb83
parent 3639137e48
4 changed files with 157 additions and 108 deletions
--- a/funcs.c
+++ b/funcs.c
@ -25,7 +25,7 @@

 #include "sh.h"

-__RCSID("$MirOS: src/bin/mksh/funcs.c,v 1.130 2009/09/07 17:24:48 tg Exp $");
+__RCSID("$MirOS: src/bin/mksh/funcs.c,v 1.131 2009/09/19 15:16:02 tg Exp $");

 #if HAVE_KILLPG
 /*
@ -604,93 +604,30 @@ c_print(const char **wp)
 		while ((c = *s++) != '\0') {
 			Xcheck(xs, xp);
 			if ((flags & PO_EXPAND) && c == '\\') {
-				int i;
+				if ((c = unbksl(&s)) == -1) {
+					/* rejected by generic function */
+					switch ((c = *s++)) {
+					case 'c':
+						flags &= ~PO_NL;
+						/* AT&T brain damage */
+						continue;
+					case '\0':
+						s--;
+						c = '\\';
+						break;
+					default:
+						Xput(xs, xp, '\\');
+					}
+				} else if (c > 0xFF) {
+					/* generic function returned Unicode */
+					char ts[4];

-				switch ((c = *s++)) {
-				/* Oddly enough, \007 seems more portable than
-				 * \a (due to HP-UX cc, Ultrix cc, old PCCs,
-				 * etc.).
-				 */
-				case 'a': c = '\007'; break;
-				case 'b': c = '\b'; break;
-				case 'c':
-					flags &= ~PO_NL;
-					/* AT&T brain damage */
+					c = utf_wctomb(ts, c - 0x100);
+					ts[c] = 0;
+					for (c = 0; ts[c]; ++c)
+						Xput(xs, xp, ts[c]);
 					continue;
-				case 'f': c = '\f'; break;
-				case 'n': c = '\n'; break;
-				case 'r': c = '\r'; break;
-				case 't': c = '\t'; break;
-				case 'v': c = 0x0B; break;
-				case '0':
-					/* Look for an octal number: can have
-					 * three digits (not counting the
-					 * leading 0). Truly burnt.
-					 */
-					c = 0;
-					for (i = 0; i < 3; i++) {
-						if (*s >= '0' && *s <= '7')
-							c = c*8 + *s++ - '0';
-						else
-							break;
-					}
-					break;
-				case 'x':
-					/* Look for a hexadecimal number of
-					 * up to 2 digits, write raw octet.
-					 */
-					c = 0;
-					for (i = 0; i < 2; i++) {
-						c <<= 4;
-						if (*s >= '0' && *s <= '9')
-							c += *s++ - '0';
-						else if (*s >= 'A' && *s <= 'F')
-							c += *s++ - 'A' + 10;
-						else if (*s >= 'a' && *s <= 'f')
-							c += *s++ - 'a' + 10;
-						else {
-							c >>= 4;
-							break;
-						}
-					}
-					break;
-				case 'u':
-					/* Look for a hexadecimal number of
-					 * up to 4 digits, write Unicode.
-					 */
-					c = 0;
-					for (i = 0; i < 4; i++) {
-						c <<= 4;
-						if (*s >= '0' && *s <= '9')
-							c += *s++ - '0';
-						else if (*s >= 'A' && *s <= 'F')
-							c += *s++ - 'A' + 10;
-						else if (*s >= 'a' && *s <= 'f')
-							c += *s++ - 'a' + 10;
-						else {
-							c >>= 4;
-							break;
-						}
-					}
-					if (c < 0x80)
-						/* Xput below writes ASCII */;
-					else if (c < 0x0800) {
-						Xput(xs, xp, (c >> 6) | 0xC0);
-						c = 0x80 | (c & 0x3F);
-						/* leave 2nd octet to below */
-					} else {
-						Xput(xs, xp, (c >> 12) | 0xE0);
-						Xput(xs, xp,
-						    ((c >> 6) & 0x3F) | 0x80);
-						c = 0x80 | (c & 0x3F);
-						/* leave 3rd octet to below */
-					}
-					break;
-				case '\0': s--; c = '\\'; break;
-				case '\\': break;
-				default:
-					Xput(xs, xp, '\\');
-				}
+				}			
 			}
 			Xput(xs, xp, c);
 		}
--- a/misc.c
+++ b/misc.c
@ -29,7 +29,7 @@
 #include <grp.h>
 #endif

-__RCSID("$MirOS: src/bin/mksh/misc.c,v 1.118 2009/08/30 13:30:07 tg Exp $");
+__RCSID("$MirOS: src/bin/mksh/misc.c,v 1.119 2009/09/19 15:16:03 tg Exp $");

 #undef USE_CHVT
 /* XXX conditions correct? */
@ -1447,3 +1447,101 @@ getrusage(int what, struct rusage *ru)
 	return (0);
 }
 #endif
+
+/*
+ * process the string at *sp for backslash escapes,
+ * assuming (*sp)[-1] was the backslash; return the
+ * character ([0;0xFF]), Unicode (wc+0x100), or -1
+ * if none found; *sp afterwards points to the first
+ * unprocessed character (unchanged if rv=-1)
+ */
+int
+unbksl(const char **sp)
+{
+	int wc, i;
+	const char *cp = (*sp);
+
+	switch (*cp++) {
+	case 'a':
+		/*
+		 * according to the comments in pdksh, \007 seems
+		 * to be more portable than \a (due to HP-UX cc,
+		 * Ultrix cc, old pcc, etc.) so we avoid the escape
+		 * sequence altogether in mksh and assume ASCII
+		 */
+		wc = 7;
+		break;
+	case 'b':
+		wc = '\b';
+		break;
+	case 'f':
+		wc = '\f';
+		break;
+	case 'n':
+		wc = '\n';
+		break;
+	case 'r':
+		wc = '\r';
+		break;
+	case 't':
+		wc = '\t';
+		break;
+	case 'v':
+		/* assume ASCII here as well */
+		wc = 11;
+		break;
+	case '0':
+		/*
+		 * look for an octal number with up to three
+		 * digits, not counting the leading zero;
+		 * convert it to a raw octet
+		 */
+		wc = 0;
+		i = 3;
+		while (i-- && *cp >= '0' && *cp <= '7')
+			wc = (wc << 3) + (*cp++ - '0');
+		break;
+	case 'U':
+		i = 8;
+		if (0)
+		/* FALLTHROUGH */
+	case 'u':
+		i = 4;
+		if (0)
+		/* FALLTHROUGH */
+	case 'x':
+		i = 2;
+		/*
+		 * x: look for a hexadecimal number with up to
+		 *    two digits; convert to raw octet
+		 * u: look for a hexadecimal number with up to
+		 *    four (U: eight) digits; convert to Unicode
+		 */
+		wc = 0;
+		while (i--) {
+			wc <<= 4;
+			if (*cp >= '0' && *cp <= '9')
+				wc += *cp++ - '0';
+			else if (*cp >= 'A' && *cp <= 'F')
+				wc += *cp++ - 'A' + 10;
+			else if (*cp >= 'a' && *cp <= 'f')
+				wc += *cp++ - 'a' + 10;
+			else {
+				wc >>= 4;
+				break;
+			}
+		}
+		if (**sp != 'x')
+			/* Unicode marker */
+			wc += 0x100;
+		break;
+	case '\\':
+		wc = '\\';
+		break;
+	default:
+		return (-1);
+	}
+
+	(*sp) = cp;
+	return (wc);
+}
--- a/mksh.1
+++ b/mksh.1
@ -1,4 +1,4 @@
-.\" $MirOS: src/bin/mksh/mksh.1,v 1.184 2009/09/07 17:24:49 tg Exp $
+.\" $MirOS: src/bin/mksh/mksh.1,v 1.185 2009/09/19 15:16:04 tg Exp $
 .\" $OpenBSD: ksh.1,v 1.129 2009/05/28 06:09:06 jmc Exp $
 .\"-
 .\" Copyright © 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
@ -48,7 +48,7 @@
 .el .xD \\$1 \\$2 \\$3 \\$4 \\$5 \\$6 \\$7 \\$8
 ..
 .\"-
-.Dd $Mdocdate: September 7 2009 $
+.Dd $Mdocdate: September 19 2009 $
 .Dt MKSH 1
 .Os MirBSD
 .Sh NAME
@ -868,6 +868,32 @@ the
 and the newline are stripped; otherwise, both the
 .Ql \e
 and the character following are unchanged.
+.Ss Backslash expansion
+In places where backslashes are expanded, certain C and
+.At
+.Nm ksh
+or GNU
+.Nm bash
+style escapes are translated.
+These include
+.Ql \ea ,
+.Ql \eb ,
+.Ql \ef ,
+.Ql \en ,
+.Ql \er ,
+.Ql \et ,
+.Ql \eU######## ,
+.Ql \eu#### ,
+.Ql \ev ,
+.Ql \ex## ,
+and
+.Ql \e0### ;
+.Ql #
+is, in the case of \e0###, an octal, or, in the case of \ex##,
+\eu#### or \eU########, a hexadecimal digit, of which there may
+be none up to two (x), three (0), four (u), or eight (U).
+The \ex## and \e0### escapes translate to raw 8-bit octets;
+the \eu#### and \eU######## escapes translate a Unicode codepoint to UTF-8.
 .Ss Aliases
 There are two types of aliases: normal command aliases and tracked aliases.
 Command aliases are normally used as a short hand for a long or often used
@ -3266,24 +3292,11 @@ The
 .Fl n
 option suppresses the newline.
 By default, certain C escapes are translated.
-These include
-.Ql \eb ,
-.Ql \ef ,
-.Ql \en ,
-.Ql \er ,
-.Ql \et ,
-.Ql \eu#### ,
-.Ql \ev ,
-.Ql \ex## ,
-and
-.Ql \e0### ;
-.Ql #
-is, in the case of \e0###, an octal, or, in the case of \eu####
-or \ex##, a hexadecimal digit, of which there may be 0 to 2/3/4.
-The \ex## and \e0### escapes translate to raw 8-bit octets;
-the \eu#### escape translates a Unicode codepoint to UTF-8.
-.Ql \ec
-is equivalent to using the
+These include at least these mentioned in
+.Sx Backslash expansion
+above, as well as
+.Ql \ec ,
+which is equivalent to using the
 .Fl n
 option.
 .Ql \e
--- a/sh.h
+++ b/sh.h
@ -134,7 +134,7 @@
 #endif

 #ifdef EXTERN
-__RCSID("$MirOS: src/bin/mksh/sh.h,v 1.340 2009/09/07 17:24:49 tg Exp $");
+__RCSID("$MirOS: src/bin/mksh/sh.h,v 1.341 2009/09/19 15:16:05 tg Exp $");
 #endif
 #define MKSH_VERSION "R39 2009/09/07"

@ -1587,6 +1587,7 @@ void set_current_wd(char *);
 char *strdup_(const char *, Area *);
 char *strndup_(const char *, size_t, Area *);
 #endif
+int unbksl(const char **);
 /* shf.c */
 struct shf *shf_open(const char *, int, int, int);
 struct shf *shf_fdopen(int, int, struct shf *);