/* -*- linux-c -*- * String Functions * Copyright (C) 2005-2018 Red Hat Inc. * * This file is part of systemtap, and is free software. You can * redistribute it and/or modify it under the terms of the GNU General * Public License (GPL); either version 2, or (at your option) any * later version. */ #ifndef _STP_STRING_C_ #define _STP_STRING_C_ #include "stp_string.h" /** @file stp_string.c * @brief Implements string functions. */ /** @addtogroup string String Functions * * @{ */ /** Sprintf into a string. * Like printf, except output goes into a string. * * NB: these are script language printf formatting directives, where * %d ints are 64-bits etc, so we can't use gcc level attribute printf * to type-check the arguments. * * @param str string * @param fmt A printf-style format string followed by a * variable number of args. */ static int _stp_snprintf(char *buf, size_t size, const char *fmt, ...) { va_list args; int i; va_start(args, fmt); i = _stp_vsnprintf(buf,size,fmt,args); va_end(args); return i; } static int _stp_vscnprintf(char *buf, size_t size, const char *fmt, va_list args) { unsigned i = _stp_vsnprintf(buf,size,fmt,args); return (i >= size) ? (size - 1) : i; } /** * Decode a UTF-8 sequence into its codepoint. * * @param buf The input buffer. * @param size The size of the input buffer. * @param user Flag to mark user memory, vs kernel. * @param c_ret The return pointer for the codepoint. * * @return The number of bytes consumed, * or -EFAULT for unreadable memory address. */ static int _stp_decode_utf8(const char* buf, int size, int user, int* c_ret) { int c; char b = '\0'; int i, n; if (size <= 0) return -EFAULT; if (_stp_deref_nofault(b, 1, buf, (user ? STP_USER_DS : STP_KERNEL_DS))) return -EFAULT; ++buf; --size; if ((b & 0xE0) == 0xC0 && size >= 1) { /* 110xxxxx 10xxxxxx */ /* Two-byte UTF-8, one more byte to read. */ n = 2; c = b & 0x1F; } else if ((b & 0xF0) == 0xE0 && size >= 2) { /* 1110xxxx 10xxxxxx 10xxxxxx */ /* Three-byte UTF-8, two more bytes to read. */ n = 3; c = b & 0xF; } else if ((b & 0xF8) == 0xF0 && size >= 3) { /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ /* Four-byte UTF-8, three more bytes to read. */ n = 4; c = b & 0x7; } else { /* Return everything else verbatim, whether it's ASCII, longer * UTF-8 (against RFC 3629), invalid UTF-8, or just not enough * bytes left in the input buffer. */ goto verbatim; } /* Mix in the UTF-8 continuation bytes. */ for (i = 1; i < n; ++i) { char b2 = '\0'; if (_stp_deref_nofault(b2, 1, buf, (user ? STP_USER_DS : STP_KERNEL_DS))) return -EFAULT; ++buf; --size; if ((b2 & 0xC0) != 0x80) /* Bad continuation. */ goto verbatim; c = (c << 6) | (b2 & 0x3F); } /* Reject UTF-16 surrogates. */ if (0xD800 <= c && c <= 0xDFFF) goto verbatim; /* Reject values that exceed RFC 3629. */ if (c > 0x10FFFF) goto verbatim; /* Reject values that were encoded longer than necessary, so we don't * hide that fact in our output. (e.g. 0xC0 0x80 -> 0!) */ if (c < 0x80 || (n == 3 && c < 0x800) || (n == 4 && c < 0x10000)) goto verbatim; /* Successfully consumed the continuation bytes. */ *c_ret = c; return n; verbatim: *c_ret = (unsigned char) b; return 1; } /** Return a printable text string. * * Takes a string, and any ASCII characters that are not printable are * replaced by the corresponding escape sequence in the returned * string. * * @param outstr Output string pointer * @param in Input string pointer * @param inlen Maximum length of string to read not including terminating 0. * @param outlen Maximum length of string to return not including terminating 0. * 0 means MAXSTRINGLEN. * @param quoted Put double quotes around the string. If input string is truncated * in will have "..." after the second quote. * @param user Set this to indicate the input string pointer is a userspace pointer. */ static int _stp_text_str(char *outstr, const char *in, int inlen, int outlen, int quoted, int user, int buffer) { int c = 0; char *out = outstr; /* Points to the beginning of out's first escape sequence that could * be cut off by truncation. Remains NULL if no such escape sequence * has been found. */ char *esc = NULL; /* Length of the escape sequence pointed to by esc */ int esc_len = 0; if (inlen <= 0 || inlen > MAXSTRINGLEN-1) inlen = MAXSTRINGLEN-1; if (outlen <= 0 || outlen > MAXSTRINGLEN-1) outlen = MAXSTRINGLEN-1; if (quoted) { outlen = max(outlen, 5) - 2; *out++ = '"'; } while (inlen > 0) { int num = 1; int n = _stp_decode_utf8(in, inlen, user, &c); if (n <= 0) goto bad; if ((c == 0 && !buffer) || outlen <= 0) break; in += n; inlen -= n; if (n > 1) { /* UTF-8, print \uXXXX or \UXXXXXXXX */ int i; num = (c <= 0xFFFF) ? 6 : 10; if (!esc && outlen - 3 < num) esc = out, esc_len = num; if (outlen < num) break; *out++ = '\\'; *out++ = (c <= 0xFFFF) ? 'u' : 'U'; for (i = num - 3; i >= 0; --i) { char nibble = (c >> (i * 4)) & 0xF; *out++ = to_hex_digit(nibble); } } else if (isascii(c) && isprint(c) && c != '"' && c != '\\') /* quoteworthy characters */ *out++ = c; else { switch (c) { case '\a': case '\b': case '\f': case '\n': case '\r': case '\t': case '\v': case '"': case '\\': /* * Do not add '\0' to the short-escapes list. * If we emit \0 and the next char is a * printable digit, say "6", the resulting \06 * will be interpreted incorrectly as a * single-byte octal escape, not a null byte * then a printable digit. */ num = 2; // "\c" break; default: num = 4; // "\ooo" break; } if (!esc && outlen - 3 < num) esc = out, esc_len = num; if (outlen < num) break; *out++ = '\\'; switch (c) { case '\0': *out++ = '0'; break; case '\a': *out++ = 'a'; break; case '\b': *out++ = 'b'; break; case '\f': *out++ = 'f'; break; case '\n': *out++ = 'n'; break; case '\r': *out++ = 'r'; break; case '\t': *out++ = 't'; break; case '\v': *out++ = 'v'; break; case '"': *out++ = '"'; break; case '\\': *out++ = '\\'; break; default: /* output octal representation */ *out++ = to_oct_digit((c >> 6) & 03); *out++ = to_oct_digit((c >> 3) & 07); *out++ = to_oct_digit(c & 07); break; } } outlen -= num; } if (quoted) { if (c && inlen > 0) { char *truncptr = out - 3 + outlen; /* If truncating at truncptr would cut off part of an * escape sequence, then adjust truncptr so that the * entire sequence is removed instead. */ if (esc && esc < truncptr && truncptr < esc + esc_len) out = esc; else out = truncptr; *out++ = '"'; *out++ = '.'; *out++ = '.'; *out++ = '.'; } else *out++ = '"'; } *out = '\0'; return 0; bad: strlcpy (outstr, "", outlen); return -1; // PR15044 } /** * Convert a UTF-32 character into a UTF-8 string. * * @param buf The output buffer. * @param size The size of the output buffer. * @param c The character to convert. * * @return The number of bytes written (not counting \0), * 0 if there's not enough room for the full character, * or < 0 for invalid characters (with buf untouched). */ static int _stp_convert_utf32(char* buf, int size, u32 c) { int i, n; /* 0xxxxxxx */ if (c <= 0x7F) n = 1; /* 110xxxxx 10xxxxxx */ else if (c <= 0x7FF) n = 2; /* UTF-16 surrogates are not valid by themselves. * XXX We could decide to be lax and just encode it anyway... */ else if (c >= 0xD800 && c <= 0xDFFF) return -EINVAL; /* 1110xxxx 10xxxxxx 10xxxxxx */ else if (c <= 0xFFFF) n = 3; /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ else if (c <= 0x10FFFF) n = 4; /* The original UTF-8 design could go up to 0x7FFFFFFF, but RFC 3629 * sets the upperbound to 0x10FFFF; thus all higher values are errors. */ else return -EINVAL; if (size < n + 1) return 0; buf[n] = '\0'; if (n == 1) buf[0] = c; else { u8 msb = ((1 << n) - 1) << (8 - n); for (i = n - 1; i > 0; --i) { buf[i] = 0x80 | (c & 0x3F); c >>= 6; } buf[0] = msb | c; } return n; } /** @} */ #endif /* _STP_STRING_C_ */