/* -*- linux-c -*- 
 * String Functions
 * Copyright (C) 2005-2018 Red Hat Inc.
 *
 * This file is part of systemtap, and is free software.  You can
 * redistribute it and/or modify it under the terms of the GNU General
 * Public License (GPL); either version 2, or (at your option) any
 * later version.
 */
#ifndef _STP_STRING_C_
#define _STP_STRING_C_

#include "stp_string.h"

/** @file stp_string.c
 * @brief Implements string functions.
 */
/** @addtogroup string String Functions
 *
 * @{
 */

/** Sprintf into a string.
 * Like printf, except output goes into a string.  
 *
 * NB: these are script language printf formatting directives, where
 * %d ints are 64-bits etc, so we can't use gcc level attribute printf
 * to type-check the arguments.
 *
 * @param str string
 * @param fmt A printf-style format string followed by a 
 * variable number of args.
 */

static int _stp_snprintf(char *buf, size_t size, const char *fmt, ...)
{
        va_list args;
        int i;

        va_start(args, fmt);
        i = _stp_vsnprintf(buf,size,fmt,args);
        va_end(args);
        return i;
}

static int _stp_vscnprintf(char *buf, size_t size, const char *fmt, va_list args)
{
	unsigned i = _stp_vsnprintf(buf,size,fmt,args);
	return (i >= size) ? (size - 1) : i;
}


/**
 * Decode a UTF-8 sequence into its codepoint.
 *
 * @param buf The input buffer.
 * @param size The size of the input buffer.
 * @param user Flag to mark user memory, vs kernel.
 * @param c_ret The return pointer for the codepoint.
 *
 * @return The number of bytes consumed,
 * 	   or -EFAULT for unreadable memory address.
 */
static int _stp_decode_utf8(const char* buf, int size, int user, int* c_ret)
{
	int c;
	char b = '\0';
	int i, n;

	if (size <= 0)
		return -EFAULT;

	if (_stp_deref_nofault(b, 1, buf, (user ? STP_USER_DS : STP_KERNEL_DS)))
		return -EFAULT;
	++buf;
	--size;

	if ((b & 0xE0) == 0xC0 && size >= 1) {
		/* 110xxxxx 10xxxxxx */
		/* Two-byte UTF-8, one more byte to read.  */
		n = 2;
		c = b & 0x1F;
	} else if ((b & 0xF0) == 0xE0 && size >= 2) {
		/* 1110xxxx 10xxxxxx 10xxxxxx */
		/* Three-byte UTF-8, two more bytes to read.  */
		n = 3;
		c = b & 0xF;
	} else if ((b & 0xF8) == 0xF0 && size >= 3) {
		/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		/* Four-byte UTF-8, three more bytes to read.  */
		n = 4;
		c = b & 0x7;
	} else {
		/* Return everything else verbatim, whether it's ASCII, longer
		 * UTF-8 (against RFC 3629), invalid UTF-8, or just not enough
		 * bytes left in the input buffer.  */
		goto verbatim;
	}

	/* Mix in the UTF-8 continuation bytes.  */
	for (i = 1; i < n; ++i) {
		char b2 = '\0';
		if (_stp_deref_nofault(b2, 1, buf, (user ? STP_USER_DS : STP_KERNEL_DS)))
			return -EFAULT;
		++buf;
		--size;

		if ((b2 & 0xC0) != 0x80) /* Bad continuation.  */
			goto verbatim;

		c = (c << 6) | (b2 & 0x3F);
	}


	/* Reject UTF-16 surrogates.  */
	if (0xD800 <= c && c <= 0xDFFF)
		goto verbatim;

	/* Reject values that exceed RFC 3629.  */
	if (c > 0x10FFFF)
		goto verbatim;

	/* Reject values that were encoded longer than necessary, so we don't
	 * hide that fact in our output.  (e.g. 0xC0 0x80 -> 0!)  */
	if (c < 0x80 || (n == 3 && c < 0x800) || (n == 4 && c < 0x10000))
		goto verbatim;

	/* Successfully consumed the continuation bytes.  */
	*c_ret = c;
	return n;

verbatim:
	*c_ret = (unsigned char) b;
	return 1;
}

/** Return a printable text string.
 *
 * Takes a string, and any ASCII characters that are not printable are
 * replaced by the corresponding escape sequence in the returned
 * string.
 *
 * @param outstr Output string pointer
 * @param in Input string pointer
 * @param inlen Maximum length of string to read not including terminating 0.
 * @param outlen Maximum length of string to return not including terminating 0.
 * 0 means MAXSTRINGLEN.
 * @param quoted Put double quotes around the string. If input string is truncated
 * in will have "..." after the second quote.
 * @param user Set this to indicate the input string pointer is a userspace pointer.
 */
static int _stp_text_str(char *outstr, const char *in, int inlen, int outlen,
			 int quoted, int user, int buffer)
{
	int c = 0;
	char *out = outstr;

        /* Points to the beginning of out's first escape sequence that could
         * be cut off by truncation. Remains NULL if no such escape sequence
         * has been found. */
        char *esc = NULL;
        /* Length of the escape sequence pointed to by esc */
        int esc_len = 0;

	if (inlen <= 0 || inlen > MAXSTRINGLEN-1)
		inlen = MAXSTRINGLEN-1;
	if (outlen <= 0 || outlen > MAXSTRINGLEN-1)
		outlen = MAXSTRINGLEN-1;
	if (quoted) {
		outlen = max(outlen, 5) - 2;
		*out++ = '"';
	}

	while (inlen > 0) {
		int num = 1;

		int n = _stp_decode_utf8(in, inlen, user, &c);
		if (n <= 0)
			goto bad;
		if ((c == 0 && !buffer) || outlen <= 0)
			break;
		in += n;
		inlen -= n;

		if (n > 1) {
			/* UTF-8, print \uXXXX or \UXXXXXXXX */
			int i;
			num = (c <= 0xFFFF) ? 6 : 10;

			if (!esc && outlen - 3 < num)
				esc = out, esc_len = num;

			if (outlen < num)
				break;

			*out++ = '\\';
			*out++ = (c <= 0xFFFF) ? 'u' : 'U';
			for (i = num - 3; i >= 0; --i) {
				char nibble = (c >> (i * 4)) & 0xF;
				*out++ = to_hex_digit(nibble);
			}

		}
		else if (isascii(c) && isprint(c)
				&& c != '"' && c != '\\') /* quoteworthy characters */
			*out++ = c;
		else {
			switch (c) {
			case '\a':
			case '\b':
			case '\f':
			case '\n':
			case '\r':
			case '\t':
			case '\v':
			case '"':
			case '\\':
				/*
				 * Do not add '\0' to the short-escapes list.
				 * If we emit \0 and the next char is a
				 * printable digit, say "6", the resulting \06
				 * will be interpreted incorrectly as a
				 * single-byte octal escape, not a null byte
				 * then a printable digit.
				 */
				num = 2; // "\c"
				break;
			default:
				num = 4; // "\ooo"
				break;
			}

			if (!esc && outlen - 3 < num)
				esc = out, esc_len = num;

			if (outlen < num)
				break;

			*out++ = '\\';
			switch (c) {
			case '\0':
                               *out++ = '0';
                               break;
			case '\a':
				*out++ = 'a';
				break;
			case '\b':
				*out++ = 'b';
				break;
			case '\f':
				*out++ = 'f';
				break;
			case '\n':
				*out++ = 'n';
				break;
			case '\r':
				*out++ = 'r';
				break;
			case '\t':
				*out++ = 't';
				break;
			case '\v':
				*out++ = 'v';
				break;
			case '"':
				*out++ = '"';
				break;
			case '\\':
				*out++ = '\\';
				break;
			default:	          /* output octal representation */
				*out++ = to_oct_digit((c >> 6) & 03);
				*out++ = to_oct_digit((c >> 3) & 07);
				*out++ = to_oct_digit(c & 07);
				break;
			}
		}
		outlen -= num;
	}

	if (quoted) {
		if (c && inlen > 0) {
                        char *truncptr = out - 3 + outlen;

                        /* If truncating at truncptr would cut off part of an
                         * escape sequence, then adjust truncptr so that the
                         * entire sequence is removed instead. */
			if (esc && esc < truncptr && truncptr < esc + esc_len)
				out = esc;
			else
				out = truncptr;

			*out++ = '"';
			*out++ = '.';
			*out++ = '.';
			*out++ = '.';
		} else
			*out++ = '"';

	}
	*out = '\0';
	return 0;
bad:
	strlcpy (outstr, "<unknown>", outlen);
	return -1; // PR15044
}

/**
 * Convert a UTF-32 character into a UTF-8 string.
 *
 * @param buf The output buffer.
 * @param size The size of the output buffer.
 * @param c The character to convert.
 *
 * @return The number of bytes written (not counting \0),
 *         0 if there's not enough room for the full character,
 *         or < 0 for invalid characters (with buf untouched).
 */
static int _stp_convert_utf32(char* buf, int size, u32 c)
{
	int i, n;

	/* 0xxxxxxx */
	if (c <= 0x7F)
		n = 1;

	/* 110xxxxx 10xxxxxx */
	else if (c <= 0x7FF)
		n = 2;

	/* UTF-16 surrogates are not valid by themselves.
	 * XXX We could decide to be lax and just encode it anyway...
	 */
	else if (c >= 0xD800 && c <= 0xDFFF)
		return -EINVAL;

	/* 1110xxxx 10xxxxxx 10xxxxxx */
	else if (c <= 0xFFFF)
		n = 3;

	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	else if (c <= 0x10FFFF)
		n = 4;

	/* The original UTF-8 design could go up to 0x7FFFFFFF, but RFC 3629
	 * sets the upperbound to 0x10FFFF; thus all higher values are errors.
	 */
	else
		return -EINVAL;

	if (size < n + 1)
		return 0;

	buf[n] = '\0';
	if (n == 1)
		buf[0] = c;
	else {
		u8 msb = ((1 << n) - 1) << (8 - n);
		for (i = n - 1; i > 0; --i) {
			buf[i] = 0x80 | (c & 0x3F);
			c >>= 6;
		}
		buf[0] = msb | c;
	}

	return n;
}

/** @} */
#endif /* _STP_STRING_C_ */