/src/mosh/src/terminal/parser.cc

Source
/*
    Mosh: the mobile shell
    Copyright 2012 Keith Winstein

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.

    In addition, as a special exception, the copyright holders give
    permission to link the code of portions of this program with the
    OpenSSL library under certain conditions as described in each
    individual source file, and distribute linked combinations including
    the two.

    You must obey the GNU General Public License in all respects for all
    of the code used other than OpenSSL. If you modify file(s) with this
    exception, you may extend this exception to your version of the
    file(s), but you are not obligated to do so. If you do not wish to do
    so, delete this exception statement from your version. If you delete
    this exception statement from all source files in the program, then
    also delete it here.
*/

#include <cassert>
#include <cerrno>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <cwchar>
#include <typeinfo>

#include "src/terminal/parser.h"

const Parser::StateFamily Parser::family;

/* Append act to vec unless the action reports that it should be ignored.
   act must be non-null. */
static void append_or_delete( Parser::ActionPointer act, Parser::Actions& vec )
{
  assert( act );

  if ( act->ignore() ) {
    /* nothing worth recording for this action */
    return;
  }

  vec.push_back( act );
}

/* Feed one character to the current state and collect the resulting
   actions in ret.

   When the state requests a transition, the actions are appended in the
   order: exit action of the old state, the transition's own action, enter
   action of the new state.  Without a transition only the transition's
   action is appended and the current state is kept. */
void Parser::Parser::input( wchar_t ch, Actions& ret )
{
  Transition tx = state->input( ch );

  if ( tx.next_state == NULL ) {
    /* staying in the same state: only the action itself applies */
    append_or_delete( tx.action, ret );
    return;
  }

  append_or_delete( state->exit(), ret );
  append_or_delete( tx.action, ret );
  append_or_delete( tx.next_state->enter(), ret );
  state = tx.next_state;
}

/* Construct a decoder with a default-constructed parser and no pending
   bytes. */
Parser::UTF8Parser::UTF8Parser() : parser(), buf_len( 0 )
{
  /* the byte buffer has to be able to hold the longest multibyte
     character of the current locale */
  assert( static_cast<size_t>( MB_CUR_MAX ) <= BUF_SIZE );

  buf[0] = '\0';
}

/* Feed one byte of the input stream to the decoder, appending any
   resulting parser actions to ret.

   An ASCII byte that arrives while no multibyte sequence is pending is
   handed straight to the parser.  Any other byte is added to buf, and the
   buffer is then decoded from its start with mbrtowc():
     - a complete character is forwarded to the parser and removed from
       buf;
     - an incomplete sequence stays in buf to wait for more bytes;
     - an invalid sequence produces U+FFFD; if more than one byte was
       buffered, the most recent byte alone is then retried as the
       possible start of a new sequence, otherwise the buffer is emptied.
   Code points above U+10FFFF and surrogate code points are replaced by
   U+FFFD before being forwarded. */
void Parser::UTF8Parser::input( char c, Actions& ret )
{
  assert( buf_len < BUF_SIZE );

  /* 1-byte UTF-8 character, aka ASCII?  Cheat. */
  if ( buf_len == 0 && static_cast<unsigned char>( c ) <= 0x7f ) {
    parser.input( static_cast<wchar_t>( c ), ret );
    return;
  }

  buf[buf_len++] = c;

  /* This function will only work in a UTF-8 locale. */
  const wchar_t replacement_char = static_cast<wchar_t>( 0xFFFD );
  const size_t encoding_error = static_cast<size_t>( -1 );
  const size_t incomplete_sequence = static_cast<size_t>( -2 );

  wchar_t pwc;
  /* The conversion state is local: every call decodes buf from scratch. */
  mbstate_t ps = mbstate_t();

  size_t total_bytes_parsed = 0;
  const size_t orig_buf_len = buf_len;

  /* this routine is somewhat complicated in order to comply with
     Unicode 6.0, section 3.9, "Best Practices for using U+FFFD" */

  while ( total_bytes_parsed != orig_buf_len ) {
    assert( total_bytes_parsed < orig_buf_len );
    assert( buf_len > 0 );
    size_t bytes_parsed = mbrtowc( &pwc, buf, buf_len, &ps );

    /* this returns 0 when n = 0! */

    if ( bytes_parsed == 0 ) {
      /* character was NUL, accept and clear buffer */
      assert( buf_len == 1 );
      buf_len = 0;
      pwc = L'\0';
      bytes_parsed = 1;
    } else if ( bytes_parsed == encoding_error ) {
      /* invalid sequence, use replacement character and try again with last char */
      assert( errno == EILSEQ );

      /* After an encoding error the conversion state is undefined, so
         start the retry of the last byte from the initial state. */
      ps = mbstate_t();

      if ( buf_len > 1 ) {
        buf[0] = buf[buf_len - 1];
        bytes_parsed = buf_len - 1;
        buf_len = 1;
      } else {
        buf_len = 0;
        bytes_parsed = 1;
      }
      pwc = replacement_char;
    } else if ( bytes_parsed == incomplete_sequence ) {
      /* can't parse incomplete multibyte character; keep the bytes
         buffered and count them as handled so the loop terminates */
      total_bytes_parsed += buf_len;
      continue;
    } else {
      /* parsed into pwc, accept and drop the consumed bytes from buf */
      assert( bytes_parsed <= buf_len );
      std::memmove( buf, buf + bytes_parsed, buf_len - bytes_parsed );
      buf_len = buf_len - bytes_parsed;
    }

    /* Cast to unsigned for checks, because some
       platforms (e.g. ARM) use uint32_t as wchar_t,
       causing compiler warning on "pwc > 0" check. */
    const uint32_t pwcheck = pwc;

    if ( pwcheck > 0x10FFFF ) { /* outside Unicode range */
      pwc = replacement_char;
    }

    if ( ( pwcheck >= 0xD800 ) && ( pwcheck <= 0xDFFF ) ) { /* surrogate code point */
      /*
        OS X unfortunately allows these sequences without EILSEQ, but
        they are ill-formed UTF-8 and we shouldn't repeat them to the
        user's terminal.
      */
      pwc = replacement_char;
    }

    parser.input( pwc, ret );

    total_bytes_parsed += bytes_parsed;
  }
}

/* Copy constructor: the new parser starts in the same state as other. */
Parser::Parser::Parser( const Parser& other )
  : state( other.state )
{
}

/* Copy assignment: adopt the current state of other and return *this. */
Parser::Parser& Parser::Parser::operator=( const Parser& other )
{
  if ( this != &other ) {
    state = other.state;
  }

  return *this;
}

Line	Count	Source
1		/*
2		Mosh: the mobile shell
3		Copyright 2012 Keith Winstein
4
5		This program is free software: you can redistribute it and/or modify
6		it under the terms of the GNU General Public License as published by
7		the Free Software Foundation, either version 3 of the License, or
8		(at your option) any later version.
9
10		This program is distributed in the hope that it will be useful,
11		but WITHOUT ANY WARRANTY; without even the implied warranty of
12		MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13		GNU General Public License for more details.
14
15		You should have received a copy of the GNU General Public License
16		along with this program. If not, see <http://www.gnu.org/licenses/>.
17
18		In addition, as a special exception, the copyright holders give
19		permission to link the code of portions of this program with the
20		OpenSSL library under certain conditions as described in each
21		individual source file, and distribute linked combinations including
22		the two.
23
24		You must obey the GNU General Public License in all respects for all
25		of the code used other than OpenSSL. If you modify file(s) with this
26		exception, you may extend this exception to your version of the
27		file(s), but you are not obligated to do so. If you do not wish to do
28		so, delete this exception statement from your version. If you delete
29		this exception statement from all source files in the program, then
30		also delete it here.
31		*/
32
33		#include <cassert>
34		#include <cerrno>
35		#include <cstdint>
36		#include <cwchar>
37		#include <typeinfo>
38
39		#include "src/terminal/parser.h"
40
41		const Parser::StateFamily Parser::family;
42
43		static void append_or_delete( Parser::ActionPointer act, Parser::Actions& vec )
44	11.3M	{
45	11.3M	assert( act );
46
47	11.3M	if ( !act->ignore() ) {
48	8.68M	vec.push_back( act );
49	8.68M	}
50	11.3M	}
51
52		void Parser::Parser::input( wchar_t ch, Actions& ret )
53	7.82M	{
54	7.82M	Transition tx = state->input( ch );
55
56	7.82M	if ( tx.next_state != NULL ) {
57	1.75M	append_or_delete( state->exit(), ret );
58	1.75M	}
59
60	7.82M	append_or_delete( tx.action, ret );
61
62	7.82M	if ( tx.next_state != NULL ) {
63	1.75M	append_or_delete( tx.next_state->enter(), ret );
64	1.75M	state = tx.next_state;
65	1.75M	}
66	7.82M	}
67
68	1.31k	Parser::UTF8Parser::UTF8Parser() : parser(), buf_len( 0 )
69	1.31k	{
70	1.31k	assert( BUF_SIZE >= (size_t)MB_CUR_MAX );
71	1.31k	buf[0] = '\0';
72	1.31k	}
73
74		void Parser::UTF8Parser::input( char c, Actions& ret )
75	7.82M	{
76	7.82M	assert( buf_len < BUF_SIZE );
77
78		/* 1-byte UTF-8 character, aka ASCII? Cheat. */
79	7.82M	if ( buf_len == 0 && static_cast<unsigned char>( c ) <= 0x7f ) {
80	7.04M	parser.input( static_cast<wchar_t>( c ), ret );
81	7.04M	return;
82	7.04M	}
83
84	784k	buf[buf_len++] = c;
85
86		/* This function will only work in a UTF-8 locale. */
87	784k	wchar_t pwc;
88	784k	mbstate_t ps = mbstate_t();
89
90	784k	size_t total_bytes_parsed = 0;
91	784k	size_t orig_buf_len = buf_len;
92
93		/* this routine is somewhat complicated in order to comply with
94		Unicode 6.0, section 3.9, "Best Practices for using U+FFFD" */
95
96	1.56M	while ( total_bytes_parsed != orig_buf_len ) {
97	784k	assert( total_bytes_parsed < orig_buf_len );
98	784k	assert( buf_len > 0 );
99	784k	size_t bytes_parsed = mbrtowc( &pwc, buf, buf_len, &ps );
100
101		/* this returns 0 when n = 0! */
102
103	784k	if ( bytes_parsed == 0 ) {
104		/* character was NUL, accept and clear buffer */
105	0	assert( buf_len == 1 );
106	0	buf_len = 0;
107	0	pwc = L'\0';
108	0	bytes_parsed = 1;
109	784k	} else if ( bytes_parsed == (size_t)-1 ) {
110		/* invalid sequence, use replacement character and try again with last char */
111	784k	assert( errno == EILSEQ );
112	784k	if ( buf_len > 1 ) {
113	0	buf[0] = buf[buf_len - 1];
114	0	bytes_parsed = buf_len - 1;
115	0	buf_len = 1;
116	784k	} else {
117	784k	buf_len = 0;
118	784k	bytes_parsed = 1;
119	784k	}
120	784k	pwc = (wchar_t)0xFFFD;
121	784k	} else if ( bytes_parsed == (size_t)-2 ) {
122		/* can't parse incomplete multibyte character */
123	0	total_bytes_parsed += buf_len;
124	0	continue;
125	0	} else {
126		/* parsed into pwc, accept */
127	0	assert( bytes_parsed <= buf_len );
128	0	memmove( buf, buf + bytes_parsed, buf_len - bytes_parsed );
129	0	buf_len = buf_len - bytes_parsed;
130	0	}
131
132		/* Cast to unsigned for checks, because some
133		platforms (e.g. ARM) use uint32_t as wchar_t,
134		causing compiler warning on "pwc > 0" check. */
135	784k	const uint32_t pwcheck = pwc;
136
137	784k	if ( pwcheck > 0x10FFFF ) { /* outside Unicode range */
138	0	pwc = (wchar_t)0xFFFD;
139	0	}
140
141	784k	if ( ( pwcheck >= 0xD800 ) && ( pwcheck <= 0xDFFF ) ) { /* surrogate code point */
142		/*
143		OS X unfortunately allows these sequences without EILSEQ, but
144		they are ill-formed UTF-8 and we shouldn't repeat them to the
145		user's terminal.
146		*/
147	0	pwc = (wchar_t)0xFFFD;
148	0	}
149
150	784k	parser.input( pwc, ret );
151
152	784k	total_bytes_parsed += bytes_parsed;
153	784k	}
154	784k	}
155
156	0	Parser::Parser::Parser( const Parser& other ) : state( other.state ) {}
157
158		Parser::Parser& Parser::Parser::operator=( const Parser& other )
159	0	{
160	0	state = other.state;
161	0	return *this;
162	0	}

Coverage Report

Created: 2026-06-15 06:22