using System.Diagnostics;
namespace dotless.Core.Parser
{
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using Exceptions;
using Infrastructure.Nodes;
using Utils;
[DebuggerDisplay("{Remaining}")]
public class Tokenizer
{
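// Controls chunking: 0 keeps the whole input as a single text chunk;
// any higher value makes SetupInput split the input into text, comment
// and quoted-string chunks.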
public int Optimization { get; set; }
private string _input; // LeSS input string
private List<Chunk> _chunks; // chunkified input
private int _i; // current index in `input`
private int _j; // current chunk
private int _current; // index of current chunk, in `input`
private int _lastCommentStart = -1; // the start of the last collection of comments
private int _lastCommentEnd = -1; // the end of the last collection of comments
private int _inputLength;
private readonly string _commentRegEx = @"(//[^\n]*|(/\*(.|[\r\n])*?\*/))";
private readonly string _quotedRegEx = @"(""((?:[^""\\\r\n]|\\.)*)""|'((?:[^'\\\r\n]|\\.)*)')";
private string _fileName;
// Cache Regex instances so repeated matching with the same pattern is cheap
private IDictionary<string, Regex> regexCache = new Dictionary<string, Regex>();
public Tokenizer(int optimization)
{
Optimization = optimization;
}
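// Normalises line endings, stores the input and, when optimization is
// enabled, splits it into chunks while validating that comments, quotes
// and braces are properly closed.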
public void SetupInput(string input, string fileName)
{
_fileName = fileName;
_i = _j = _current = 0;
_chunks = new List<Chunk>();
_input = input.Replace("\r\n", "\n");
_inputLength = _input.Length;
// When optimization is enabled, split the input into chunks:
// comments and quoted strings become their own chunks, and a
// text chunk is closed after each '}'.
if(Optimization == 0)
_chunks.Add(new Chunk(_input));
else
{
var skip = new Regex(@"\G(@\{[a-zA-Z0-9_-]+\}|[^\""'{}/\\\(\)]+)");
var comment = GetRegex(this._commentRegEx, RegexOptions.None);
var quotedstring = GetRegex(this._quotedRegEx, RegexOptions.None);
var level = 0;
var lastBlock = 0;
var inParam = false;
int i = 0;
while(i < _inputLength)
{
var match = skip.Match(_input, i);
if(match.Success)
{
Chunk.Append(match.Value, _chunks);
i += match.Length;
continue;
}
var c = _input[i];
if(i < _inputLength - 1 && c == '/')
{
var cc = _input[i + 1];
if ((!inParam && cc == '/') || cc == '*')
{
match = comment.Match(_input, i);
if(match.Success)
{
i += match.Length;
_chunks.Add(new Chunk(match.Value, ChunkType.Comment));
continue;
} else
{
throw new ParsingException("Missing closing comment", GetNodeLocation(i));
}
}
}
if(c == '"' || c == '\'')
{
match = quotedstring.Match(_input, i);
if(match.Success)
{
i += match.Length;
_chunks.Add(new Chunk(match.Value, ChunkType.QuotedString));
continue;
} else
{
throw new ParsingException(string.Format("Missing closing quote ({0})", c), GetNodeLocation(i));
}
}
// we are not in a quoted string or comment - process '{' level
if(!inParam && c == '{')
{
level++;
lastBlock = i;
}
else if (!inParam && c == '}')
{
level--;
if(level < 0)
throw new ParsingException("Unexpected '}'", GetNodeLocation(i));
Chunk.Append(c, _chunks, true);
i++;
continue;
}
if (c == '(')
{
inParam = true;
}
else if (c == ')')
{
inParam = false;
}
Chunk.Append(c, _chunks);
i++;
}
if(level > 0)
throw new ParsingException("Missing closing '}'", GetNodeLocation(lastBlock));
_input = Chunk.CommitAll(_chunks);
_inputLength = _input.Length;
}
Advance(0); // skip any whitespace characters at the start.
}
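// Returns the comment at the current position and advances past it, or
// null if the current position does not start a comment. Also records the
// span of the latest run of comments for GetPreviousCharIgnoringComments.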
public string GetComment()
{
// if we've hit the end we might still be looking at a valid chunk, so return early
if (_i == _inputLength) {
return null;
}
string val;
int startI = _i;
int endI = 0;
if (Optimization == 0)
{
if (this.CurrentChar != '/')
return null;
var comment = this.Match(this._commentRegEx);
if (comment == null)
{
return null;
}
val = comment.Value;
endI = startI + comment.Value.Length;
}
else
{
if (_chunks[_j].Type == ChunkType.Comment)
{
val = _chunks[_j].Value;
endI = _i + _chunks[_j].Value.Length;
Advance(_chunks[_j].Value.Length);
}
else
{
return null;
}
}
if (_lastCommentEnd != startI)
{
_lastCommentStart = startI;
}
_lastCommentEnd = endI;
return val;
}
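// Returns the quoted string at the current position and advances past it,
// or null if the current position does not start a quoted string.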
public string GetQuotedString()
{
// if we've hit the end we might still be looking at a valid chunk, so return early
if (_i == _inputLength) {
return null;
}
if (Optimization == 0) {
if (this.CurrentChar != '"' && this.CurrentChar != '\'')
return null;
var quotedstring = this.Match(this._quotedRegEx);
// Match returns null when the quote is never closed; guard against that.
return quotedstring == null ? null : quotedstring.Value;
} else {
if (_chunks[_j].Type == ChunkType.QuotedString) {
string val = _chunks[_j].Value;
Advance(_chunks[_j].Value.Length);
return val;
}
}
return null;
}
public string MatchString(char tok)
{
var c = Match(tok);
return c == null ? null : c.Value;
}
public string MatchString(string tok)
{
var match = Match(tok);
return match == null ? null : match.Value;
}
//
// Parse from a token, regexp or string, and move forward if match
//
public CharMatchResult Match(char tok)
{
if (_i == _inputLength || _chunks[_j].Type != ChunkType.Text) {
return null;
}
if (_input[_i] == tok)
{
var index = _i;
Advance(1);
return new CharMatchResult(tok) { Location = GetNodeLocation(index) };
}
return null;
}
public RegexMatchResult Match(string tok)
{
return Match(tok, false);
}
public RegexMatchResult Match(string tok, bool caseInsensitive)
{
if (_i == _inputLength || _chunks[_j].Type != ChunkType.Text) {
return null;
}
var options = RegexOptions.None;
if (caseInsensitive)
options |= RegexOptions.IgnoreCase;
var regex = GetRegex(tok, options);
var match = regex.Match(_chunks[_j].Value, _i - _current);
if (!match.Success)
return null;
var index = _i;
Advance(match.Length);
return new RegexMatchResult(match) {Location = GetNodeLocation(index)};
}
// Match a string, but include the possibility of matching quoted and comments
public RegexMatchResult MatchAny(string tok)
{
if (_i == _inputLength) {
return null;
}
var regex = GetRegex(tok, RegexOptions.None);
var match = regex.Match(_input, _i);
if (!match.Success)
return null;
Advance(match.Length);
if (_i > _current && _i < _current + _chunks[_j].Value.Length)
{
//If we absorbed the start of an inline comment then turn it into text so the rest can be absorbed
if (_chunks[_j].Type == ChunkType.Comment && _chunks[_j].Value.StartsWith("//"))
{
_chunks[_j].Type = ChunkType.Text;
}
}
return new RegexMatchResult(match);
}
public void Advance(int length)
{
if (_i == _inputLength) // nothing to advance past; for empty input there may be no chunks at all
return;
// The match is confirmed, add the match length to `i`,
// and consume any extra white-space characters (' ' || '\n')
// which come after that. The reason for this is that LeSS's
// grammar is mostly white-space insensitive.
_i += length;
var endIndex = _current + _chunks[_j].Value.Length;
while (true)
{
if(_i == _inputLength)
break;
if (_i >= endIndex)
{
if (_j < _chunks.Count - 1)
{
_current = endIndex;
endIndex += _chunks[++_j].Value.Length;
continue; // allow skipping multiple chunks
}
else
break;
}
if (!char.IsWhiteSpace(_input[_i]))
break;
_i++;
}
}
// Same as Match, but don't change the state of the parser,
// just return the match.
public bool Peek(char tok)
{
if (_i == _inputLength)
return false;
return _input[_i] == tok;
}
public bool Peek(string tok)
{
var regex = GetRegex(tok, RegexOptions.None);
var match = regex.Match(_input, _i);
return match.Success;
}
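// Peeks for `tok` after any comments at the current position, then
// restores the tokenizer to its previous location.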
public bool PeekAfterComments(char tok)
{
var memo = this.Location;
while(GetComment() != null);
var peekSuccess = Peek(tok);
this.Location = memo;
return peekSuccess;
}
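// Patterns are anchored with \G so matching starts exactly at the supplied
// index. Instances are cached by pattern only, so the options passed on the
// first call for a given pattern are the ones that stay cached.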
private Regex GetRegex(string pattern, RegexOptions options)
{
if (!regexCache.ContainsKey(pattern))
regexCache.Add(pattern, new Regex(@"\G" + pattern, options));
return regexCache[pattern];
}
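// Returns the character before the current position, skipping back over
// the most recently consumed run of comments.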
public char GetPreviousCharIgnoringComments()
{
if (_i == 0) {
return '\0';
}
if (_i != _lastCommentEnd) {
return PreviousChar;
}
int i = _lastCommentStart - 1;
if (i < 0) {
return '\0';
}
return _input[i];
}
public char PreviousChar
{
get { return _i == 0 ? '\0' : _input[_i - 1]; }
}
public char CurrentChar
{
get { return _i == _inputLength ? '\0' : _input[_i]; }
}
public char NextChar
{
get { return _i + 1 == _inputLength ? '\0' : _input[_i + 1]; }
}
public bool HasCompletedParsing()
{
return _i == _inputLength;
}
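// Snapshot of the current position (input index, chunk index and chunk
// start) so callers can save and restore the tokenizer state.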
public Location Location
{
get
{
return new Location
{
Index = _i,
CurrentChunk = _j,
CurrentChunkIndex = _current
};
}
set
{
_i = value.Index;
_j = value.CurrentChunk;
_current = value.CurrentChunkIndex;
}
}
public NodeLocation GetNodeLocation(int index)
{
return new NodeLocation(index, this._input, this._fileName);
}
public NodeLocation GetNodeLocation()
{
return GetNodeLocation(this.Location.Index);
}
private enum ChunkType
{
Text,
Comment,
QuotedString
}
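// A contiguous slice of the input classified as plain text, a comment or
// a quoted string. Text chunks are accumulated in a StringBuilder until
// CommitAll flushes them into Value.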
private class Chunk
{
private StringBuilder _builder;
public Chunk(string val)
{
Value = val;
Type = ChunkType.Text;
}
public Chunk(string val, ChunkType type)
{
Value = val;
Type = type;
}
public Chunk()
{
_builder = new StringBuilder();
Type = ChunkType.Text;
}
public ChunkType Type { get; set; }
public string Value { get; set; }
private bool _final;
public void Append(string str)
{
_builder.Append(str);
}
public void Append(char c)
{
_builder.Append(c);
}
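// Returns the last chunk if it is still an open (non-final) text chunk;
// otherwise starts a new text chunk.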
private static Chunk ReadyForText(List<Chunk> chunks)
{
Chunk last = chunks.LastOrDefault();
if (last == null || last.Type != ChunkType.Text || last._final == true)
{
last = new Chunk();
chunks.Add(last);
}
return last;
}
public static void Append(char c, List<Chunk> chunks, bool final)
{
Chunk chunk = ReadyForText(chunks);
chunk.Append(c);
chunk._final = final;
}
public static void Append(char c, List<Chunk> chunks)
{
Chunk chunk = ReadyForText(chunks);
chunk.Append(c);
}
public static void Append(string s, List<Chunk> chunks)
{
Chunk chunk = ReadyForText(chunks);
chunk.Append(s);
}
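// Flushes every builder-backed chunk into its Value and returns the
// chunks concatenated back into a single string.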
public static string CommitAll(List<Chunk> chunks)
{
StringBuilder all = new StringBuilder();
foreach(Chunk chunk in chunks)
{
if (chunk._builder != null)
{
string val = chunk._builder.ToString();
chunk._builder = null;
chunk.Value = val;
}
all.Append(chunk.Value);
}
return all.ToString();
}
}
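// The unparsed tail of the input, surfaced via the DebuggerDisplay attribute.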
private string Remaining
{
get { return _input.Substring(_i); }
}
}
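// A saved tokenizer position: the absolute input index, the current chunk
// and the index in the input at which that chunk starts.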
public class Location
{
public int Index { get; set; }
public int CurrentChunk { get; set; }
public int CurrentChunkIndex { get; set; }
}
}