Hello and welcome to our community! Is this your first visit?
Register
Enjoy an ad free experience by logging in. Not a member yet? Register.
Results 1 to 7 of 7
  1. #1
    hgs
    hgs is offline
    New Coder
    Join Date
    Jan 2010
    Location
    Germany
    Posts
    83
    Thanks
    3
    Thanked 5 Times in 5 Posts

    simple parser / tokenizer

    Below you will find the code for a very simple parser.
    A working demo and all sources can be found at http://hgsweb.de

    Code:
    <!DOCTYPE html>
    <html lang="en">
        <head>
            <!-- Declare the encoding before any other head content. -->
            <meta charset="UTF-8">
            <title>Parser</title>
        </head>
        <body>
            <!-- Demo page: type a pattern, press "Parse", and the token list is written into #out. -->
            <h2>just enter something below and see how it is parsed.</h2>
            <!-- Properly closed paragraph instead of a dangling <p> used as a line break. -->
            <p>Source code for this small parser is within this page. Have fun!</p>
            <input id="pattern" type="text" size="80" maxlength="80" value="_word 'string' 333 3.14 !">
            <button type="button" onclick="parseIt();">Parse</button>
            <!-- parseIt() replaces the content of this element with the token listing. -->
            <div id="out"></div>
        </body>
    </html>
    <script>
        /**
         * Creates a tiny hand-written tokenizer.
         *
         * The returned object exposes three functions:
         *   - parseConfig(): builds a fresh parser state object ("pf").
         *   - parseInit(pf, astring): points the state at a new input string.
         *   - parse(pf): reads the next token, stores it in pf and returns pf.
         *
         * After parse(pf):
         *   - pf.token  is the token type: 0=eof 1=char 2=num 3=float 4=word 5=string
         *   - pf.a_out  is the token text (for strings: the content without the
         *               quotes, with escape characters resolved)
         *   - pf.offset is the index at which the next call to parse() continues
         *   - pf.stop   is the quote character of the most recent string token
         *
         * @returns {{parseConfig: function, parseInit: function, parse: function}}
         */
        function myParser() {
            // Token type codes stored in pf.token.
            var TOKEN_EOF = 0;
            var TOKEN_CHAR = 1;
            var TOKEN_NUM = 2;
            var TOKEN_FLOAT = 3;
            var TOKEN_WORD = 4;
            var TOKEN_STRING = 5;

            /** True for the ASCII digits 0-9. */
            function isDigit(cc) {
                return cc >= "0" && cc <= "9";
            }

            /** True for characters that may start a word: A-Z, a-z and underscore. */
            function isWordStart(cc) {
                return (cc >= "A" && cc <= "Z") || (cc >= "a" && cc <= "z") || cc === "_";
            }

            /** True for characters that may continue a word: word-start characters and digits. */
            function isWordPart(cc) {
                return isWordStart(cc) || isDigit(cc);
            }

            /**
             * Builds a new parser state object with default settings.
             *
             * Note: parse() only consults escape_char; string_delim and state are
             * stored but not read by the tokenizer (the quote characters ' and "
             * are hard-coded in parse()).
             *
             * @returns {Object} fresh parser state
             */
            function parse_config()
            {
                var pf = {};
                pf.s2p = "";                  // string to parse
                pf.offset = 0;                // index of the next unread character
                pf.token = TOKEN_EOF;         // 0=eof 1=char 2=num 3=float 4=word 5=string
                pf.a_out = '';                // text of the most recent token
                pf.stop = '';                 // quote character of the most recent string token
                pf.escape_char = "\\";
                pf.string_delim = "'|\"";
                pf.len = 0;                   // length of s2p
                pf.state = 0;
                return pf;
            }

            /**
             * Resets the state so that parsing starts at the beginning of astring.
             *
             * @param {Object} pf parser state created by parse_config()
             * @param {string} astring text to tokenize
             * @returns {Object} the same pf object
             */
            function parse_init(pf, astring)
            {
                pf.offset = 0;
                pf.len = astring.length;
                pf.s2p = astring;
                return pf;
            }

            /**
             * Reads the next token starting at pf.offset.
             *
             * Characters <= ' ' (whitespace and control characters) and backticks
             * are skipped between tokens. An unterminated string yields a string
             * token containing everything up to the end of the input.
             *
             * @param {Object} pf parser state prepared by parse_init()
             * @returns {Object} the same pf object with token, a_out and offset updated
             */
            function parse(pf)
            {
                var src = pf.s2p;
                var len = pf.len;
                var i = pf.offset;
                var cc, quote, infloat;

                // Nothing left to read: report end of input without touching the offset.
                if (i >= len) {
                    pf.token = TOKEN_EOF;
                    return pf;
                }

                // Skip separators until the first character of the next token.
                for (; i < len; i++) {
                    cc = src.charAt(i);
                    if (!(cc <= ' ' || cc === '`')) {
                        break;
                    }
                }
                if (i >= len) {
                    pf.token = TOKEN_EOF;
                    pf.offset = len;
                    return pf;
                }

                if (isWordStart(cc)) {
                    pf.a_out = cc;
                    for (i++; i < len; i++) {
                        cc = src.charAt(i);
                        if (!isWordPart(cc)) {
                            break;
                        }
                        pf.a_out += cc;
                    }
                    pf.offset = i;
                    pf.token = TOKEN_WORD;
                    return pf;
                }

                if (isDigit(cc)) {
                    pf.a_out = cc;
                    pf.token = TOKEN_NUM;
                    infloat = false;
                    for (i++; i < len; i++) {
                        cc = src.charAt(i);
                        if (isDigit(cc)) {
                            pf.a_out += cc;
                        } else if (cc === "." && !infloat) {
                            // The first dot turns the number into a float; a second dot ends it.
                            infloat = true;
                            pf.a_out += cc;
                            pf.token = TOKEN_FLOAT;
                        } else {
                            break;
                        }
                    }
                    pf.offset = i;
                    return pf;
                }

                if (cc === "'" || cc === '"') {
                    quote = cc;
                    pf.stop = quote;
                    pf.a_out = "";
                    pf.token = TOKEN_STRING;
                    for (i++; i < len; i++) {
                        cc = src.charAt(i);
                        if (cc === pf.escape_char) {
                            // Take the character after the escape literally
                            // (charAt yields "" if the escape is the last character).
                            i++;
                            pf.a_out += src.charAt(i);
                            continue;
                        }
                        if (cc === quote) {
                            pf.offset = i + 1;
                            return pf;
                        }
                        pf.a_out += cc;
                    }
                    // Unterminated string. A trailing escape character advances i
                    // past the end, so clamp to keep the offset within the input.
                    pf.offset = Math.min(i, len);
                    return pf;
                }

                // Anything else is returned as a single-character token.
                pf.a_out = cc;
                pf.offset = i + 1;
                pf.token = TOKEN_CHAR;
                return pf;
            }

            return {
                parseConfig: parse_config,
                parseInit: parse_init,
                parse: parse
            };
        }
        /**
         * Click handler for the demo page: tokenizes the text in the #pattern
         * input and writes one line per token into the #out element.
         *
         * The token text comes straight from user input, so it is HTML-escaped
         * before being inserted via innerHTML; otherwise a quoted string token
         * such as '<img src=x onerror=...>' would be interpreted as markup.
         */
        function parseIt() {
            // Local declarations: the original leaked "parser" and "pf" as implicit globals.
            var parser = myParser();
            var pf = parser.parseInit(parser.parseConfig(), document.getElementById('pattern').value);
            var html = 'token values ; 1=char 2=num 3=float 4=word 5=string</p>';

            /** Escapes the characters that are significant in HTML text and attributes. */
            function escapeHtml(text) {
                return String(text)
                    .replace(/&/g, '&amp;')
                    .replace(/</g, '&lt;')
                    .replace(/>/g, '&gt;')
                    .replace(/"/g, '&quot;');
            }

            pf = parser.parse(pf);
            while (pf.token !== 0) {
                html += 'token=' + pf.token + ' pattern: ' + escapeHtml(pf.a_out) + '<br>';
                pf = parser.parse(pf);
            }
            // Assign once instead of re-parsing the element's markup on every token.
            document.getElementById('out').innerHTML = html;
        }
    </script>
    My site
    If you’re doing software development right, you’re probably doing Agile wrong.
    -- Isaac Schlueter

  • #2
    Moderator
    Join Date
    May 2002
    Location
    Hayward, CA
    Posts
    1,461
    Thanks
    1
    Thanked 23 Times in 21 Posts
    I might have something better. Certainly more suited to my needs:

    http://sourceforge.net/p/verbosio/te...kenIterator.js

    Tests are at http://sourceforge.net/p/verbosio/te...kenIterator.js .

    The commit message says it best: "Implement a quick & dirty 'TokenIterator' class for fast string searching, and skipping over substrings we don't care about."

    Yours does go character by character, so with just-in-time compilation it might run faster than my code. A few comments would've been helpful. (Also, because I use Map(), a hashtable for JS that isn't available in IE yet, mine's not cross-platform.)
    "The first step to confirming there is a bug in someone else's work is confirming there are no bugs in your own."
    June 30, 2001
    author, Verbosio prototype XML Editor
    author, JavaScript Developer's Dictionary
    https://alexvincent.us/blog

  • #3
    hgs
    hgs is offline
    New Coder
    Join Date
    Jan 2010
    Location
    Germany
    Posts
    83
    Thanks
    3
    Thanked 5 Times in 5 Posts
    Thanks for commenting.
    The point 'A few comments would've been helpful' is taken.

    Now, I looked at your code and I have no clue how that might work.

    Can you give me a small example of how it is used and called?

    With that I could step through it in a debugger in order to understand
    how it works.

    Regards
    My site
    If you’re doing software development right, you’re probably doing Agile wrong.
    -- Isaac Schlueter

  • #4
    Senior Coder rnd me's Avatar
    Join Date
    Jun 2007
    Location
    Urbana
    Posts
    4,373
    Thanks
    11
    Thanked 592 Times in 572 Posts
    here's a REALLY simple one for CSV files:


    Code:
    /**
     * Parses CSV text into an array of rows, each row being an array of
     * field strings.
     *
     * Rules:
     *   - Fields are separated by commas, rows by "\n" or "\r\n".
     *   - A double-quoted section may contain commas and line breaks.
     *   - Inside a quoted section, two consecutive quotes ("") stand for one
     *     literal quote character.
     *   - The text after the last line break always forms a final row, so
     *     input ending in a line break produces a trailing row [""], and
     *     empty input produces [[""]].
     *
     * @param {string} str CSV text
     * @returns {string[][]} parsed rows
     */
    function parseCSV(str){
      var rows = [], row = [], field = "", inQuotes = false;
      for (var i = 0; i < str.length; i++) {
        var ch = str[i];
        if (inQuotes) {
          if (ch !== '"') {
            // Commas and line breaks are ordinary data inside quotes.
            field += ch;
          } else if (str[i + 1] === '"') {
            // Doubled quote: emit one literal quote and skip its partner.
            field += '"';
            i++;
          } else {
            inQuotes = false;
          }
          continue;
        }
        if (ch === '"') { inQuotes = true; continue; }
        if (ch === ",") { row.push(field); field = ""; continue; }
        // Treat "\r\n" as a single row terminator: drop the "\r".
        if (ch === "\r" && str[i + 1] === "\n") { continue; }
        if (ch === "\n") { row.push(field); rows.push(row); field = ""; row = []; continue; }
        field += ch;
      }
      row.push(field); rows.push(row);
      return rows;
    }
    my site (updated 13/9/26)
    BROWSER STATS [% share] (2014/9/03) IE7:0.1, IE8:4.6, IE11:9.1, IE9:3.1, IE10:3.0, FF:17.2, CH:46, SF:11.4, NON-MOUSE:38%

  • #5
    Moderator
    Join Date
    May 2002
    Location
    Hayward, CA
    Posts
    1,461
    Thanks
    1
    Thanked 23 Times in 21 Posts
    Quote Originally Posted by hgs View Post
    Thanks for commenting.
    The point 'A few comments would've been helpful' is taken.

    Now, I looked at your code and I have no clue how that might work.

    Can you give me a small example on how it is used, called ?

    With that I could debug into it in order to understand
    how it work ?

    Regards

    That's a fair request. Basically, if I call:
    Code:
    // Usage example for the poster's TokenIterator class.
    // NOTE(review): TokenIterator and doSomethingElse() are not defined in this
    // snippet; they must be provided elsewhere for the example to run.
    // Rules are registered in order; according to the accompanying post the
    // ordering of rules is significant.
    var QuoteIterator;
    QuoteIterator = new TokenIterator();
    QuoteIterator.addPair('"', '"'); // skip over anything contained in double quotes
    QuoteIterator.addPair("'", "'"); // skip over anything contained in single quotes
    // The callback's return value is presumably the position at which reading resumes — confirm against TokenIterator.
    QuoteIterator.addCallback("&#", function(iter) { return iter.position; }); // continue after the "&#" characters
    QuoteIterator.addCallback("&", function(iter) {
      // If we hit an ampersand, do something else.
      var advance = doSomethingElse();
      // Ends the scan, then reports the position offset by whatever doSomethingElse() returned.
      iter.stopReading();
      return iter.position + advance;
    });
    
    // Sample input contains a single-quoted entity, a double-quoted entity and a bare "&#" sequence.
    QuoteIterator.startReading("  '&hello;'  \"&goodbye;\"  &#a0   ", 0); // start reading the string at character 0.
    The first rule says anything in double quotes is ignored, so we skip over &goodbye;.
    The second rule says anything in single quotes is ignored, so we skip over &hello;.
    The third rule says an ampersand followed by a pound sign ("&#") is ignored.
    The fourth rule (which never gets invoked because the first three take precedence) says to do something else when we hit just an ampersand.

    Does that help? The ordering of rules is significant, and I can skip over large swaths of strings that I really don't care about parsing.

    Incidentally, I wrote this code last night because I realized I had a pressing need; it wasn't in response to your submission.
    Last edited by Alex Vincent; 10-11-2013 at 12:19 AM.
    "The first step to confirming there is a bug in someone else's work is confirming there are no bugs in your own."
    June 30, 2001
    author, Verbosio prototype XML Editor
    author, JavaScript Developer's Dictionary
    https://alexvincent.us/blog

  • #6
    hgs
    hgs is offline
    New Coder
    Join Date
    Jan 2010
    Location
    Germany
    Posts
    83
    Thanks
    3
    Thanked 5 Times in 5 Posts
    Thank you for the example.
    It now gives me an idea of how this is supposed to work.

    Unfortunately I can't execute the given example: in Chrome and
    IE10 it fails with "Uncaught ReferenceError: Map is not defined".

    IE10 also reports a syntax error for the line "const l = data.length;".

    Regards
    My site
    If you’re doing software development right, you’re probably doing Agile wrong.
    -- Isaac Schlueter

  • #7
    Moderator
    Join Date
    May 2002
    Location
    Hayward, CA
    Posts
    1,461
    Thanks
    1
    Thanked 23 Times in 21 Posts
    I expected the Map failure. Since the first argument is always a string, you could use a JS object as a dictionary. ({"foo": "bar", "baz": function() {/*...*/}}).

    For const, var should do just fine.
    "The first step to confirming there is a bug in someone else's work is confirming there are no bugs in your own."
    June 30, 2001
    author, Verbosio prototype XML Editor
    author, JavaScript Developer's Dictionary
    https://alexvincent.us/blog


  •  

    Posting Permissions

    • You may not post new threads
    • You may not post replies
    • You may not post attachments
    • You may not edit your posts
    •