From ea8406e36e98a78819129566b0b1414bbc4f800c Mon Sep 17 00:00:00 2001
From: Dylan Thacker-Smith
Date: Thu, 27 Feb 2014 15:19:27 -0500
Subject: [PATCH] Create a Liquid::Tokenizer class in the C extension.

---
 Rakefile                      |   1 +
 ext/liquid/liquid_ext.c       |   7 ++-
 ext/liquid/liquid_ext.h       |   8 +++
 ext/liquid/tokenizer.c        | 126 ++++++++++++++++++++++++++++++++++
 ext/liquid/tokenizer.h        |  27 +++++++
 lib/liquid.rb                 |   4 --
 lib/liquid/template.rb        |   9 +--
 test/liquid/template_test.rb  |  20 -----
 test/liquid/tokenizer_test.rb |  64 ++++++++++++++++
 9 files changed, 232 insertions(+), 34 deletions(-)
 create mode 100644 ext/liquid/liquid_ext.h
 create mode 100644 ext/liquid/tokenizer.c
 create mode 100644 ext/liquid/tokenizer.h
 create mode 100644 test/liquid/tokenizer_test.rb

diff --git a/Rakefile b/Rakefile
index b7f8f30..9371c5a 100755
--- a/Rakefile
+++ b/Rakefile
@@ -84,3 +84,4 @@ end
 Rake::ExtensionTask.new "liquid" do |ext|
   ext.lib_dir = "lib/liquid"
 end
+Rake::Task[:test].prerequisites << :compile
diff --git a/ext/liquid/liquid_ext.c b/ext/liquid/liquid_ext.c
index 4d7a47d..5dab635 100644
--- a/ext/liquid/liquid_ext.c
+++ b/ext/liquid/liquid_ext.c
@@ -1,6 +1,9 @@
-#include "ruby.h"
+#include "liquid_ext.h"
+
+VALUE mLiquid;
 
 void Init_liquid(void)
 {
-    VALUE mLiquid = rb_define_module("Liquid");
+    mLiquid = rb_define_module("Liquid");
+    init_liquid_tokenizer();
 }
diff --git a/ext/liquid/liquid_ext.h b/ext/liquid/liquid_ext.h
new file mode 100644
index 0000000..065a90f
--- /dev/null
+++ b/ext/liquid/liquid_ext.h
@@ -0,0 +1,8 @@
+#ifndef LIQUID_EXT_H
+#define LIQUID_EXT_H
+
+#include <ruby.h>
+#include <string.h>
+#include "tokenizer.h"
+
+#endif
diff --git a/ext/liquid/tokenizer.c b/ext/liquid/tokenizer.c
new file mode 100644
index 0000000..096306f
--- /dev/null
+++ b/ext/liquid/tokenizer.c
@@ -0,0 +1,126 @@
+#include "liquid_ext.h"
+
+VALUE cLiquidTokenizer;
+extern VALUE mLiquid;
+
+/* GC free callback: release the tokenizer struct itself. */
+static void free_tokenizer(void *ptr)
+{
+    struct liquid_tokenizer *tokenizer = ptr;
+    xfree(tokenizer);
+}
+
+/* GC mark callback: keep the source string alive while we hold raw
+ * pointers into its buffer. */
+static void mark_tokenizer(void *ptr)
+{
+    struct liquid_tokenizer *tokenizer = ptr;
+    rb_gc_mark(tokenizer->source);
+}
+
+static VALUE rb_allocate(VALUE klass)
+{
+    VALUE obj;
+    struct liquid_tokenizer *tokenizer;
+
+    obj = Data_Make_Struct(klass, struct liquid_tokenizer, mark_tokenizer, free_tokenizer, tokenizer);
+    return obj;
+}
+
+static VALUE rb_initialize(VALUE self, VALUE source)
+{
+    struct liquid_tokenizer *tokenizer;
+
+    Check_Type(source, T_STRING);
+    Data_Get_Struct(self, struct liquid_tokenizer, tokenizer);
+    tokenizer->source = source;
+    tokenizer->cursor = RSTRING_PTR(source);
+    tokenizer->length = RSTRING_LEN(source);
+    return self;
+}
+
+void liquid_tokenizer_next(struct liquid_tokenizer *tokenizer, struct token *token)
+{
+    if (tokenizer->length <= 0) {
+        memset(token, 0, sizeof(*token));
+        return;
+    }
+    token->type = TOKEN_STRING;
+
+    char *cursor = tokenizer->cursor;
+    char *last = tokenizer->cursor + tokenizer->length - 1;
+
+    while (cursor < last) {
+        if (*cursor++ != '{')
+            continue;
+
+        char c = *cursor++;
+        if (c != '%' && c != '{')
+            continue;
+        if (cursor - tokenizer->cursor > 2) {
+            token->type = TOKEN_STRING;
+            cursor -= 2;
+            goto found;
+        }
+        char *incomplete_end = cursor;
+        token->type = TOKEN_INVALID;
+        if (c == '%') {
+            while (cursor < last) {
+                if (*cursor++ != '%')
+                    continue;
+                c = *cursor++;
+                while (c == '%' && cursor <= last)
+                    c = *cursor++;
+                if (c != '}')
+                    continue;
+                token->type = TOKEN_TAG;
+                goto found;
+            }
+            // FIXME: Handle syntax error for strict mode
+            cursor = incomplete_end;
+            goto found;
+        } else {
+            while (cursor < last) {
+                if (*cursor++ != '}')
+                    continue;
+                if (*cursor++ != '}') {
+                    incomplete_end = cursor - 1;
+                    continue;
+                }
+                token->type = TOKEN_VARIABLE;
+                goto found;
+            }
+            // FIXME: Handle syntax error for strict mode
+            cursor = incomplete_end;
+            goto found;
+        }
+    }
+    cursor = last + 1;
+found:
+    token->str = tokenizer->cursor;
+    token->length = cursor - tokenizer->cursor;
+    tokenizer->cursor += token->length;
+    tokenizer->length -= token->length;
+}
+
+static VALUE rb_next(VALUE self)
+{
+    struct liquid_tokenizer *tokenizer;
+    Data_Get_Struct(self, struct liquid_tokenizer, tokenizer);
+
+    struct token token;
+    liquid_tokenizer_next(tokenizer, &token);
+    if (token.type == TOKEN_NONE)
+        return Qnil;
+
+    return rb_str_new(token.str, token.length);
+}
+
+void init_liquid_tokenizer(void)
+{
+    cLiquidTokenizer = rb_define_class_under(mLiquid, "Tokenizer", rb_cObject);
+    rb_define_alloc_func(cLiquidTokenizer, rb_allocate);
+    rb_define_method(cLiquidTokenizer, "initialize", rb_initialize, 1);
+    rb_define_method(cLiquidTokenizer, "next", rb_next, 0);
+    rb_define_alias(cLiquidTokenizer, "shift", "next");
+}
diff --git a/ext/liquid/tokenizer.h b/ext/liquid/tokenizer.h
new file mode 100644
index 0000000..83b7351
--- /dev/null
+++ b/ext/liquid/tokenizer.h
@@ -0,0 +1,27 @@
+#ifndef LIQUID_TOKENIZER_H
+#define LIQUID_TOKENIZER_H
+
+enum token_type {
+    TOKEN_NONE,
+    TOKEN_INVALID,
+    TOKEN_STRING,
+    TOKEN_TAG,
+    TOKEN_VARIABLE
+};
+
+struct token {
+    enum token_type type;
+    char *str;
+    long length;
+};
+
+struct liquid_tokenizer {
+    VALUE source; /* marked by the GC so cursor stays valid */
+    char *cursor;
+    long length;
+};
+
+void init_liquid_tokenizer(void);
+void liquid_tokenizer_next(struct liquid_tokenizer *tokenizer, struct token *token);
+
+#endif
diff --git a/lib/liquid.rb b/lib/liquid.rb
index 6bcf43e..2f29b37 100644
--- a/lib/liquid.rb
+++ b/lib/liquid.rb
@@ -30,7 +30,6 @@ module Liquid
   VariableSegment = /[\w\-]/
   VariableStart = /\{\{/
   VariableEnd = /\}\}/
-  VariableIncompleteEnd = /\}\}?/
   QuotedString = /"[^"]*"|'[^']*'/
   QuotedFragment = /#{QuotedString}|(?:[^\s,\|'"]|#{QuotedString})+/o
   StrictQuotedFragment = /"[^"]+"|'[^']+'|[^\s|:,]+/
@@ -39,9 +38,6 @@ module Liquid
   SpacelessFilter = /\A(?:'[^']+'|"[^"]+"|[^'"])*#{FilterSeparator}(?:#{StrictQuotedFragment})(?:#{FirstFilterArgument}(?:#{OtherFilterArgument})*)?/o
   Expression = /(?:#{QuotedFragment}(?:#{SpacelessFilter})*)/o
   TagAttributes = /(\w+)\s*\:\s*(#{QuotedFragment})/o
-  AnyStartingTag = /\{\{|\{\%/
-  PartialTemplateParser = /#{TagStart}.*?#{TagEnd}|#{VariableStart}.*?#{VariableIncompleteEnd}/o
-  TemplateParser = /(#{PartialTemplateParser}|#{AnyStartingTag})/o
   VariableParser = /\[[^\]]+\]|#{VariableSegment}+\??/o
 
 end
diff --git a/lib/liquid/template.rb b/lib/liquid/template.rb
index 1f2bfd1..feb6ccf 100644
--- a/lib/liquid/template.rb
+++ b/lib/liquid/template.rb
@@ -162,16 +162,9 @@ module Liquid
 
     private
 
-    # Uses the Liquid::TemplateParser regexp to tokenize the passed source
     def tokenize(source)
       source = source.source if source.respond_to?(:source)
-      return [] if source.to_s.empty?
-      tokens = source.split(TemplateParser)
-
-      # removes the rogue empty element at the beginning of the array
-      tokens.shift if tokens[0] and tokens[0].empty?
-
-      tokens
+      Tokenizer.new(source.to_s)
     end
   end
 end
diff --git a/test/liquid/template_test.rb b/test/liquid/template_test.rb
index f9afac1..abe2af0 100644
--- a/test/liquid/template_test.rb
+++ b/test/liquid/template_test.rb
@@ -25,26 +25,6 @@ end
 class TemplateTest < Test::Unit::TestCase
   include Liquid
 
-  def test_tokenize_strings
-    assert_equal [' '], Template.new.send(:tokenize, ' ')
-    assert_equal ['hello world'], Template.new.send(:tokenize, 'hello world')
-  end
-
-  def test_tokenize_variables
-    assert_equal ['{{funk}}'], Template.new.send(:tokenize, '{{funk}}')
-    assert_equal [' ', '{{funk}}', ' '], Template.new.send(:tokenize, ' {{funk}} ')
-    assert_equal [' ', '{{funk}}', ' ', '{{so}}', ' ', '{{brother}}', ' '], Template.new.send(:tokenize, ' {{funk}} {{so}} {{brother}} ')
-    assert_equal [' ', '{{ funk }}', ' '], Template.new.send(:tokenize, ' {{ funk }} ')
-  end
-
-  def test_tokenize_blocks
-    assert_equal ['{%comment%}'], Template.new.send(:tokenize, '{%comment%}')
-    assert_equal [' ', '{%comment%}', ' '], Template.new.send(:tokenize, ' {%comment%} ')
-
-    assert_equal [' ', '{%comment%}', ' ', '{%endcomment%}', ' '], Template.new.send(:tokenize, ' {%comment%} {%endcomment%} ')
-    assert_equal [' ', '{% comment %}', ' ', '{% endcomment %}', ' '], Template.new.send(:tokenize, " {% comment %} {% endcomment %} ")
-  end
-
   def test_instance_assigns_persist_on_same_template_object_between_parses
     t = Template.new
     assert_equal 'from instance assigns', t.parse("{% assign foo = 'from instance assigns' %}{{ foo }}").render
diff --git a/test/liquid/tokenizer_test.rb b/test/liquid/tokenizer_test.rb
new file mode 100644
index 0000000..82b4116
--- /dev/null
+++ b/test/liquid/tokenizer_test.rb
@@ -0,0 +1,64 @@
+require 'test_helper'
+
+class TokenizerTest < Test::Unit::TestCase
+  def test_tokenize_strings
+    assert_equal [' '], tokenize(' ')
+    assert_equal ['hello world'], tokenize('hello world')
+  end
+
+  def test_tokenize_variables
+    assert_equal ['{{funk}}'], tokenize('{{funk}}')
+    assert_equal [' ', '{{funk}}', ' '], tokenize(' {{funk}} ')
+    assert_equal [' ', '{{funk}}', ' ', '{{so}}', ' ', '{{brother}}', ' '], tokenize(' {{funk}} {{so}} {{brother}} ')
+    assert_equal [' ', '{{ funk }}', ' '], tokenize(' {{ funk }} ')
+  end
+
+  def test_tokenize_blocks
+    assert_equal ['{%comment%}'], tokenize('{%comment%}')
+    assert_equal [' ', '{%comment%}', ' '], tokenize(' {%comment%} ')
+
+    assert_equal [' ', '{%comment%}', ' ', '{%endcomment%}', ' '], tokenize(' {%comment%} {%endcomment%} ')
+    assert_equal [' ', '{% comment %}', ' ', '{% endcomment %}', ' '], tokenize(" {% comment %} {% endcomment %} ")
+  end
+
+  def test_tokenize_incomplete_end
+    assert_tokens 'before{{ incomplete }after', ['before', '{{ incomplete }', 'after']
+    assert_tokens 'before{% incomplete %after', ['before', '{%', ' incomplete %after']
+  end
+
+  def test_tokenize_no_end
+    assert_tokens 'before{{ unterminated ', ['before', '{{', ' unterminated ']
+    assert_tokens 'before{% unterminated ', ['before', '{%', ' unterminated ']
+  end
+
+  private
+
+  def assert_tokens(source, expected)
+    assert_equal expected, tokenize(source)
+    assert_equal expected, old_tokenize(source)
+  end
+
+  def tokenize(source)
+    tokenizer = Liquid::Tokenizer.new(source)
+    tokens = []
+    while token = tokenizer.next
+      tokens << token
+    end
+    tokens
+  end
+
+  AnyStartingTag = /\{\{|\{\%/
+  VariableIncompleteEnd = /\}\}?/
+  PartialTemplateParser = /#{Liquid::TagStart}.*?#{Liquid::TagEnd}|#{Liquid::VariableStart}.*?#{VariableIncompleteEnd}/o
+  TemplateParser = /(#{PartialTemplateParser}|#{AnyStartingTag})/o
+
+  def old_tokenize(source)
+    return [] if source.to_s.empty?
+    tokens = source.split(TemplateParser)
+
+    # removes the rogue empty element at the beginning of the array
+    tokens.shift if tokens[0] and tokens[0].empty?
+
+    tokens
+  end
+end