Create a Liquid::Tokenizer class in the C extension.

Dylan Thacker-Smith
2014-02-27 15:19:27 -05:00
parent 8bb3bca64a
commit ea8406e36e
9 changed files with 220 additions and 34 deletions
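
In rough terms, the new class replaces Template#tokenize's regexp splitting: construct it with the template source and call next (aliased as shift) until it returns nil. A minimal usage sketch, not part of this diff, assuming only the API defined in ext/liquid/tokenizer.c below (the require path is an assumption about how the gem loads the compiled extension):

    require 'liquid'  # assumed entry point that loads the compiled extension

    tokenizer = Liquid::Tokenizer.new("Hello {{ name }}! {% if admin %}hi{% endif %}")

    # next/shift return one token string at a time, and nil once the source is consumed
    tokens = []
    while token = tokenizer.next
      tokens << token
    end
    tokens
    # => ["Hello ", "{{ name }}", "! ", "{% if admin %}", "hi", "{% endif %}"]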


@@ -84,3 +84,4 @@ end
Rake::ExtensionTask.new "liquid" do |ext|
  ext.lib_dir = "lib/liquid"
end
Rake::Task[:test].prerequisites << :compile


@@ -1,6 +1,9 @@
#include "ruby.h"
#include "liquid_ext.h"
VALUE mLiquid;
void Init_liquid(void)
{
    VALUE mLiquid = rb_define_module("Liquid");
    mLiquid = rb_define_module("Liquid");
    init_liquid_tokenizer();
}

ext/liquid/liquid_ext.h (new file, 8 lines)

@@ -0,0 +1,8 @@
#ifndef LIQUID_EXT_H
#define LIQUID_EXT_H
#include <ruby.h>
#include <stdbool.h>
#include "tokenizer.h"
#endif

ext/liquid/tokenizer.c (new file, 115 lines)

@@ -0,0 +1,115 @@
#include "liquid_ext.h"
VALUE cLiquidTokenizer;
extern VALUE mLiquid;
static void free_tokenizer(void *ptr)
{
    struct liquid_tokenizer *tokenizer = ptr;
    xfree(tokenizer);
}
static VALUE rb_allocate(VALUE klass)
{
    VALUE obj;
    struct liquid_tokenizer *tokenizer;
    obj = Data_Make_Struct(klass, struct liquid_tokenizer, NULL, free_tokenizer, tokenizer);
    return obj;
}
static VALUE rb_initialize(VALUE self, VALUE source)
{
    struct liquid_tokenizer *tokenizer;
    Check_Type(source, T_STRING);
    Data_Get_Struct(self, struct liquid_tokenizer, tokenizer);
    // The tokenizer borrows the string's buffer, so the source string must
    // stay alive (and unmodified) for as long as the tokenizer is used.
    tokenizer->cursor = RSTRING_PTR(source);
    tokenizer->length = RSTRING_LEN(source);
    return Qnil;
}
void liquid_tokenizer_next(struct liquid_tokenizer *tokenizer, struct token *token)
{
    if (tokenizer->length <= 0) {
        memset(token, 0, sizeof(*token));
        return;
    }
    token->type = TOKEN_STRING;

    char *cursor = tokenizer->cursor;
    char *last = tokenizer->cursor + tokenizer->length - 1;

    while (cursor < last) {
        if (*cursor++ != '{')
            continue;
        char c = *cursor++;
        if (c != '%' && c != '{')
            continue;
        // A tag or variable starts after some leading text: emit the text as
        // a string token first.
        if (cursor - tokenizer->cursor > 2) {
            token->type = TOKEN_STRING;
            cursor -= 2;
            goto found;
        }
        char *incomplete_end = cursor;
        token->type = TOKEN_INVALID;
        if (c == '%') {
            // Tag: scan for the closing "%}".
            while (cursor < last) {
                if (*cursor++ != '%')
                    continue;
                c = *cursor++;
                while (c == '%' && cursor <= last)
                    c = *cursor++;
                if (c != '}')
                    continue;
                token->type = TOKEN_TAG;
                goto found;
            }
            // FIXME: Handle syntax error for strict mode
            cursor = incomplete_end;
            goto found;
        } else {
            // Variable: scan for the closing "}}".
            while (cursor < last) {
                if (*cursor++ != '}')
                    continue;
                if (*cursor++ != '}') {
                    incomplete_end = cursor - 1;
                    continue;
                }
                token->type = TOKEN_VARIABLE;
                goto found;
            }
            // FIXME: Handle syntax error for strict mode
            cursor = incomplete_end;
            goto found;
        }
    }
    // No tag or variable start found: the rest of the source is one string token.
    cursor = last + 1;
found:
    token->str = tokenizer->cursor;
    token->length = cursor - tokenizer->cursor;
    tokenizer->cursor += token->length;
    tokenizer->length -= token->length;
}
static VALUE rb_next(VALUE self)
{
    struct liquid_tokenizer *tokenizer;
    Data_Get_Struct(self, struct liquid_tokenizer, tokenizer);

    struct token token;
    liquid_tokenizer_next(tokenizer, &token);
    if (token.type == TOKEN_NONE)
        return Qnil;
    return rb_str_new(token.str, token.length);
}
void init_liquid_tokenizer()
{
    cLiquidTokenizer = rb_define_class_under(mLiquid, "Tokenizer", rb_cObject);
    rb_define_alloc_func(cLiquidTokenizer, rb_allocate);
    rb_define_method(cLiquidTokenizer, "initialize", rb_initialize, 1);
    rb_define_method(cLiquidTokenizer, "next", rb_next, 0);
    rb_define_alias(cLiquidTokenizer, "shift", "next");
}

ext/liquid/tokenizer.h (new file, 26 lines)

@@ -0,0 +1,26 @@
#ifndef LIQUID_TOKENIZER_H
#define LIQUID_TOKENIZER_H

enum token_type {
    TOKEN_NONE,
    TOKEN_INVALID,
    TOKEN_STRING,
    TOKEN_TAG,
    TOKEN_VARIABLE
};

struct token {
    enum token_type type;
    char *str;
    int length;
};

struct liquid_tokenizer {
    char *cursor;
    int length;
};

void init_liquid_tokenizer();
void liquid_tokenizer_next(struct liquid_tokenizer *tokenizer, struct token *token);

#endif


@@ -30,7 +30,6 @@ module Liquid
  VariableSegment = /[\w\-]/
  VariableStart = /\{\{/
  VariableEnd = /\}\}/
  VariableIncompleteEnd = /\}\}?/
  QuotedString = /"[^"]*"|'[^']*'/
  QuotedFragment = /#{QuotedString}|(?:[^\s,\|'"]|#{QuotedString})+/o
  StrictQuotedFragment = /"[^"]+"|'[^']+'|[^\s|:,]+/
@@ -39,9 +38,6 @@ module Liquid
  SpacelessFilter = /\A(?:'[^']+'|"[^"]+"|[^'"])*#{FilterSeparator}(?:#{StrictQuotedFragment})(?:#{FirstFilterArgument}(?:#{OtherFilterArgument})*)?/o
  Expression = /(?:#{QuotedFragment}(?:#{SpacelessFilter})*)/o
  TagAttributes = /(\w+)\s*\:\s*(#{QuotedFragment})/o
  AnyStartingTag = /\{\{|\{\%/
  PartialTemplateParser = /#{TagStart}.*?#{TagEnd}|#{VariableStart}.*?#{VariableIncompleteEnd}/o
  TemplateParser = /(#{PartialTemplateParser}|#{AnyStartingTag})/o
  VariableParser = /\[[^\]]+\]|#{VariableSegment}+\??/o
end


@@ -162,16 +162,9 @@ module Liquid
    private

    # Uses the <tt>Liquid::TemplateParser</tt> regexp to tokenize the passed source
    def tokenize(source)
      source = source.source if source.respond_to?(:source)
      return [] if source.to_s.empty?
      tokens = source.split(TemplateParser)
      # removes the rogue empty element at the beginning of the array
      tokens.shift if tokens[0] and tokens[0].empty?
      tokens
      Tokenizer.new(source.to_s)
    end
  end


@@ -25,26 +25,6 @@ end
class TemplateTest < Test::Unit::TestCase
  include Liquid

  def test_tokenize_strings
    assert_equal [' '], Template.new.send(:tokenize, ' ')
    assert_equal ['hello world'], Template.new.send(:tokenize, 'hello world')
  end

  def test_tokenize_variables
    assert_equal ['{{funk}}'], Template.new.send(:tokenize, '{{funk}}')
    assert_equal [' ', '{{funk}}', ' '], Template.new.send(:tokenize, ' {{funk}} ')
    assert_equal [' ', '{{funk}}', ' ', '{{so}}', ' ', '{{brother}}', ' '], Template.new.send(:tokenize, ' {{funk}} {{so}} {{brother}} ')
    assert_equal [' ', '{{ funk }}', ' '], Template.new.send(:tokenize, ' {{ funk }} ')
  end

  def test_tokenize_blocks
    assert_equal ['{%comment%}'], Template.new.send(:tokenize, '{%comment%}')
    assert_equal [' ', '{%comment%}', ' '], Template.new.send(:tokenize, ' {%comment%} ')
    assert_equal [' ', '{%comment%}', ' ', '{%endcomment%}', ' '], Template.new.send(:tokenize, ' {%comment%} {%endcomment%} ')
    assert_equal [' ', '{% comment %}', ' ', '{% endcomment %}', ' '], Template.new.send(:tokenize, " {% comment %} {% endcomment %} ")
  end

  def test_instance_assigns_persist_on_same_template_object_between_parses
    t = Template.new
    assert_equal 'from instance assigns', t.parse("{% assign foo = 'from instance assigns' %}{{ foo }}").render


@@ -0,0 +1,64 @@
require 'test_helper'
class TokenizerTest < Test::Unit::TestCase
  def test_tokenize_strings
    assert_equal [' '], tokenize(' ')
    assert_equal ['hello world'], tokenize('hello world')
  end

  def test_tokenize_variables
    assert_equal ['{{funk}}'], tokenize('{{funk}}')
    assert_equal [' ', '{{funk}}', ' '], tokenize(' {{funk}} ')
    assert_equal [' ', '{{funk}}', ' ', '{{so}}', ' ', '{{brother}}', ' '], tokenize(' {{funk}} {{so}} {{brother}} ')
    assert_equal [' ', '{{ funk }}', ' '], tokenize(' {{ funk }} ')
  end

  def test_tokenize_blocks
    assert_equal ['{%comment%}'], tokenize('{%comment%}')
    assert_equal [' ', '{%comment%}', ' '], tokenize(' {%comment%} ')
    assert_equal [' ', '{%comment%}', ' ', '{%endcomment%}', ' '], tokenize(' {%comment%} {%endcomment%} ')
    assert_equal [' ', '{% comment %}', ' ', '{% endcomment %}', ' '], tokenize(" {% comment %} {% endcomment %} ")
  end

  def test_tokenize_incomplete_end
    assert_tokens 'before{{ incomplete }after', ['before', '{{ incomplete }', 'after']
    assert_tokens 'before{% incomplete %after', ['before', '{%', ' incomplete %after']
  end

  def test_tokenize_no_end
    assert_tokens 'before{{ unterminated ', ['before', '{{', ' unterminated ']
    assert_tokens 'before{% unterminated ', ['before', '{%', ' unterminated ']
  end

  private

  def assert_tokens(source, expected)
    assert_equal expected, tokenize(source)
    assert_equal expected, old_tokenize(source)
  end

  def tokenize(source)
    tokenizer = Liquid::Tokenizer.new(source)
    tokens = []
    while token = tokenizer.next
      tokens << token
    end
    tokens
  end

  AnyStartingTag = /\{\{|\{\%/
  VariableIncompleteEnd = /\}\}?/
  PartialTemplateParser = /#{Liquid::TagStart}.*?#{Liquid::TagEnd}|#{Liquid::VariableStart}.*?#{VariableIncompleteEnd}/o
  TemplateParser = /(#{PartialTemplateParser}|#{AnyStartingTag})/o

  def old_tokenize(source)
    return [] if source.to_s.empty?
    tokens = source.split(TemplateParser)
    # removes the rogue empty element at the beginning of the array
    tokens.shift if tokens[0] and tokens[0].empty?
    tokens
  end
end