Mirror of https://github.com/kemko/liquid.git (synced 2026-01-02 00:05:42 +03:00)

Compare commits: pz-fix-tag...c-tokenize (1 commit)
| Author | SHA1 | Date |
|---|---|---|
|  | d3e4e4c419 |  |
.gitignore | 4 (vendored)

@@ -6,3 +6,7 @@ pkg
 .rvmrc
 .ruby-version
 Gemfile.lock
+/ext/liquid/Makefile
+*.o
+*.bundle
+/tmp
Rakefile | 8

@@ -75,3 +75,11 @@ desc "Run example"
 task :example do
   ruby "-w -d -Ilib example/server/server.rb"
 end
+
+if defined?(RUBY_ENGINE) && RUBY_ENGINE == 'ruby'
+  require 'rake/extensiontask'
+  Rake::ExtensionTask.new "liquid" do |ext|
+    ext.lib_dir = "lib/liquid"
+  end
+  Rake::Task[:test].prerequisites << :compile
+end
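With rake-compiler wired in this way, rake compile builds the extension into lib/liquid, and adding :compile to the :test prerequisites means a plain rake test rebuilds the extension first; the RUBY_ENGINE guard keeps the task graph unchanged on non-MRI engines.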
ext/liquid/extconf.rb | 4 (new file)

@@ -0,0 +1,4 @@
+require 'mkmf'
+$CFLAGS << ' -Wall -Werror'
+$warnflags.gsub!(/-Wdeclaration-after-statement/, "")
+create_makefile("liquid/liquid")
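One note on the gsub: MRI's default $warnflags include -Wdeclaration-after-statement, which would turn the mid-function declarations in tokenizer.c into errors under -Wall -Werror, so the flag is stripped before create_makefile writes the Makefile.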
ext/liquid/liquid.c | 9 (new file)

@@ -0,0 +1,9 @@
+#include "liquid.h"
+
+VALUE mLiquid;
+
+void Init_liquid(void)
+{
+    mLiquid = rb_define_module("Liquid");
+    init_liquid_tokenizer();
+}
ext/liquid/liquid.h | 11 (new file)

@@ -0,0 +1,11 @@
+#ifndef LIQUID_H
+#define LIQUID_H
+
+#include <ruby.h>
+#include <stdbool.h>
+
+#include "tokenizer.h"
+
+extern VALUE mLiquid;
+
+#endif
ext/liquid/tokenizer.c | 137 (new file)

@@ -0,0 +1,137 @@
+#include "liquid.h"
+
+VALUE cLiquidTokenizer;
+
+static void tokenizer_mark(void *ptr) {
+    tokenizer_t *tokenizer = ptr;
+    rb_gc_mark(tokenizer->source);
+}
+
+static void tokenizer_free(void *ptr)
+{
+    tokenizer_t *tokenizer = ptr;
+    xfree(tokenizer);
+}
+
+static size_t tokenizer_memsize(const void *ptr)
+{
+    return ptr ? sizeof(tokenizer_t) : 0;
+}
+
+const rb_data_type_t tokenizer_data_type = {
+    "liquid_tokenizer",
+    {tokenizer_mark, tokenizer_free, tokenizer_memsize,},
+#ifdef RUBY_TYPED_FREE_IMMEDIATELY
+    NULL, NULL, RUBY_TYPED_FREE_IMMEDIATELY
+#endif
+};
+
+static VALUE tokenizer_allocate(VALUE klass)
+{
+    VALUE obj;
+    tokenizer_t *tokenizer;
+
+    obj = TypedData_Make_Struct(klass, tokenizer_t, &tokenizer_data_type, tokenizer);
+    tokenizer->source = Qnil;
+    return obj;
+}
+
+static VALUE tokenizer_initialize_method(VALUE self, VALUE source)
+{
+    tokenizer_t *tokenizer;
+
+    Check_Type(source, T_STRING);
+    Tokenizer_Get_Struct(self, tokenizer);
+    source = rb_str_dup_frozen(source);
+    tokenizer->source = source;
+    tokenizer->cursor = RSTRING_PTR(source);
+    tokenizer->length = RSTRING_LEN(source);
+    return Qnil;
+}
+
+void tokenizer_next(tokenizer_t *tokenizer, token_t *token)
+{
+    if (tokenizer->length <= 0) {
+        memset(token, 0, sizeof(*token));
+        return;
+    }
+
+    const char *cursor = tokenizer->cursor;
+    const char *last = cursor + tokenizer->length - 1;
+
+    token->str = cursor;
+    token->type = TOKEN_STRING;
+
+    while (cursor < last) {
+        if (*cursor++ != '{')
+            continue;
+
+        char c = *cursor++;
+        if (c != '%' && c != '{')
+            continue;
+        if (cursor - tokenizer->cursor > 2) {
+            token->type = TOKEN_STRING;
+            cursor -= 2;
+            goto found;
+        }
+        token->type = TOKEN_INVALID;
+        if (c == '%') {
+            while (cursor < last) {
+                if (*cursor++ != '%')
+                    continue;
+                c = *cursor++;
+                while (c == '%' && cursor <= last)
+                    c = *cursor++;
+                if (c != '}')
+                    continue;
+                token->type = TOKEN_TAG;
+                goto found;
+            }
+            // unterminated tag
+            cursor = tokenizer->cursor + 2;
+            goto found;
+        } else {
+            while (cursor < last) {
+                if (*cursor++ != '}')
+                    continue;
+                if (*cursor++ != '}') {
+                    // variable incomplete end, used to end raw tags
+                    cursor--;
+                    goto found;
+                }
+                token->type = TOKEN_VARIABLE;
+                goto found;
+            }
+            // unterminated variable
+            cursor = tokenizer->cursor + 2;
+            goto found;
+        }
+    }
+    cursor = last + 1;
+found:
+    token->length = cursor - tokenizer->cursor;
+    tokenizer->cursor += token->length;
+    tokenizer->length -= token->length;
+}
+
+static VALUE tokenizer_next_method(VALUE self)
+{
+    tokenizer_t *tokenizer;
+    Tokenizer_Get_Struct(self, tokenizer);
+
+    token_t token;
+    tokenizer_next(tokenizer, &token);
+    if (token.type == TOKEN_NONE)
+        return Qnil;
+
+    return rb_str_new(token.str, token.length);
+}
+
+void init_liquid_tokenizer()
+{
+    cLiquidTokenizer = rb_define_class_under(mLiquid, "Tokenizer", rb_cObject);
+    rb_define_alloc_func(cLiquidTokenizer, tokenizer_allocate);
+    rb_define_method(cLiquidTokenizer, "initialize", tokenizer_initialize_method, 1);
+    rb_define_method(cLiquidTokenizer, "next", tokenizer_next_method, 0);
+    rb_define_alias(cLiquidTokenizer, "shift", "next");
+}
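For a feel of the new class, here is a quick sketch of driving it from Ruby (assuming the extension has been built, e.g. via rake compile); the expected output follows from the scanning rules above and matches the regex-based behaviour exercised in the tests below:

    require 'liquid'

    tokenizer = Liquid::Tokenizer.new(' {{ funk }} {%comment%}')
    while token = tokenizer.next
      p token
    end
    # prints: " ", "{{ funk }}", " ", "{%comment%}" (a further next returns nil)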
ext/liquid/tokenizer.h | 31 (new file)

@@ -0,0 +1,31 @@
+#ifndef LIQUID_TOKENIZER_H
+#define LIQUID_TOKENIZER_H
+
+enum token_type {
+    TOKEN_NONE,
+    TOKEN_INVALID,
+    TOKEN_STRING,
+    TOKEN_TAG,
+    TOKEN_VARIABLE
+};
+
+typedef struct token {
+    enum token_type type;
+    const char *str;
+    long length;
+} token_t;
+
+typedef struct tokenizer {
+    VALUE source;
+    const char *cursor;
+    long length;
+} tokenizer_t;
+
+extern VALUE cLiquidTokenizer;
+extern const rb_data_type_t tokenizer_data_type;
+#define Tokenizer_Get_Struct(obj, sval) TypedData_Get_Struct(obj, tokenizer_t, &tokenizer_data_type, sval)
+
+void init_liquid_tokenizer();
+void tokenizer_next(tokenizer_t *tokenizer, token_t *token);
+
+#endif
lib/liquid.rb | 10

@@ -30,13 +30,9 @@ module Liquid
   VariableSegment                   = /[\w\-]/
   VariableStart                     = /\{\{/
   VariableEnd                       = /\}\}/
-  VariableIncompleteEnd             = /\}\}?/
   QuotedString                      = /"[^"]*"|'[^']*'/
   QuotedFragment                    = /#{QuotedString}|(?:[^\s,\|'"]|#{QuotedString})+/o
   TagAttributes                     = /(\w+)\s*\:\s*(#{QuotedFragment})/o
-  AnyStartingTag                    = /\{\{|\{\%/
-  PartialTemplateParser             = /#{TagStart}.*?#{TagEnd}|#{VariableStart}.*?#{VariableIncompleteEnd}/om
-  TemplateParser                    = /(#{PartialTemplateParser}|#{AnyStartingTag})/om
   VariableParser                    = /\[[^\]]+\]|#{VariableSegment}+\??/o
 end
 
@@ -64,3 +60,9 @@ require 'liquid/utils'
 # Load all the tags of the standard library
 #
 Dir[File.dirname(__FILE__) + '/liquid/tags/*.rb'].each { |f| require f }
+
+if defined?(RUBY_ENGINE) && RUBY_ENGINE == 'ruby'
+  require 'liquid/liquid'
+else
+  require 'liquid/tokenizer'
+end
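RUBY_ENGINE is 'ruby' only on MRI (and the constant does not exist before Ruby 1.9), so alternative engines such as JRuby or Rubinius skip the C extension here and load the pure-Ruby lib/liquid/tokenizer.rb instead.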
lib/liquid/template.rb | 9

@@ -162,16 +162,9 @@ module Liquid
 
     private
 
-      # Uses the <tt>Liquid::TemplateParser</tt> regexp to tokenize the passed source
       def tokenize(source)
         source = source.source if source.respond_to?(:source)
-        return [] if source.to_s.empty?
-        tokens = source.split(TemplateParser)
-
-        # removes the rogue empty element at the beginning of the array
-        tokens.shift if tokens[0] and tokens[0].empty?
-
-        tokens
+        Tokenizer.new(source.to_s)
       end
 
 end
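Since both tokenizer implementations respond to next and shift, parsing code that previously shifted tokens off an array should keep working unchanged. A minimal sketch of the new return value (hypothetical usage; tokenize is private, hence send, as in the old tests):

    t = Liquid::Template.new
    tokens = t.send(:tokenize, 'hello {{name}}')
    tokens.shift # => "hello "
    tokens.shift # => "{{name}}"
    tokens.shift # => nil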
lib/liquid/tokenizer.rb | 20 (new file)

@@ -0,0 +1,20 @@
+module Liquid
+  class Tokenizer
+    VariableIncompleteEnd = /\}\}?/
+    AnyStartingTag = /\{\{|\{\%/
+    PartialTemplateParser = /#{TagStart}.*?#{TagEnd}|#{VariableStart}.*?#{VariableIncompleteEnd}/om
+    TemplateParser = /(#{PartialTemplateParser}|#{AnyStartingTag})/om
+
+    def initialize(source)
+      @tokens = source.split(TemplateParser)
+
+      # removes the rogue empty element at the beginning of the array
+      @tokens.shift if @tokens[0] && @tokens[0].empty?
+    end
+
+    def next
+      @tokens.shift
+    end
+    alias_method :shift, :next
+  end
+end
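The "rogue empty element" guard is needed because String#split yields a leading empty string whenever the source starts with a delimiter match; the C tokenizer never emits that artifact. With the pure-Ruby class loaded:

    '{{funk}}'.split(Liquid::Tokenizer::TemplateParser)
    # => ["", "{{funk}}"]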
liquid.gemspec | 8

@@ -18,13 +18,17 @@ Gem::Specification.new do |s|
   s.required_rubygems_version = ">= 1.3.7"
 
   s.test_files = Dir.glob("{test}/**/*")
-  s.files = Dir.glob("{lib}/**/*") + %w(MIT-LICENSE README.md)
+  s.files = Dir.glob("{lib,ext}/**/*") + %w(MIT-LICENSE README.md)
 
   s.extra_rdoc_files = ["History.md", "README.md"]
 
   s.require_path = "lib"
 
-  s.add_development_dependency 'stackprof' if Gem::Version.new(RUBY_VERSION) >= Gem::Version.new("2.1.0")
   s.add_development_dependency 'rake'
   s.add_development_dependency 'activesupport'
+  if defined?(RUBY_ENGINE) && RUBY_ENGINE == 'ruby'
+    s.extensions = ['ext/liquid/extconf.rb']
+    s.add_development_dependency 'rake-compiler'
+    s.add_development_dependency 'stackprof' if Gem::Version.new(RUBY_VERSION) >= Gem::Version.new("2.1.0")
+  end
 end
test/liquid/template_test.rb | 20

@@ -25,26 +25,6 @@ end
 class TemplateTest < Test::Unit::TestCase
   include Liquid
 
-  def test_tokenize_strings
-    assert_equal [' '], Template.new.send(:tokenize, ' ')
-    assert_equal ['hello world'], Template.new.send(:tokenize, 'hello world')
-  end
-
-  def test_tokenize_variables
-    assert_equal ['{{funk}}'], Template.new.send(:tokenize, '{{funk}}')
-    assert_equal [' ', '{{funk}}', ' '], Template.new.send(:tokenize, ' {{funk}} ')
-    assert_equal [' ', '{{funk}}', ' ', '{{so}}', ' ', '{{brother}}', ' '], Template.new.send(:tokenize, ' {{funk}} {{so}} {{brother}} ')
-    assert_equal [' ', '{{ funk }}', ' '], Template.new.send(:tokenize, ' {{ funk }} ')
-  end
-
-  def test_tokenize_blocks
-    assert_equal ['{%comment%}'], Template.new.send(:tokenize, '{%comment%}')
-    assert_equal [' ', '{%comment%}', ' '], Template.new.send(:tokenize, ' {%comment%} ')
-
-    assert_equal [' ', '{%comment%}', ' ', '{%endcomment%}', ' '], Template.new.send(:tokenize, ' {%comment%} {%endcomment%} ')
-    assert_equal [' ', '{% comment %}', ' ', '{% endcomment %}', ' '], Template.new.send(:tokenize, " {% comment %} {% endcomment %} ")
-  end
-
   def test_instance_assigns_persist_on_same_template_object_between_parses
     t = Template.new
     assert_equal 'from instance assigns', t.parse("{% assign foo = 'from instance assigns' %}{{ foo }}").render!
test/liquid/tokenizer_test.rb | 34 (new file)

@@ -0,0 +1,34 @@
+require 'test_helper'
+
+class TokenizerTest < Test::Unit::TestCase
+  def test_tokenize_strings
+    assert_equal [' '], tokenize(' ')
+    assert_equal ['hello world'], tokenize('hello world')
+  end
+
+  def test_tokenize_variables
+    assert_equal ['{{funk}}'], tokenize('{{funk}}')
+    assert_equal [' ', '{{funk}}', ' '], tokenize(' {{funk}} ')
+    assert_equal [' ', '{{funk}}', ' ', '{{so}}', ' ', '{{brother}}', ' '], tokenize(' {{funk}} {{so}} {{brother}} ')
+    assert_equal [' ', '{{ funk }}', ' '], tokenize(' {{ funk }} ')
+  end
+
+  def test_tokenize_blocks
+    assert_equal ['{%comment%}'], tokenize('{%comment%}')
+    assert_equal [' ', '{%comment%}', ' '], tokenize(' {%comment%} ')
+
+    assert_equal [' ', '{%comment%}', ' ', '{%endcomment%}', ' '], tokenize(' {%comment%} {%endcomment%} ')
+    assert_equal [' ', '{% comment %}', ' ', '{% endcomment %}', ' '], tokenize(" {% comment %} {% endcomment %} ")
+  end
+
+  private
+
+  def tokenize(source)
+    tokenizer = Liquid::Tokenizer.new(source)
+    tokens = []
+    while token = tokenizer.next
+      tokens << token
+    end
+    tokens
+  end
+end