Compare commits


1 Commit

Author:  Dylan Thacker-Smith
SHA1:    d3e4e4c419
Message: Implement tokenization in a C extension.
Date:    2014-03-26 03:20:34 -04:00
13 changed files with 271 additions and 34 deletions

.gitignore (vendored): 4 changes

@@ -6,3 +6,7 @@ pkg
 .rvmrc
 .ruby-version
 Gemfile.lock
+/ext/liquid/Makefile
+*.o
+*.bundle
+/tmp

Rakefile

@@ -75,3 +75,11 @@ desc "Run example"
 task :example do
   ruby "-w -d -Ilib example/server/server.rb"
 end
+
+if defined?(RUBY_ENGINE) && RUBY_ENGINE == 'ruby'
+  require 'rake/extensiontask'
+  Rake::ExtensionTask.new "liquid" do |ext|
+    ext.lib_dir = "lib/liquid"
+  end
+  Rake::Task[:test].prerequisites << :compile
+end
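
For orientation, a hand-rolled rough equivalent of what rake-compiler wires up here (a sketch only; the real :compile task also stages the build under tmp/ and handles platform-specific extension names):

task :compile do
  Dir.chdir("ext/liquid") do
    ruby "extconf.rb"  # generates the Makefile via mkmf
    sh "make"
    cp "liquid.bundle", "../../lib/liquid/"  # .bundle on macOS; .so elsewhere
  end
end
task :test => :compile  # mirrors Rake::Task[:test].prerequisites << :compile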

ext/liquid/extconf.rb (new file): 4 lines

@@ -0,0 +1,4 @@
require 'mkmf'
$CFLAGS << ' -Wall -Werror'
$warnflags.gsub!(/-Wdeclaration-after-statement/, "")
create_makefile("liquid/liquid")

ext/liquid/liquid.c (new file): 9 lines

@@ -0,0 +1,9 @@
#include "liquid.h"

VALUE mLiquid;

void Init_liquid(void)
{
	mLiquid = rb_define_module("Liquid");
	init_liquid_tokenizer();
}

ext/liquid/liquid.h (new file): 11 lines

@@ -0,0 +1,11 @@
#ifndef LIQUID_H
#define LIQUID_H

#include <ruby.h>
#include <stdbool.h>

#include "tokenizer.h"

extern VALUE mLiquid;

#endif

ext/liquid/tokenizer.c (new file): 137 lines

@@ -0,0 +1,137 @@
#include "liquid.h"

VALUE cLiquidTokenizer;

static void tokenizer_mark(void *ptr)
{
	tokenizer_t *tokenizer = ptr;
	rb_gc_mark(tokenizer->source);
}

static void tokenizer_free(void *ptr)
{
	tokenizer_t *tokenizer = ptr;
	xfree(tokenizer);
}

static size_t tokenizer_memsize(const void *ptr)
{
	return ptr ? sizeof(tokenizer_t) : 0;
}

const rb_data_type_t tokenizer_data_type = {
	"liquid_tokenizer",
	{tokenizer_mark, tokenizer_free, tokenizer_memsize,},
#ifdef RUBY_TYPED_FREE_IMMEDIATELY
	NULL, NULL, RUBY_TYPED_FREE_IMMEDIATELY
#endif
};

static VALUE tokenizer_allocate(VALUE klass)
{
	VALUE obj;
	tokenizer_t *tokenizer;

	obj = TypedData_Make_Struct(klass, tokenizer_t, &tokenizer_data_type, tokenizer);
	tokenizer->source = Qnil;
	return obj;
}

static VALUE tokenizer_initialize_method(VALUE self, VALUE source)
{
	tokenizer_t *tokenizer;

	Check_Type(source, T_STRING);
	Tokenizer_Get_Struct(self, tokenizer);
	/* Work on a frozen copy so the buffer cannot be mutated out
	 * from under the cursor/length pointers below. */
	source = rb_str_dup_frozen(source);
	tokenizer->source = source;
	tokenizer->cursor = RSTRING_PTR(source);
	tokenizer->length = RSTRING_LEN(source);
	return Qnil;
}

void tokenizer_next(tokenizer_t *tokenizer, token_t *token)
{
	if (tokenizer->length <= 0) {
		/* Source exhausted: zeroing yields TOKEN_NONE (enum value 0). */
		memset(token, 0, sizeof(*token));
		return;
	}

	const char *cursor = tokenizer->cursor;
	const char *last = cursor + tokenizer->length - 1;

	token->str = cursor;
	token->type = TOKEN_STRING;

	while (cursor < last) {
		if (*cursor++ != '{')
			continue;

		char c = *cursor++;
		if (c != '%' && c != '{')
			continue;
		if (cursor - tokenizer->cursor > 2) {
			/* Literal text precedes this "{%" or "{{": emit it as a
			 * string token first; the markup is rescanned next call. */
			token->type = TOKEN_STRING;
			cursor -= 2;
			goto found;
		}
		token->type = TOKEN_INVALID;
		if (c == '%') {
			/* Tag: scan for the closing "%}". */
			while (cursor < last) {
				if (*cursor++ != '%')
					continue;
				c = *cursor++;
				while (c == '%' && cursor <= last)
					c = *cursor++;
				if (c != '}')
					continue;
				token->type = TOKEN_TAG;
				goto found;
			}
			// unterminated tag
			cursor = tokenizer->cursor + 2;
			goto found;
		} else {
			/* Variable: scan for the closing "}}". */
			while (cursor < last) {
				if (*cursor++ != '}')
					continue;
				if (*cursor++ != '}') {
					// variable incomplete end, used to end raw tags
					cursor--;
					goto found;
				}
				token->type = TOKEN_VARIABLE;
				goto found;
			}
			// unterminated variable
			cursor = tokenizer->cursor + 2;
			goto found;
		}
	}
	cursor = last + 1;
found:
	token->length = cursor - tokenizer->cursor;
	tokenizer->cursor += token->length;
	tokenizer->length -= token->length;
}

static VALUE tokenizer_next_method(VALUE self)
{
	tokenizer_t *tokenizer;
	token_t token;

	Tokenizer_Get_Struct(self, tokenizer);
	tokenizer_next(tokenizer, &token);
	if (token.type == TOKEN_NONE)
		return Qnil;
	return rb_str_new(token.str, token.length);
}

void init_liquid_tokenizer()
{
	cLiquidTokenizer = rb_define_class_under(mLiquid, "Tokenizer", rb_cObject);
	rb_define_alloc_func(cLiquidTokenizer, tokenizer_allocate);
	rb_define_method(cLiquidTokenizer, "initialize", tokenizer_initialize_method, 1);
	rb_define_method(cLiquidTokenizer, "next", tokenizer_next_method, 0);
	rb_define_alias(cLiquidTokenizer, "shift", "next");
}
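
To make the scanning behaviour concrete, a short usage sketch from Ruby (assuming the extension has been compiled and loaded; the expected output is inferred from the logic above, not shown in this diff):

require 'liquid/liquid'  # the compiled extension (MRI only)

t = Liquid::Tokenizer.new("Hello {{ name }}!{% assign a = 1 %}")
while token = t.next
  p token
end
# prints: "Hello ", "{{ name }}", "!", "{% assign a = 1 %}"
# #next returns nil once the source is exhausted (TOKEN_NONE).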

ext/liquid/tokenizer.h (new file): 31 lines

@@ -0,0 +1,31 @@
#ifndef LIQUID_TOKENIZER_H
#define LIQUID_TOKENIZER_H

enum token_type {
	TOKEN_NONE,
	TOKEN_INVALID,
	TOKEN_STRING,
	TOKEN_TAG,
	TOKEN_VARIABLE
};

typedef struct token {
	enum token_type type;
	const char *str;
	long length;
} token_t;

typedef struct tokenizer {
	VALUE source;
	const char *cursor;
	long length;
} tokenizer_t;

extern VALUE cLiquidTokenizer;
extern const rb_data_type_t tokenizer_data_type;

#define Tokenizer_Get_Struct(obj, sval) TypedData_Get_Struct(obj, tokenizer_t, &tokenizer_data_type, sval)

void init_liquid_tokenizer();
void tokenizer_next(tokenizer_t *tokenizer, token_t *token);

#endif
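
How inputs map onto the token_type enum, seen from Ruby (a hedged illustration; Ruby callers only ever receive the token string, the type stays internal to the C layer):

Liquid::Tokenizer.new("plain text").shift  # => "plain text"  (TOKEN_STRING)
Liquid::Tokenizer.new("{{ a }}").shift     # => "{{ a }}"     (TOKEN_VARIABLE)
Liquid::Tokenizer.new("{% if %}").shift    # => "{% if %}"    (TOKEN_TAG)
Liquid::Tokenizer.new("{{ a").shift        # => "{{"          (TOKEN_INVALID: unterminated)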

lib/liquid.rb

@@ -30,13 +30,9 @@ module Liquid
   VariableSegment = /[\w\-]/
   VariableStart = /\{\{/
   VariableEnd = /\}\}/
-  VariableIncompleteEnd = /\}\}?/
   QuotedString = /"[^"]*"|'[^']*'/
   QuotedFragment = /#{QuotedString}|(?:[^\s,\|'"]|#{QuotedString})+/o
   TagAttributes = /(\w+)\s*\:\s*(#{QuotedFragment})/o
-  AnyStartingTag = /\{\{|\{\%/
-  PartialTemplateParser = /#{TagStart}.*?#{TagEnd}|#{VariableStart}.*?#{VariableIncompleteEnd}/om
-  TemplateParser = /(#{PartialTemplateParser}|#{AnyStartingTag})/om
   VariableParser = /\[[^\]]+\]|#{VariableSegment}+\??/o
 end
@@ -64,3 +60,9 @@ require 'liquid/utils'
 # Load all the tags of the standard library
 #
 Dir[File.dirname(__FILE__) + '/liquid/tags/*.rb'].each { |f| require f }
+
+if defined?(RUBY_ENGINE) && RUBY_ENGINE == 'ruby'
+  require 'liquid/liquid'
+else
+  require 'liquid/tokenizer'
+end
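
The guard in plain terms (illustrative values; the defined? check keeps Ruby 1.8, which lacks RUBY_ENGINE, on the pure-Ruby path):

# MRI 1.9+  ->  RUBY_ENGINE == 'ruby'   ->  require 'liquid/liquid' (C ext)
# JRuby     ->  RUBY_ENGINE == 'jruby'  ->  require 'liquid/tokenizer'
# MRI 1.8   ->  RUBY_ENGINE undefined   ->  require 'liquid/tokenizer'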

lib/liquid/template.rb

@@ -162,16 +162,9 @@ module Liquid
   private

-    # Uses the <tt>Liquid::TemplateParser</tt> regexp to tokenize the passed source
     def tokenize(source)
       source = source.source if source.respond_to?(:source)
-      return [] if source.to_s.empty?
-      tokens = source.split(TemplateParser)
-
-      # removes the rogue empty element at the beginning of the array
-      tokens.shift if tokens[0] and tokens[0].empty?
-
-      tokens
+      Tokenizer.new(source.to_s)
     end
 end
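
The behavioural change in one console sketch (#tokenize is private, hence the send; either tokenizer class responds to shift):

tokens = Liquid::Template.new.send(:tokenize, "{{ a }} b")
tokens.shift  # => "{{ a }}"
tokens.shift  # => " b"
tokens.shift  # => nil, matching what Array#shift returned before

Note the dropped early return for empty sources: an empty string now yields a tokenizer whose first shift simply returns nil, so parsing loops keep working.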

lib/liquid/tokenizer.rb (new file): 20 lines

@@ -0,0 +1,20 @@
module Liquid
  class Tokenizer
    VariableIncompleteEnd = /\}\}?/
    AnyStartingTag = /\{\{|\{\%/
    PartialTemplateParser = /#{TagStart}.*?#{TagEnd}|#{VariableStart}.*?#{VariableIncompleteEnd}/om
    TemplateParser = /(#{PartialTemplateParser}|#{AnyStartingTag})/om

    def initialize(source)
      @tokens = source.split(TemplateParser)
      # removes the rogue empty element at the beginning of the array
      @tokens.shift if @tokens[0] && @tokens[0].empty?
    end

    def next
      @tokens.shift
    end
    alias_method :shift, :next
  end
end
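
What the split-based fallback does, sketched at a console (assumes the pure-Ruby class is the one loaded, i.e. a non-MRI engine): the capture group wrapping TemplateParser is what makes String#split keep the delimiters as tokens.

"a {{ b }} c".split(Liquid::Tokenizer::TemplateParser)
# => ["a ", "{{ b }}", " c"]
"{{ b }} c".split(Liquid::Tokenizer::TemplateParser)
# => ["", "{{ b }}", " c"]   <- the rogue empty element stripped by #initialize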

liquid.gemspec

@@ -18,13 +18,17 @@ Gem::Specification.new do |s|
   s.required_rubygems_version = ">= 1.3.7"

   s.test_files = Dir.glob("{test}/**/*")
-  s.files = Dir.glob("{lib}/**/*") + %w(MIT-LICENSE README.md)
+  s.files = Dir.glob("{lib,ext}/**/*") + %w(MIT-LICENSE README.md)
   s.extra_rdoc_files = ["History.md", "README.md"]
   s.require_path = "lib"

-  s.add_development_dependency 'stackprof' if Gem::Version.new(RUBY_VERSION) >= Gem::Version.new("2.1.0")
   s.add_development_dependency 'rake'
   s.add_development_dependency 'activesupport'
+
+  if defined?(RUBY_ENGINE) && RUBY_ENGINE == 'ruby'
+    s.extensions = ['ext/liquid/extconf.rb']
+    s.add_development_dependency 'rake-compiler'
+    s.add_development_dependency 'stackprof' if Gem::Version.new(RUBY_VERSION) >= Gem::Version.new("2.1.0")
+  end
 end

test/liquid/template_test.rb

@@ -25,26 +25,6 @@ end
 class TemplateTest < Test::Unit::TestCase
   include Liquid

-  def test_tokenize_strings
-    assert_equal [' '], Template.new.send(:tokenize, ' ')
-    assert_equal ['hello world'], Template.new.send(:tokenize, 'hello world')
-  end
-
-  def test_tokenize_variables
-    assert_equal ['{{funk}}'], Template.new.send(:tokenize, '{{funk}}')
-    assert_equal [' ', '{{funk}}', ' '], Template.new.send(:tokenize, ' {{funk}} ')
-    assert_equal [' ', '{{funk}}', ' ', '{{so}}', ' ', '{{brother}}', ' '], Template.new.send(:tokenize, ' {{funk}} {{so}} {{brother}} ')
-    assert_equal [' ', '{{ funk }}', ' '], Template.new.send(:tokenize, ' {{ funk }} ')
-  end
-
-  def test_tokenize_blocks
-    assert_equal ['{%comment%}'], Template.new.send(:tokenize, '{%comment%}')
-    assert_equal [' ', '{%comment%}', ' '], Template.new.send(:tokenize, ' {%comment%} ')
-    assert_equal [' ', '{%comment%}', ' ', '{%endcomment%}', ' '], Template.new.send(:tokenize, ' {%comment%} {%endcomment%} ')
-    assert_equal [' ', '{% comment %}', ' ', '{% endcomment %}', ' '], Template.new.send(:tokenize, " {% comment %} {% endcomment %} ")
-  end
-
   def test_instance_assigns_persist_on_same_template_object_between_parses
     t = Template.new
     assert_equal 'from instance assigns', t.parse("{% assign foo = 'from instance assigns' %}{{ foo }}").render!

test/liquid/tokenizer_test.rb (new file): 34 lines

@@ -0,0 +1,34 @@
require 'test_helper'

class TokenizerTest < Test::Unit::TestCase
  def test_tokenize_strings
    assert_equal [' '], tokenize(' ')
    assert_equal ['hello world'], tokenize('hello world')
  end

  def test_tokenize_variables
    assert_equal ['{{funk}}'], tokenize('{{funk}}')
    assert_equal [' ', '{{funk}}', ' '], tokenize(' {{funk}} ')
    assert_equal [' ', '{{funk}}', ' ', '{{so}}', ' ', '{{brother}}', ' '], tokenize(' {{funk}} {{so}} {{brother}} ')
    assert_equal [' ', '{{ funk }}', ' '], tokenize(' {{ funk }} ')
  end

  def test_tokenize_blocks
    assert_equal ['{%comment%}'], tokenize('{%comment%}')
    assert_equal [' ', '{%comment%}', ' '], tokenize(' {%comment%} ')
    assert_equal [' ', '{%comment%}', ' ', '{%endcomment%}', ' '], tokenize(' {%comment%} {%endcomment%} ')
    assert_equal [' ', '{% comment %}', ' ', '{% endcomment %}', ' '], tokenize(" {% comment %} {% endcomment %} ")
  end

  private

  def tokenize(source)
    tokenizer = Liquid::Tokenizer.new(source)
    tokens = []
    while token = tokenizer.next
      tokens << token
    end
    tokens
  end
end