From 87472e73b6d36c6650f6a34154d35eabd856699a Mon Sep 17 00:00:00 2001
From: Dylan Thacker-Smith
Date: Thu, 27 Feb 2014 09:53:17 -0500
Subject: [PATCH] Implement tokenization in a C extension.

---
 .gitignore                    |   2 +
 Rakefile                      |   6 ++
 ext/liquid/extconf.rb         |   3 +
 ext/liquid/liquid_ext.c       |   9 +++
 ext/liquid/liquid_ext.h       |   8 +++
 ext/liquid/tokenizer.c        | 114 ++++++++++++++++++++++++++++++++++
 ext/liquid/tokenizer.h        |  26 ++++++++
 lib/liquid.rb                 |   5 +-
 lib/liquid/template.rb        |   9 +--
 liquid.gemspec                |   8 ++-
 test/liquid/template_test.rb  |  20 ------
 test/liquid/tokenizer_test.rb |  64 +++++++++++++++++++
 12 files changed, 240 insertions(+), 34 deletions(-)
 create mode 100644 ext/liquid/extconf.rb
 create mode 100644 ext/liquid/liquid_ext.c
 create mode 100644 ext/liquid/liquid_ext.h
 create mode 100644 ext/liquid/tokenizer.c
 create mode 100644 ext/liquid/tokenizer.h
 create mode 100644 test/liquid/tokenizer_test.rb

diff --git a/.gitignore b/.gitignore
index 0e27775..df96f4e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,5 @@ pkg
 .rvmrc
 .ruby-version
 Gemfile.lock
+*.bundle
+/tmp
diff --git a/Rakefile b/Rakefile
index cd69268..5615265 100755
--- a/Rakefile
+++ b/Rakefile
@@ -1,5 +1,6 @@
 require 'rake'
 require 'rake/testtask'
+require 'rake/extensiontask'
 
 $LOAD_PATH.unshift File.expand_path("../lib", __FILE__)
 require "liquid/version"
@@ -75,3 +76,8 @@ desc "Run example"
 task :example do
   ruby "-w -d -Ilib example/server/server.rb"
 end
+
+Rake::ExtensionTask.new "liquid" do |ext|
+  ext.lib_dir = "lib/liquid"
+end
+Rake::Task[:test].prerequisites << :compile
diff --git a/ext/liquid/extconf.rb b/ext/liquid/extconf.rb
new file mode 100644
index 0000000..824777e
--- /dev/null
+++ b/ext/liquid/extconf.rb
@@ -0,0 +1,3 @@
+require 'mkmf'
+$CFLAGS << ' -Wall'
+create_makefile("liquid/liquid")
diff --git a/ext/liquid/liquid_ext.c b/ext/liquid/liquid_ext.c
new file mode 100644
index 0000000..5dab635
--- /dev/null
+++ b/ext/liquid/liquid_ext.c
@@ -0,0 +1,9 @@
+#include "liquid_ext.h"
+
+VALUE mLiquid;
+
+void Init_liquid(void)
+{
+    mLiquid = rb_define_module("Liquid");
+    init_liquid_tokenizer();
+}
diff --git a/ext/liquid/liquid_ext.h b/ext/liquid/liquid_ext.h
new file mode 100644
index 0000000..065a90f
--- /dev/null
+++ b/ext/liquid/liquid_ext.h
@@ -0,0 +1,8 @@
+#ifndef LIQUID_EXT_H
+#define LIQUID_EXT_H
+
+#include <ruby.h>
+#include <string.h>
+#include "tokenizer.h"
+
+#endif
diff --git a/ext/liquid/tokenizer.c b/ext/liquid/tokenizer.c
new file mode 100644
index 0000000..26e2368
--- /dev/null
+++ b/ext/liquid/tokenizer.c
@@ -0,0 +1,114 @@
+#include "liquid_ext.h"
+
+VALUE cLiquidTokenizer;
+extern VALUE mLiquid;
+
+static void free_tokenizer(void *ptr)
+{
+    struct liquid_tokenizer *tokenizer = ptr;
+    xfree(tokenizer);
+}
+
+static VALUE rb_allocate(VALUE klass)
+{
+    VALUE obj;
+    struct liquid_tokenizer *tokenizer;
+
+    obj = Data_Make_Struct(klass, struct liquid_tokenizer, NULL, free_tokenizer, tokenizer);
+    return obj;
+}
+
+static VALUE rb_initialize(VALUE self, VALUE source)
+{
+    struct liquid_tokenizer *tokenizer;
+
+    Check_Type(source, T_STRING);
+    Data_Get_Struct(self, struct liquid_tokenizer, tokenizer);
+    tokenizer->cursor = RSTRING_PTR(source);
+    tokenizer->length = RSTRING_LEN(source);
+    return Qnil;
+}
+
+void liquid_tokenizer_next(struct liquid_tokenizer *tokenizer, struct token *token)
+{
+    if (tokenizer->length <= 0) {
+        memset(token, 0, sizeof(*token));
+        return;
+    }
+    token->type = TOKEN_STRING;
+
+    char *cursor = tokenizer->cursor;
+    char *last = tokenizer->cursor + tokenizer->length - 1;
+
+    while (cursor < last) {
+        if (*cursor++ != '{')
+            continue;
+
+        char c = *cursor++;
+        if (c != '%' && c != '{')
+            continue;
+        if (cursor - tokenizer->cursor > 2) {
+            token->type = TOKEN_STRING;
+            cursor -= 2;
+            goto found;
+        }
+        char *incomplete_end = cursor;
+        token->type = TOKEN_INVALID;
+        if (c == '%') {
+            while (cursor < last) {
+                if (*cursor++ != '%')
+                    continue;
+                c = *cursor++;
+                while (c == '%' && cursor <= last)
+                    c = *cursor++;
+                if (c != '}')
+                    continue;
+                token->type = TOKEN_TAG;
+                goto found;
+            }
+            cursor = incomplete_end;
+            goto found;
+        } else {
+            while (cursor < last) {
+                if (*cursor++ != '}')
+                    continue;
+                if (*cursor++ != '}') {
+                    incomplete_end = cursor - 1;
+                    continue;
+                }
+                token->type = TOKEN_VARIABLE;
+                goto found;
+            }
+            cursor = incomplete_end;
+            goto found;
+        }
+    }
+    cursor = last + 1;
+found:
+    token->str = tokenizer->cursor;
+    token->length = cursor - tokenizer->cursor;
+    tokenizer->cursor += token->length;
+    tokenizer->length -= token->length;
+}
+
+static VALUE rb_next(VALUE self)
+{
+    struct liquid_tokenizer *tokenizer;
+    Data_Get_Struct(self, struct liquid_tokenizer, tokenizer);
+
+    struct token token;
+    liquid_tokenizer_next(tokenizer, &token);
+    if (token.type == TOKEN_NONE)
+        return Qnil;
+
+    return rb_str_new(token.str, token.length);
+}
+
+void init_liquid_tokenizer()
+{
+    cLiquidTokenizer = rb_define_class_under(mLiquid, "Tokenizer", rb_cObject);
+    rb_define_alloc_func(cLiquidTokenizer, rb_allocate);
+    rb_define_method(cLiquidTokenizer, "initialize", rb_initialize, 1);
+    rb_define_method(cLiquidTokenizer, "next", rb_next, 0);
+    rb_define_alias(cLiquidTokenizer, "shift", "next");
+}
diff --git a/ext/liquid/tokenizer.h b/ext/liquid/tokenizer.h
new file mode 100644
index 0000000..83b7351
--- /dev/null
+++ b/ext/liquid/tokenizer.h
@@ -0,0 +1,26 @@
+#ifndef LIQUID_TOKENIZER_H
+#define LIQUID_TOKENIZER_H
+
+enum token_type {
+    TOKEN_NONE,
+    TOKEN_INVALID,
+    TOKEN_STRING,
+    TOKEN_TAG,
+    TOKEN_VARIABLE
+};
+
+struct token {
+    enum token_type type;
+    char *str;
+    int length;
+};
+
+struct liquid_tokenizer {
+    char *cursor;
+    int length;
+};
+
+void init_liquid_tokenizer();
+void liquid_tokenizer_next(struct liquid_tokenizer *tokenizer, struct token *token);
+
+#endif
diff --git a/lib/liquid.rb b/lib/liquid.rb
index 484f8b6..71f4e37 100644
--- a/lib/liquid.rb
+++ b/lib/liquid.rb
@@ -30,16 +30,13 @@ module Liquid
   VariableSegment = /[\w\-]/
   VariableStart = /\{\{/
   VariableEnd = /\}\}/
-  VariableIncompleteEnd = /\}\}?/
   QuotedString = /"[^"]*"|'[^']*'/
   QuotedFragment = /#{QuotedString}|(?:[^\s,\|'"]|#{QuotedString})+/o
   TagAttributes = /(\w+)\s*\:\s*(#{QuotedFragment})/o
-  AnyStartingTag = /\{\{|\{\%/
-  PartialTemplateParser = /#{TagStart}.*?#{TagEnd}|#{VariableStart}.*?#{VariableIncompleteEnd}/om
-  TemplateParser = /(#{PartialTemplateParser}|#{AnyStartingTag})/om
   VariableParser = /\[[^\]]+\]|#{VariableSegment}+\??/o
 end
 
+require 'liquid/liquid'
 require "liquid/version"
 require 'liquid/lexer'
 require 'liquid/parser'
diff --git a/lib/liquid/template.rb b/lib/liquid/template.rb
index 13748d4..fc97b4a 100644
--- a/lib/liquid/template.rb
+++ b/lib/liquid/template.rb
@@ -162,16 +162,9 @@ module Liquid
 
     private
 
-    # Uses the Liquid::TemplateParser regexp to tokenize the passed source
     def tokenize(source)
       source = source.source if source.respond_to?(:source)
-      return [] if source.to_s.empty?
-      tokens = source.split(TemplateParser)
-
-      # removes the rogue empty element at the beginning of the array
-      tokens.shift if tokens[0] and tokens[0].empty?
-
-      tokens
+      Tokenizer.new(source.to_s)
     end
   end
 
diff --git a/liquid.gemspec b/liquid.gemspec
index 297a716..14d23ce 100644
--- a/liquid.gemspec
+++ b/liquid.gemspec
@@ -18,13 +18,17 @@ Gem::Specification.new do |s|
   s.required_rubygems_version = ">= 1.3.7"
 
   s.test_files = Dir.glob("{test}/**/*")
-  s.files = Dir.glob("{lib}/**/*") + %w(MIT-LICENSE README.md)
+  s.files = Dir.glob("{lib,ext}/**/*") + %w(MIT-LICENSE README.md)
+  s.extensions = ['ext/liquid/extconf.rb']
 
   s.extra_rdoc_files = ["History.md", "README.md"]
 
   s.require_path = "lib"
 
-  s.add_development_dependency 'stackprof' if Gem::Version.new(RUBY_VERSION) >= Gem::Version.new("2.1.0")
   s.add_development_dependency 'rake'
   s.add_development_dependency 'activesupport'
+  if RUBY_ENGINE == 'ruby'
+    s.add_development_dependency 'rake-compiler'
+    s.add_development_dependency 'stackprof' if Gem::Version.new(RUBY_VERSION) >= Gem::Version.new("2.1.0")
+  end
 end
diff --git a/test/liquid/template_test.rb b/test/liquid/template_test.rb
index f58f8b8..1baea9f 100644
--- a/test/liquid/template_test.rb
+++ b/test/liquid/template_test.rb
@@ -25,26 +25,6 @@ end
 class TemplateTest < Test::Unit::TestCase
   include Liquid
 
-  def test_tokenize_strings
-    assert_equal [' '], Template.new.send(:tokenize, ' ')
-    assert_equal ['hello world'], Template.new.send(:tokenize, 'hello world')
-  end
-
-  def test_tokenize_variables
-    assert_equal ['{{funk}}'], Template.new.send(:tokenize, '{{funk}}')
-    assert_equal [' ', '{{funk}}', ' '], Template.new.send(:tokenize, ' {{funk}} ')
-    assert_equal [' ', '{{funk}}', ' ', '{{so}}', ' ', '{{brother}}', ' '], Template.new.send(:tokenize, ' {{funk}} {{so}} {{brother}} ')
-    assert_equal [' ', '{{ funk }}', ' '], Template.new.send(:tokenize, ' {{ funk }} ')
-  end
-
-  def test_tokenize_blocks
-    assert_equal ['{%comment%}'], Template.new.send(:tokenize, '{%comment%}')
-    assert_equal [' ', '{%comment%}', ' '], Template.new.send(:tokenize, ' {%comment%} ')
-
-    assert_equal [' ', '{%comment%}', ' ', '{%endcomment%}', ' '], Template.new.send(:tokenize, ' {%comment%} {%endcomment%} ')
-    assert_equal [' ', '{% comment %}', ' ', '{% endcomment %}', ' '], Template.new.send(:tokenize, " {% comment %} {% endcomment %} ")
-  end
-
   def test_instance_assigns_persist_on_same_template_object_between_parses
     t = Template.new
     assert_equal 'from instance assigns', t.parse("{% assign foo = 'from instance assigns' %}{{ foo }}").render!
diff --git a/test/liquid/tokenizer_test.rb b/test/liquid/tokenizer_test.rb
new file mode 100644
index 0000000..82b4116
--- /dev/null
+++ b/test/liquid/tokenizer_test.rb
@@ -0,0 +1,64 @@
+require 'test_helper'
+
+class TokenizerTest < Test::Unit::TestCase
+  def test_tokenize_strings
+    assert_equal [' '], tokenize(' ')
+    assert_equal ['hello world'], tokenize('hello world')
+  end
+
+  def test_tokenize_variables
+    assert_equal ['{{funk}}'], tokenize('{{funk}}')
+    assert_equal [' ', '{{funk}}', ' '], tokenize(' {{funk}} ')
+    assert_equal [' ', '{{funk}}', ' ', '{{so}}', ' ', '{{brother}}', ' '], tokenize(' {{funk}} {{so}} {{brother}} ')
+    assert_equal [' ', '{{ funk }}', ' '], tokenize(' {{ funk }} ')
+  end
+
+  def test_tokenize_blocks
+    assert_equal ['{%comment%}'], tokenize('{%comment%}')
+    assert_equal [' ', '{%comment%}', ' '], tokenize(' {%comment%} ')
+
+    assert_equal [' ', '{%comment%}', ' ', '{%endcomment%}', ' '], tokenize(' {%comment%} {%endcomment%} ')
+    assert_equal [' ', '{% comment %}', ' ', '{% endcomment %}', ' '], tokenize(" {% comment %} {% endcomment %} ")
+  end
+
+  def test_tokenize_incomplete_end
+    assert_tokens 'before{{ incomplete }after', ['before', '{{ incomplete }', 'after']
+    assert_tokens 'before{% incomplete %after', ['before', '{%', ' incomplete %after']
+  end
+
+  def test_tokenize_no_end
+    assert_tokens 'before{{ unterminated ', ['before', '{{', ' unterminated ']
+    assert_tokens 'before{% unterminated ', ['before', '{%', ' unterminated ']
+  end
+
+  private
+
+  def assert_tokens(source, expected)
+    assert_equal expected, tokenize(source)
+    assert_equal expected, old_tokenize(source)
+  end
+
+  def tokenize(source)
+    tokenizer = Liquid::Tokenizer.new(source)
+    tokens = []
+    while token = tokenizer.next
+      tokens << token
+    end
+    tokens
+  end
+
+  AnyStartingTag = /\{\{|\{\%/
+  VariableIncompleteEnd = /\}\}?/
+  PartialTemplateParser = /#{Liquid::TagStart}.*?#{Liquid::TagEnd}|#{Liquid::VariableStart}.*?#{VariableIncompleteEnd}/o
+  TemplateParser = /(#{PartialTemplateParser}|#{AnyStartingTag})/o
+
+  def old_tokenize(source)
+    return [] if source.to_s.empty?
+    tokens = source.split(TemplateParser)
+
+    # removes the rogue empty element at the beginning of the array
+    tokens.shift if tokens[0] and tokens[0].empty?
+
+    tokens
+  end
+end
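
Usage note (not part of the patch): the test file above doubles as a reference for the new API. Liquid::Tokenizer is constructed with the raw template source and yields one token per call to #next (aliased as #shift), returning nil once the input is exhausted. The sketch below is illustrative only; the sample template string and the commented result are assumptions derived from the tokenization rules in tokenizer.c, not output reproduced from the patch.

    # Minimal usage sketch, assuming the extension has been compiled
    # (rake compile) so that `require 'liquid'` can load 'liquid/liquid'.
    require 'liquid'

    tokenizer = Liquid::Tokenizer.new("before {{ funk }} middle {% comment %} after")
    tokens = []
    while token = tokenizer.next   # `shift` behaves identically
      tokens << token
    end
    tokens
    # => ["before ", "{{ funk }}", " middle ", "{% comment %}", " after"]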