From ea8406e36e98a78819129566b0b1414bbc4f800c Mon Sep 17 00:00:00 2001
From: Dylan Thacker-Smith
Date: Thu, 27 Feb 2014 15:19:27 -0500
Subject: [PATCH] Create a Liquid::Tokenizer class in the C extension.

---
 Rakefile                      |   1 +
 ext/liquid/liquid_ext.c       |   7 ++-
 ext/liquid/liquid_ext.h       |   8 +++
 ext/liquid/tokenizer.c        | 126 ++++++++++++++++++++++++++++++++++
 ext/liquid/tokenizer.h        |  27 +++++++
 lib/liquid.rb                 |   4 --
 lib/liquid/template.rb        |   9 +--
 test/liquid/template_test.rb  |  20 -----
 test/liquid/tokenizer_test.rb |  64 ++++++++++++++++
 9 files changed, 232 insertions(+), 34 deletions(-)
 create mode 100644 ext/liquid/liquid_ext.h
 create mode 100644 ext/liquid/tokenizer.c
 create mode 100644 ext/liquid/tokenizer.h
 create mode 100644 test/liquid/tokenizer_test.rb

diff --git a/Rakefile b/Rakefile
index b7f8f30..9371c5a 100755
--- a/Rakefile
+++ b/Rakefile
@@ -84,3 +84,4 @@ end
 Rake::ExtensionTask.new "liquid" do |ext|
   ext.lib_dir = "lib/liquid"
 end
+Rake::Task[:test].prerequisites << :compile
diff --git a/ext/liquid/liquid_ext.c b/ext/liquid/liquid_ext.c
index 4d7a47d..5dab635 100644
--- a/ext/liquid/liquid_ext.c
+++ b/ext/liquid/liquid_ext.c
@@ -1,6 +1,9 @@
-#include "ruby.h"
+#include "liquid_ext.h"
+
+VALUE mLiquid;
 
 void Init_liquid(void)
 {
-    VALUE mLiquid = rb_define_module("Liquid");
+    mLiquid = rb_define_module("Liquid");
+    init_liquid_tokenizer();
 }
diff --git a/ext/liquid/liquid_ext.h b/ext/liquid/liquid_ext.h
new file mode 100644
index 0000000..065a90f
--- /dev/null
+++ b/ext/liquid/liquid_ext.h
@@ -0,0 +1,8 @@
+#ifndef LIQUID_EXT_H
+#define LIQUID_EXT_H
+
+#include <ruby.h>
+#include <string.h>
+#include "tokenizer.h"
+
+#endif
diff --git a/ext/liquid/tokenizer.c b/ext/liquid/tokenizer.c
new file mode 100644
index 0000000..096306f
--- /dev/null
+++ b/ext/liquid/tokenizer.c
@@ -0,0 +1,126 @@
+#include "liquid_ext.h"
+
+VALUE cLiquidTokenizer;
+extern VALUE mLiquid;
+
+/* GC free callback: release the tokenizer struct itself. */
+static void free_tokenizer(void *ptr)
+{
+    struct liquid_tokenizer *tokenizer = ptr;
+    xfree(tokenizer);
+}
+
+/* GC mark callback: keep the source string alive while we hold raw
+ * pointers into its buffer. */
+static void mark_tokenizer(void *ptr)
+{
+    struct liquid_tokenizer *tokenizer = ptr;
+    rb_gc_mark(tokenizer->source);
+}
+
+static VALUE rb_allocate(VALUE klass)
+{
+    VALUE obj;
+    struct liquid_tokenizer *tokenizer;
+
+    obj = Data_Make_Struct(klass, struct liquid_tokenizer, mark_tokenizer, free_tokenizer, tokenizer);
+    return obj;
+}
+
+static VALUE rb_initialize(VALUE self, VALUE source)
+{
+    struct liquid_tokenizer *tokenizer;
+
+    Check_Type(source, T_STRING);
+    Data_Get_Struct(self, struct liquid_tokenizer, tokenizer);
+    tokenizer->source = source;
+    tokenizer->cursor = RSTRING_PTR(source);
+    tokenizer->length = RSTRING_LEN(source);
+    return self;
+}
+
+void liquid_tokenizer_next(struct liquid_tokenizer *tokenizer, struct token *token)
+{
+    if (tokenizer->length <= 0) {
+        memset(token, 0, sizeof(*token));
+        return;
+    }
+    token->type = TOKEN_STRING;
+
+    char *cursor = tokenizer->cursor;
+    char *last = tokenizer->cursor + tokenizer->length - 1;
+
+    while (cursor < last) {
+        if (*cursor++ != '{')
+            continue;
+
+        char c = *cursor++;
+        if (c != '%' && c != '{')
+            continue;
+        if (cursor - tokenizer->cursor > 2) {
+            token->type = TOKEN_STRING;
+            cursor -= 2;
+            goto found;
+        }
+        char *incomplete_end = cursor;
+        token->type = TOKEN_INVALID;
+        if (c == '%') {
+            while (cursor < last) {
+                if (*cursor++ != '%')
+                    continue;
+                c = *cursor++;
+                while (c == '%' && cursor <= last)
+                    c = *cursor++;
+                if (c != '}')
+                    continue;
+                token->type = TOKEN_TAG;
+                goto found;
+            }
+            // FIXME: Handle syntax error for strict mode
+            cursor = incomplete_end;
+            goto found;
+        } else {
+            while (cursor < last) {
+                if (*cursor++ != '}')
+                    continue;
+                if (*cursor++ != '}') {
+                    incomplete_end = cursor - 1;
+                    continue;
+                }
+                token->type = TOKEN_VARIABLE;
+                goto found;
+            }
+            // FIXME: Handle syntax error for strict mode
+            cursor = incomplete_end;
+            goto found;
+        }
+    }
+    cursor = last + 1;
+found:
+    token->str = tokenizer->cursor;
+    token->length = cursor - tokenizer->cursor;
+    tokenizer->cursor += token->length;
+    tokenizer->length -= token->length;
+}
+
+static VALUE rb_next(VALUE self)
+{
+    struct liquid_tokenizer *tokenizer;
+    Data_Get_Struct(self, struct liquid_tokenizer, tokenizer);
+
+    struct token token;
+    liquid_tokenizer_next(tokenizer, &token);
+    if (token.type == TOKEN_NONE)
+        return Qnil;
+
+    return rb_str_new(token.str, token.length);
+}
+
+void init_liquid_tokenizer(void)
+{
+    cLiquidTokenizer = rb_define_class_under(mLiquid, "Tokenizer", rb_cObject);
+    rb_define_alloc_func(cLiquidTokenizer, rb_allocate);
+    rb_define_method(cLiquidTokenizer, "initialize", rb_initialize, 1);
+    rb_define_method(cLiquidTokenizer, "next", rb_next, 0);
+    rb_define_alias(cLiquidTokenizer, "shift", "next");
+}
diff --git a/ext/liquid/tokenizer.h b/ext/liquid/tokenizer.h
new file mode 100644
index 0000000..83b7351
--- /dev/null
+++ b/ext/liquid/tokenizer.h
@@ -0,0 +1,27 @@
+#ifndef LIQUID_TOKENIZER_H
+#define LIQUID_TOKENIZER_H
+
+enum token_type {
+    TOKEN_NONE,
+    TOKEN_INVALID,
+    TOKEN_STRING,
+    TOKEN_TAG,
+    TOKEN_VARIABLE
+};
+
+struct token {
+    enum token_type type;
+    char *str;
+    long length;
+};
+
+struct liquid_tokenizer {
+    VALUE source; /* marked by the GC so cursor stays valid */
+    char *cursor;
+    long length;
+};
+
+void init_liquid_tokenizer(void);
+void liquid_tokenizer_next(struct liquid_tokenizer *tokenizer, struct token *token);
+
+#endif
diff --git a/lib/liquid.rb b/lib/liquid.rb
index 6bcf43e..2f29b37 100644
--- a/lib/liquid.rb
+++ b/lib/liquid.rb
@@ -30,7 +30,6 @@ module Liquid
   VariableSegment = /[\w\-]/
   VariableStart = /\{\{/
   VariableEnd = /\}\}/
-  VariableIncompleteEnd = /\}\}?/
   QuotedString = /"[^"]*"|'[^']*'/
   QuotedFragment = /#{QuotedString}|(?:[^\s,\|'"]|#{QuotedString})+/o
   StrictQuotedFragment = /"[^"]+"|'[^']+'|[^\s|:,]+/
@@ -39,9 +38,6 @@ module Liquid
   SpacelessFilter = /\A(?:'[^']+'|"[^"]+"|[^'"])*#{FilterSeparator}(?:#{StrictQuotedFragment})(?:#{FirstFilterArgument}(?:#{OtherFilterArgument})*)?/o
   Expression = /(?:#{QuotedFragment}(?:#{SpacelessFilter})*)/o
   TagAttributes = /(\w+)\s*\:\s*(#{QuotedFragment})/o
-  AnyStartingTag = /\{\{|\{\%/
-  PartialTemplateParser = /#{TagStart}.*?#{TagEnd}|#{VariableStart}.*?#{VariableIncompleteEnd}/o
-  TemplateParser = /(#{PartialTemplateParser}|#{AnyStartingTag})/o
   VariableParser = /\[[^\]]+\]|#{VariableSegment}+\??/o
 
 end
diff --git a/lib/liquid/template.rb b/lib/liquid/template.rb
index 1f2bfd1..feb6ccf 100644
--- a/lib/liquid/template.rb
+++ b/lib/liquid/template.rb
@@ -162,16 +162,9 @@ module Liquid
 
     private
 
-    # Uses the Liquid::TemplateParser regexp to tokenize the passed source
     def tokenize(source)
       source = source.source if source.respond_to?(:source)
-      return [] if source.to_s.empty?
-      tokens = source.split(TemplateParser)
-
-      # removes the rogue empty element at the beginning of the array
-      tokens.shift if tokens[0] and tokens[0].empty?
-
-      tokens
+      Tokenizer.new(source.to_s)
     end
   end
 end
diff --git a/test/liquid/template_test.rb b/test/liquid/template_test.rb
index f9afac1..abe2af0 100644
--- a/test/liquid/template_test.rb
+++ b/test/liquid/template_test.rb
@@ -25,26 +25,6 @@ end
 class TemplateTest < Test::Unit::TestCase
   include Liquid
 
-  def test_tokenize_strings
-    assert_equal [' '], Template.new.send(:tokenize, ' ')
-    assert_equal ['hello world'], Template.new.send(:tokenize, 'hello world')
-  end
-
-  def test_tokenize_variables
-    assert_equal ['{{funk}}'], Template.new.send(:tokenize, '{{funk}}')
-    assert_equal [' ', '{{funk}}', ' '], Template.new.send(:tokenize, ' {{funk}} ')
-    assert_equal [' ', '{{funk}}', ' ', '{{so}}', ' ', '{{brother}}', ' '], Template.new.send(:tokenize, ' {{funk}} {{so}} {{brother}} ')
-    assert_equal [' ', '{{ funk }}', ' '], Template.new.send(:tokenize, ' {{ funk }} ')
-  end
-
-  def test_tokenize_blocks
-    assert_equal ['{%comment%}'], Template.new.send(:tokenize, '{%comment%}')
-    assert_equal [' ', '{%comment%}', ' '], Template.new.send(:tokenize, ' {%comment%} ')
-
-    assert_equal [' ', '{%comment%}', ' ', '{%endcomment%}', ' '], Template.new.send(:tokenize, ' {%comment%} {%endcomment%} ')
-    assert_equal [' ', '{% comment %}', ' ', '{% endcomment %}', ' '], Template.new.send(:tokenize, " {% comment %} {% endcomment %} ")
-  end
-
   def test_instance_assigns_persist_on_same_template_object_between_parses
     t = Template.new
     assert_equal 'from instance assigns', t.parse("{% assign foo = 'from instance assigns' %}{{ foo }}").render
diff --git a/test/liquid/tokenizer_test.rb b/test/liquid/tokenizer_test.rb
new file mode 100644
index 0000000..82b4116
--- /dev/null
+++ b/test/liquid/tokenizer_test.rb
@@ -0,0 +1,64 @@
+require 'test_helper'
+
+class TokenizerTest < Test::Unit::TestCase
+  def test_tokenize_strings
+    assert_equal [' '], tokenize(' ')
+    assert_equal ['hello world'], tokenize('hello world')
+  end
+
+  def test_tokenize_variables
+    assert_equal ['{{funk}}'], tokenize('{{funk}}')
+    assert_equal [' ', '{{funk}}', ' '], tokenize(' {{funk}} ')
+    assert_equal [' ', '{{funk}}', ' ', '{{so}}', ' ', '{{brother}}', ' '], tokenize(' {{funk}} {{so}} {{brother}} ')
+    assert_equal [' ', '{{ funk }}', ' '], tokenize(' {{ funk }} ')
+  end
+
+  def test_tokenize_blocks
+    assert_equal ['{%comment%}'], tokenize('{%comment%}')
+    assert_equal [' ', '{%comment%}', ' '], tokenize(' {%comment%} ')
+
+    assert_equal [' ', '{%comment%}', ' ', '{%endcomment%}', ' '], tokenize(' {%comment%} {%endcomment%} ')
+    assert_equal [' ', '{% comment %}', ' ', '{% endcomment %}', ' '], tokenize(" {% comment %} {% endcomment %} ")
+  end
+
+  def test_tokenize_incomplete_end
+    assert_tokens 'before{{ incomplete }after', ['before', '{{ incomplete }', 'after']
+    assert_tokens 'before{% incomplete %after', ['before', '{%', ' incomplete %after']
+  end
+
+  def test_tokenize_no_end
+    assert_tokens 'before{{ unterminated ', ['before', '{{', ' unterminated ']
+    assert_tokens 'before{% unterminated ', ['before', '{%', ' unterminated ']
+  end
+
+  private
+
+  def assert_tokens(source, expected)
+    assert_equal expected, tokenize(source)
+    assert_equal expected, old_tokenize(source)
+  end
+
+  def tokenize(source)
+    tokenizer = Liquid::Tokenizer.new(source)
+    tokens = []
+    while token = tokenizer.next
+      tokens << token
+    end
+    tokens
+  end
+
+  AnyStartingTag = /\{\{|\{\%/
+  VariableIncompleteEnd = /\}\}?/
+  PartialTemplateParser = /#{Liquid::TagStart}.*?#{Liquid::TagEnd}|#{Liquid::VariableStart}.*?#{VariableIncompleteEnd}/o
+  TemplateParser = /(#{PartialTemplateParser}|#{AnyStartingTag})/o
+
+  def old_tokenize(source)
+    return [] if source.to_s.empty?
+    tokens = source.split(TemplateParser)
+
+    # removes the rogue empty element at the beginning of the array
+    tokens.shift if tokens[0] and tokens[0].empty?
+
+    tokens
+  end
+end