Create a Liquid::Tokenizer class in the C extension.

Dylan Thacker-Smith
2014-02-27 15:19:27 -05:00
parent 8bb3bca64a
commit ea8406e36e
9 changed files with 220 additions and 34 deletions
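
In rough terms, the new class replaces Template#tokenize's regexp splitting: construct it with the template source and call next (aliased as shift) until it returns nil. A minimal usage sketch, not part of this diff, assuming only the API defined in ext/liquid/tokenizer.c below (the require path is an assumption about how the gem loads the compiled extension):

    require 'liquid'  # assumed entry point that loads the compiled extension

    tokenizer = Liquid::Tokenizer.new("Hello {{ name }}! {% if admin %}hi{% endif %}")

    # next/shift return one token string at a time, and nil once the source is consumed
    tokens = []
    while token = tokenizer.next
      tokens << token
    end
    tokens
    # => ["Hello ", "{{ name }}", "! ", "{% if admin %}", "hi", "{% endif %}"]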


@@ -84,3 +84,4 @@ end
Rake::ExtensionTask.new "liquid" do |ext|
  ext.lib_dir = "lib/liquid"
end
Rake::Task[:test].prerequisites << :compile


@@ -1,6 +1,9 @@
#include "ruby.h"
#include "liquid_ext.h"
VALUE mLiquid;
void Init_liquid(void)
{
    VALUE mLiquid = rb_define_module("Liquid");
    mLiquid = rb_define_module("Liquid");
    init_liquid_tokenizer();
}

ext/liquid/liquid_ext.h (new file, 8 lines)

@@ -0,0 +1,8 @@
#ifndef LIQUID_EXT_H
#define LIQUID_EXT_H
#include <ruby.h>
#include <stdbool.h>
#include "tokenizer.h"
#endif

ext/liquid/tokenizer.c (new file, 115 lines)

@@ -0,0 +1,115 @@
#include "liquid_ext.h"
VALUE cLiquidTokenizer;
extern VALUE mLiquid;
static void free_tokenizer(void *ptr)
{
    struct liquid_tokenizer *tokenizer = ptr;
    xfree(tokenizer);
}
static VALUE rb_allocate(VALUE klass)
{
    VALUE obj;
    struct liquid_tokenizer *tokenizer;
    obj = Data_Make_Struct(klass, struct liquid_tokenizer, NULL, free_tokenizer, tokenizer);
    return obj;
}
static VALUE rb_initialize(VALUE self, VALUE source)
{
    struct liquid_tokenizer *tokenizer;
    Check_Type(source, T_STRING);
    Data_Get_Struct(self, struct liquid_tokenizer, tokenizer);
    // The tokenizer borrows the string's buffer, so the source string must
    // stay alive (and unmodified) for as long as the tokenizer is used.
    tokenizer->cursor = RSTRING_PTR(source);
    tokenizer->length = RSTRING_LEN(source);
    return Qnil;
}
void liquid_tokenizer_next(struct liquid_tokenizer *tokenizer, struct token *token)
{
    if (tokenizer->length <= 0) {
        memset(token, 0, sizeof(*token));
        return;
    }
    token->type = TOKEN_STRING;

    char *cursor = tokenizer->cursor;
    char *last = tokenizer->cursor + tokenizer->length - 1;

    while (cursor < last) {
        if (*cursor++ != '{')
            continue;
        char c = *cursor++;
        if (c != '%' && c != '{')
            continue;
        // A tag or variable starts after some leading text: emit the text as
        // a string token first.
        if (cursor - tokenizer->cursor > 2) {
            token->type = TOKEN_STRING;
            cursor -= 2;
            goto found;
        }
        char *incomplete_end = cursor;
        token->type = TOKEN_INVALID;
        if (c == '%') {
            // Tag: scan for the closing "%}".
            while (cursor < last) {
                if (*cursor++ != '%')
                    continue;
                c = *cursor++;
                while (c == '%' && cursor <= last)
                    c = *cursor++;
                if (c != '}')
                    continue;
                token->type = TOKEN_TAG;
                goto found;
            }
            // FIXME: Handle syntax error for strict mode
            cursor = incomplete_end;
            goto found;
        } else {
            // Variable: scan for the closing "}}".
            while (cursor < last) {
                if (*cursor++ != '}')
                    continue;
                if (*cursor++ != '}') {
                    incomplete_end = cursor - 1;
                    continue;
                }
                token->type = TOKEN_VARIABLE;
                goto found;
            }
            // FIXME: Handle syntax error for strict mode
            cursor = incomplete_end;
            goto found;
        }
    }
    // No tag or variable start found: the rest of the source is one string token.
    cursor = last + 1;
found:
    token->str = tokenizer->cursor;
    token->length = cursor - tokenizer->cursor;
    tokenizer->cursor += token->length;
    tokenizer->length -= token->length;
}
static VALUE rb_next(VALUE self)
{
    struct liquid_tokenizer *tokenizer;
    Data_Get_Struct(self, struct liquid_tokenizer, tokenizer);

    struct token token;
    liquid_tokenizer_next(tokenizer, &token);
    if (token.type == TOKEN_NONE)
        return Qnil;
    return rb_str_new(token.str, token.length);
}
void init_liquid_tokenizer()
{
    cLiquidTokenizer = rb_define_class_under(mLiquid, "Tokenizer", rb_cObject);
    rb_define_alloc_func(cLiquidTokenizer, rb_allocate);
    rb_define_method(cLiquidTokenizer, "initialize", rb_initialize, 1);
    rb_define_method(cLiquidTokenizer, "next", rb_next, 0);
    rb_define_alias(cLiquidTokenizer, "shift", "next");
}

ext/liquid/tokenizer.h (new file, 26 lines)

@@ -0,0 +1,26 @@
#ifndef LIQUID_TOKENIZER_H
#define LIQUID_TOKENIZER_H

enum token_type {
    TOKEN_NONE,
    TOKEN_INVALID,
    TOKEN_STRING,
    TOKEN_TAG,
    TOKEN_VARIABLE
};

struct token {
    enum token_type type;
    char *str;
    int length;
};

struct liquid_tokenizer {
    char *cursor;
    int length;
};

void init_liquid_tokenizer();
void liquid_tokenizer_next(struct liquid_tokenizer *tokenizer, struct token *token);

#endif


@@ -30,7 +30,6 @@ module Liquid
  VariableSegment = /[\w\-]/
  VariableStart = /\{\{/
  VariableEnd = /\}\}/
  VariableIncompleteEnd = /\}\}?/
  QuotedString = /"[^"]*"|'[^']*'/
  QuotedFragment = /#{QuotedString}|(?:[^\s,\|'"]|#{QuotedString})+/o
  StrictQuotedFragment = /"[^"]+"|'[^']+'|[^\s|:,]+/
@@ -39,9 +38,6 @@ module Liquid
  SpacelessFilter = /\A(?:'[^']+'|"[^"]+"|[^'"])*#{FilterSeparator}(?:#{StrictQuotedFragment})(?:#{FirstFilterArgument}(?:#{OtherFilterArgument})*)?/o
  Expression = /(?:#{QuotedFragment}(?:#{SpacelessFilter})*)/o
  TagAttributes = /(\w+)\s*\:\s*(#{QuotedFragment})/o
  AnyStartingTag = /\{\{|\{\%/
  PartialTemplateParser = /#{TagStart}.*?#{TagEnd}|#{VariableStart}.*?#{VariableIncompleteEnd}/o
  TemplateParser = /(#{PartialTemplateParser}|#{AnyStartingTag})/o
  VariableParser = /\[[^\]]+\]|#{VariableSegment}+\??/o
end


@@ -162,16 +162,9 @@ module Liquid
    private

    # Uses the <tt>Liquid::TemplateParser</tt> regexp to tokenize the passed source
    def tokenize(source)
      source = source.source if source.respond_to?(:source)
      return [] if source.to_s.empty?
      tokens = source.split(TemplateParser)
      # removes the rogue empty element at the beginning of the array
      tokens.shift if tokens[0] and tokens[0].empty?
      tokens
      Tokenizer.new(source.to_s)
    end
  end


@@ -25,26 +25,6 @@ end
class TemplateTest < Test::Unit::TestCase
  include Liquid

  def test_tokenize_strings
    assert_equal [' '], Template.new.send(:tokenize, ' ')
    assert_equal ['hello world'], Template.new.send(:tokenize, 'hello world')
  end

  def test_tokenize_variables
    assert_equal ['{{funk}}'], Template.new.send(:tokenize, '{{funk}}')
    assert_equal [' ', '{{funk}}', ' '], Template.new.send(:tokenize, ' {{funk}} ')
    assert_equal [' ', '{{funk}}', ' ', '{{so}}', ' ', '{{brother}}', ' '], Template.new.send(:tokenize, ' {{funk}} {{so}} {{brother}} ')
    assert_equal [' ', '{{ funk }}', ' '], Template.new.send(:tokenize, ' {{ funk }} ')
  end

  def test_tokenize_blocks
    assert_equal ['{%comment%}'], Template.new.send(:tokenize, '{%comment%}')
    assert_equal [' ', '{%comment%}', ' '], Template.new.send(:tokenize, ' {%comment%} ')
    assert_equal [' ', '{%comment%}', ' ', '{%endcomment%}', ' '], Template.new.send(:tokenize, ' {%comment%} {%endcomment%} ')
    assert_equal [' ', '{% comment %}', ' ', '{% endcomment %}', ' '], Template.new.send(:tokenize, " {% comment %} {% endcomment %} ")
  end

  def test_instance_assigns_persist_on_same_template_object_between_parses
    t = Template.new
    assert_equal 'from instance assigns', t.parse("{% assign foo = 'from instance assigns' %}{{ foo }}").render


@@ -0,0 +1,64 @@
require 'test_helper'
class TokenizerTest < Test::Unit::TestCase
  def test_tokenize_strings
    assert_equal [' '], tokenize(' ')
    assert_equal ['hello world'], tokenize('hello world')
  end

  def test_tokenize_variables
    assert_equal ['{{funk}}'], tokenize('{{funk}}')
    assert_equal [' ', '{{funk}}', ' '], tokenize(' {{funk}} ')
    assert_equal [' ', '{{funk}}', ' ', '{{so}}', ' ', '{{brother}}', ' '], tokenize(' {{funk}} {{so}} {{brother}} ')
    assert_equal [' ', '{{ funk }}', ' '], tokenize(' {{ funk }} ')
  end

  def test_tokenize_blocks
    assert_equal ['{%comment%}'], tokenize('{%comment%}')
    assert_equal [' ', '{%comment%}', ' '], tokenize(' {%comment%} ')
    assert_equal [' ', '{%comment%}', ' ', '{%endcomment%}', ' '], tokenize(' {%comment%} {%endcomment%} ')
    assert_equal [' ', '{% comment %}', ' ', '{% endcomment %}', ' '], tokenize(" {% comment %} {% endcomment %} ")
  end

  def test_tokenize_incomplete_end
    assert_tokens 'before{{ incomplete }after', ['before', '{{ incomplete }', 'after']
    assert_tokens 'before{% incomplete %after', ['before', '{%', ' incomplete %after']
  end

  def test_tokenize_no_end
    assert_tokens 'before{{ unterminated ', ['before', '{{', ' unterminated ']
    assert_tokens 'before{% unterminated ', ['before', '{%', ' unterminated ']
  end

  private

  def assert_tokens(source, expected)
    assert_equal expected, tokenize(source)
    assert_equal expected, old_tokenize(source)
  end

  def tokenize(source)
    tokenizer = Liquid::Tokenizer.new(source)
    tokens = []
    while token = tokenizer.next
      tokens << token
    end
    tokens
  end

  AnyStartingTag = /\{\{|\{\%/
  VariableIncompleteEnd = /\}\}?/
  PartialTemplateParser = /#{Liquid::TagStart}.*?#{Liquid::TagEnd}|#{Liquid::VariableStart}.*?#{VariableIncompleteEnd}/o
  TemplateParser = /(#{PartialTemplateParser}|#{AnyStartingTag})/o

  def old_tokenize(source)
    return [] if source.to_s.empty?
    tokens = source.split(TemplateParser)
    # removes the rogue empty element at the beginning of the array
    tokens.shift if tokens[0] and tokens[0].empty?
    tokens
  end
end