Skip to content

Commit

Permalink
Switched lexer to use util::next_char_from()
Browse files: browse the repository at this point in the history
  • Loading branch information
netvl committed Jun 27, 2015
1 parent c7ff126 commit 9881790
Showing 1 changed file with 10 additions and 28 deletions.
38 changes: 10 additions & 28 deletions src/reader/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@
use std::mem;
use std::fmt;
use std::io::prelude::*;
use std::str;
use std::result;

use common::{Error, Position, TextPosition, is_whitespace_char, is_name_char};
use util;

/// `Token` represents a single lexeme of an XML document. These lexemes
/// are used to perform actual parsing.
Expand Down Expand Up @@ -219,8 +219,7 @@ pub struct Lexer {
st: State,
skip_errors: bool,
inside_token: bool,
eof_handled: bool,
buffer: Vec<u8>
eof_handled: bool
}

impl Position for Lexer {
Expand All @@ -239,8 +238,7 @@ impl Lexer {
st: State::Normal,
skip_errors: false,
inside_token: false,
eof_handled: false,
buffer: Vec::with_capacity(4)
eof_handled: false
}
}

Expand Down Expand Up @@ -273,6 +271,7 @@ impl Lexer {
self.inside_token = true;
}

// TODO: replace with a queue of chars
// Check if we have saved a char for ourselves
if let Some(c) = mem::replace(&mut self.temp_char, None) {
match self.read_next_token(c) {
Expand All @@ -284,32 +283,15 @@ impl Lexer {
}
}

// Reading characters
self.buffer.clear();
loop {
// Read a byte in order to read a UTF-8 code point
// TODO: read error is ignored here
if let Some(byte) = b.bytes().next().and_then(|i| i.ok()) {
self.buffer.push(byte);
} else {
// Nothing to read left in the reader
break;
}

// Try to get one unicode code point
// As we added only a byte, we can get at most a utf-8 string with
// a single code point.
let cp = match str::from_utf8(&self.buffer) {
Ok(s) => s.chars().next().unwrap(), // the string contains at least one code point
Err(_) => {
// continue until we get a valid cp
continue;
}
// TODO: this should handle multiple encodings
let c = match util::next_char_from(b) {
Ok(Some(c)) => c, // got next char
Ok(None) => break, // nothing to read left
Err(_) => break // FIXME: errors should be handled properly
};
// string was read, discard the buffer
self.buffer.clear();

match self.read_next_token(cp) {
match self.read_next_token(c) {
Some(t) => {
self.inside_token = false;
return Some(t);
Expand Down

0 comments on commit 9881790

Please sign in to comment.