From d86df7d7da689caf9e498664101db3095af5a32f Mon Sep 17 00:00:00 2001 From: v-jkegler Date: Tue, 19 Nov 2024 13:39:01 -0500 Subject: [PATCH] Comments on lowest_match_inner() (#56) Co-authored-by: Jeffrey Kegler --- parser/src/earley/regexvec.rs | 36 ++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/parser/src/earley/regexvec.rs b/parser/src/earley/regexvec.rs index 92b2fde..031989f 100644 --- a/parser/src/earley/regexvec.rs +++ b/parser/src/earley/regexvec.rs @@ -201,36 +201,66 @@ impl RegexVec { } // Find the lowest, or best, match in 'state'. It is the first lazy regex. - // If there is no lazy regex, and all greedy lexemes have reached end of - // input (EOI), then it is the first greedy lexeme. If neither of these + // If there is no lazy regex, and all greedy lexemes have reached the end of + // the lexeme, then it is the first greedy lexeme. If neither of these // criteria produce a choice for "best", 'None' is returned. fn lowest_match_inner(&mut self, state: StateID) -> Option<(usize, usize)> { + // 'all_eoi' is true if all greedy lexemes match, that is, if we are at + // the end of lexeme for all of them. End of lexeme is called + // "end of input" or EOI for consistency with the regex package. + // Initially, 'all_eoi' is true, vacuously. let mut all_eoi = true; + + // 'eoi_candidate' tracks the lowest (aka first or best) greedy match. + // Initially, there is none. let mut eoi_candidate = None; + + // For every regex in this state for (idx, e) in iter_state(&self.rx_sets, state) { + + // If this lexeme is not a match. (If the derivative at this point is nullable, + // there is a match, so if it is not nullable, there is no match.) if !self.exprs.is_nullable(e) { - // The derivative of 'e' is nullable, so 'e' matches. + // No match, so not at end of lexeme all_eoi = false; continue; } + + // If this is the first lazy lexeme, we can cut things short. The first + // lazy lexeme is our lowest, or best, match. 
We return it and are done. if self.lazy[idx] { let len = self.exprs.possible_lookahead_len(e); return Some((idx, len)); } + + // If we are here, we are greedy matching. + + // If all the greedy lexemes so far are matches. if all_eoi { + // If this greedy lexeme is at end of lexeme ... if self.next_byte.next_byte(&self.exprs, e) == NextByte::ForcedEOI { + // then, if we have not yet found a matching greedy lexeme, set + // this one to be our lowest match ... if eoi_candidate.is_none() { eoi_candidate = Some((idx, self.exprs.possible_lookahead_len(e))); } } else { + // ... otherwise this greedy lexeme, although nullable here, is not at end of lexeme, so indicate + // that not all greedy lexemes are at end of lexeme at this point. all_eoi = false; } } } if all_eoi { + // At this point all lexemes are greedy, and are at the end of lexeme, + // so there are no further possibilities for greediness. + // We tracked our lowest greedy lexeme in 'eoi_candidate', which we + // now return. eoi_candidate } else { + // For the greedy lexeme finding strategy, possibilities remain, + // so we have not yet settled on a lexeme, and return 'None'. None } }