// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "src/regexp/experimental/experimental-compiler.h"

#include "src/regexp/experimental/experimental.h"
#include "src/zone/zone-list-inl.h"

namespace v8 {
namespace internal {

namespace {

// TODO(mbid, v8:10765): Currently the experimental engine doesn't support
// UTF-16, but this shouldn't be too hard to implement.
constexpr uc32 kMaxSupportedCodepoint = 0xFFFFu;

class CanBeHandledVisitor final : private RegExpVisitor {
  // Visitor to implement `ExperimentalRegExp::CanBeHandled`.
 public:
  static bool Check(RegExpTree* tree, JSRegExp::Flags flags,
                    int capture_count) {
    if (!AreSuitableFlags(flags)) return false;
    CanBeHandledVisitor visitor;
    tree->Accept(&visitor, nullptr);
    return visitor.result_;
  }

 private:
  CanBeHandledVisitor() = default;

  static bool AreSuitableFlags(JSRegExp::Flags flags) {
    // TODO(mbid, v8:10765): We should be able to support all flags in the
    // future.
    static constexpr JSRegExp::Flags kAllowedFlags =
        JSRegExp::kGlobal | JSRegExp::kSticky | JSRegExp::kMultiline |
        JSRegExp::kDotAll;
    // We support Unicode iff kUnicode is among the supported flags.
    STATIC_ASSERT(ExperimentalRegExp::kSupportsUnicode ==
                  ((kAllowedFlags & JSRegExp::kUnicode) != 0));
    return (flags & ~kAllowedFlags) == 0;
  }

  void* VisitDisjunction(RegExpDisjunction* node, void*) override {
    for (RegExpTree* alt : *node->alternatives()) {
      alt->Accept(this, nullptr);
      if (!result_) {
        return nullptr;
      }
    }
    return nullptr;
  }

  void* VisitAlternative(RegExpAlternative* node, void*) override {
    for (RegExpTree* child : *node->nodes()) {
      child->Accept(this, nullptr);
      if (!result_) {
        return nullptr;
      }
    }
    return nullptr;
  }

  void* VisitCharacterClass(RegExpCharacterClass* node, void*) override {
    result_ = result_ && AreSuitableFlags(node->flags());
    return nullptr;
  }

  void* VisitAssertion(RegExpAssertion* node, void*) override {
    result_ = result_ && AreSuitableFlags(node->flags());
    return nullptr;
  }

  void* VisitAtom(RegExpAtom* node, void*) override {
    result_ = result_ && AreSuitableFlags(node->flags());
    return nullptr;
  }

  void* VisitText(RegExpText* node, void*) override {
    for (TextElement& el : *node->elements()) {
      el.tree()->Accept(this, nullptr);
      if (!result_) {
        return nullptr;
      }
    }
    return nullptr;
  }
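  // Illustrative example (not part of the original comments): nested finite
  // quantifiers multiply.  For /(?:a{4}){4}/ the outer and the inner
  // quantifier each contribute a factor of 4, so the replication factor
  // reaches exactly kMaxReplicationFactor == 16 (see `VisitQuantifier`
  // below) and the regexp is still accepted.  For /(?:a{5}){5}/ the factor
  // would be 25, so `CanBeHandled` returns false for it.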
  void* VisitQuantifier(RegExpQuantifier* node, void*) override {
    // Finite but large values of `min()` and `max()` are bad for the
    // breadth-first engine because finite (optional) repetition is dealt
    // with by replicating the bytecode of the body of the quantifier.  The
    // number of replications grows exponentially in how deeply quantifiers
    // are nested.  `replication_factor_` keeps track of how often the
    // current node will have to be replicated in the generated bytecode, and
    // we don't allow this to exceed some small value.
    static constexpr int kMaxReplicationFactor = 16;

    // First we rule out values for min and max that are too big even before
    // taking into account the ambient replication_factor_.  This also guards
    // against overflows in `local_replication` or `replication_factor_`.
    if (node->min() > kMaxReplicationFactor ||
        (node->max() != RegExpTree::kInfinity &&
         node->max() > kMaxReplicationFactor)) {
      result_ = false;
      return nullptr;
    }

    // Save the current replication factor so that it can be restored if we
    // return with `result_ == true`.
    int before_replication_factor = replication_factor_;

    int local_replication;
    if (node->max() == RegExpTree::kInfinity) {
      local_replication = node->min() + 1;
    } else {
      local_replication = node->max();
    }

    replication_factor_ *= local_replication;
    if (replication_factor_ > kMaxReplicationFactor) {
      result_ = false;
      return nullptr;
    }

    switch (node->quantifier_type()) {
      case RegExpQuantifier::GREEDY:
      case RegExpQuantifier::NON_GREEDY:
        break;
      case RegExpQuantifier::POSSESSIVE:
        // TODO(mbid, v8:10765): It's not clear to me whether this can be
        // supported in breadth-first mode.  RE2 doesn't support it.
        result_ = false;
        return nullptr;
    }

    node->body()->Accept(this, nullptr);
    replication_factor_ = before_replication_factor;
    return nullptr;
  }

  void* VisitCapture(RegExpCapture* node, void*) override {
    node->body()->Accept(this, nullptr);
    return nullptr;
  }

  void* VisitGroup(RegExpGroup* node, void*) override {
    node->body()->Accept(this, nullptr);
    return nullptr;
  }

  void* VisitLookaround(RegExpLookaround* node, void*) override {
    // TODO(mbid, v8:10765): This will be hard to support, but not impossible
    // I think.  See product automata.
    result_ = false;
    return nullptr;
  }

  void* VisitBackReference(RegExpBackReference* node, void*) override {
    // This can't be implemented without backtracking.
    result_ = false;
    return nullptr;
  }

  void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; }

 private:
  // See comment in `VisitQuantifier`:
  int replication_factor_ = 1;

  bool result_ = true;
};

}  // namespace

bool ExperimentalRegExpCompiler::CanBeHandled(RegExpTree* tree,
                                              JSRegExp::Flags flags,
                                              int capture_count) {
  DCHECK(FLAG_enable_experimental_regexp_engine);
  return CanBeHandledVisitor::Check(tree, flags, capture_count);
}

namespace experimental {

// A label in bytecode with known address.
class Label {
 public:
  explicit Label(int index) : index_(index) { DCHECK_GE(index_, 0); }
  int index() { return index_; }

  // These are friend functions rather than member functions because a member
  // call like `label.AddForkTo(code, zone)` would read as if we were adding
  // code to where `label` is defined; what actually happens is that a fork
  // with target `label` is appended at the end of `code`.
  friend void AddForkTo(Label target, ZoneList<RegExpInstruction>& code,
                        Zone* zone) {
    code.Add(RegExpInstruction::Fork(target.index_), zone);
  }
  friend void AddJmpTo(Label target, ZoneList<RegExpInstruction>& code,
                       Zone* zone) {
    code.Add(RegExpInstruction::Jmp(target.index_), zone);
  }

 private:
  int index_;
};

// A label in bytecode whose address is not known yet.  The address *must* be
// defined via `Bind` before the deferred label object goes out of scope, and
// the deferred label object *must not* be used after it was defined.  (Use
// the `Label` object returned by `Bind` instead.)
struct DeferredLabel {
  // Implemented as a linked list through the `payload.pc` of FORK and JMP
  // instructions.
 public:
  DeferredLabel() = default;
  ~DeferredLabel() { DCHECK_EQ(patch_list_begin_, kLabelWasDefined); }

  friend void AddForkTo(DeferredLabel& target,
                        ZoneList<RegExpInstruction>& code, Zone* zone) {
    DCHECK_NE(target.patch_list_begin_, DeferredLabel::kLabelWasDefined);
    int new_list_begin = code.length();
    DCHECK_GE(new_list_begin, 0);
    code.Add(RegExpInstruction::Fork(target.patch_list_begin_), zone);
    target.patch_list_begin_ = new_list_begin;
  }
  friend void AddJmpTo(DeferredLabel& target,
                       ZoneList<RegExpInstruction>& code, Zone* zone) {
    DCHECK_NE(target.patch_list_begin_, DeferredLabel::kLabelWasDefined);
    int new_list_begin = code.length();
    DCHECK_GE(new_list_begin, 0);
    code.Add(RegExpInstruction::Jmp(target.patch_list_begin_), zone);
    target.patch_list_begin_ = new_list_begin;
  }

  // Define the deferred label as referring to the next instruction that will
  // be pushed to `code`.  Consumes the DeferredLabel object and returns a
  // Label object.
  Label Bind(ZoneList<RegExpInstruction>& code) && {
    DCHECK_NE(patch_list_begin_, kLabelWasDefined);

    int index = code.length();

    while (patch_list_begin_ != kEmptyList) {
      RegExpInstruction& inst = code[patch_list_begin_];
      DCHECK(inst.opcode == RegExpInstruction::FORK ||
             inst.opcode == RegExpInstruction::JMP);
      patch_list_begin_ = inst.payload.pc;
      inst.payload.pc = index;
    }

    patch_list_begin_ = kLabelWasDefined;
    return Label(index);
  }

 private:
  static constexpr int kEmptyList = -1;
  static constexpr int kLabelWasDefined = -2;
  int patch_list_begin_ = kEmptyList;

  // Don't copy, don't move.  Moving could be implemented, but it's not
  // needed anywhere.
  DISALLOW_COPY_AND_ASSIGN(DeferredLabel);
};
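// Worked example (illustrative; not part of the original comments).  Suppose
// a fork and a jump to the same DeferredLabel are emitted at indices 7 and
// 9, and `Bind` is then called when `code.length() == 12`:
//
//   AddForkTo:  code[7] = FORK -1   (payload.pc == kEmptyList ends the list)
//               patch_list_begin_ == 7
//   AddJmpTo:   code[9] = JMP 7     (payload.pc links to the previous entry)
//               patch_list_begin_ == 9
//   Bind:       walks 9 -> 7 -> kEmptyList, patches both payloads to 12 and
//               returns Label(12).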
class CompileVisitor : private RegExpVisitor {
 public:
  static ZoneList<RegExpInstruction> Compile(RegExpTree* tree,
                                             JSRegExp::Flags flags,
                                             Zone* zone) {
    CompileVisitor compiler(zone);

    if ((flags & JSRegExp::kSticky) == 0 && !tree->IsAnchoredAtStart()) {
      // The match is not anchored, i.e. may start at any input position, so
      // we emit a preamble corresponding to /.*?/.  This skips an arbitrary
      // prefix in the input non-greedily.
      compiler.CompileNonGreedyStar([&]() {
        compiler.code_.Add(RegExpInstruction::ConsumeAnyChar(), zone);
      });
    }

    compiler.code_.Add(RegExpInstruction::SetRegisterToCp(0), zone);
    tree->Accept(&compiler, nullptr);
    compiler.code_.Add(RegExpInstruction::SetRegisterToCp(1), zone);
    compiler.code_.Add(RegExpInstruction::Accept(), zone);

    return std::move(compiler.code_);
  }

 private:
  // TODO(mbid,v8:10765): Use some upper bound for code_ capacity computed
  // from the `tree` size we're going to compile?
  explicit CompileVisitor(Zone* zone) : zone_(zone), code_(0, zone) {}

  // Generate a disjunction of code fragments compiled by a function
  // `gen_alt`.  `gen_alt` is called repeatedly with argument `int i = 0, 1,
  // ..., alt_num - 1` and should push code corresponding to the ith
  // alternative onto `code_`.
  template <class F>
  void CompileDisjunction(int alt_num, F&& gen_alt) {
    // An alternative a1 | ... | an is compiled into
    //
    //     FORK tail1
    //     <a1>
    //     JMP end
    //   tail1:
    //     FORK tail2
    //     <a2>
    //     JMP end
    //   tail2:
    //     ...
    //     ...
    //   tail{n-1}:
    //     <an>
    //   end:
    //
    // By the semantics of the FORK instruction (see the comments at its
    // definition), a forked thread has lower priority than the thread that
    // spawned it.  This means that with the code we're generating here, the
    // thread matching the alternative a1 has indeed the highest priority,
    // followed by the thread for a2 and so on.

    if (alt_num == 0) {
      // The empty disjunction.  This can never match.
      code_.Add(RegExpInstruction::Fail(), zone_);
      return;
    }

    DeferredLabel end;
    for (int i = 0; i != alt_num - 1; ++i) {
      DeferredLabel tail;
      AddForkTo(tail, code_, zone_);
      gen_alt(i);
      AddJmpTo(end, code_, zone_);
      std::move(tail).Bind(code_);
    }
    gen_alt(alt_num - 1);

    std::move(end).Bind(code_);
  }
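  // Illustrative example (not part of the original comments): for the two
  // alternatives of /a|bc/, CompileDisjunction(2, ...) emits (indices
  // relative to the start of the fragment, CONSUME_RANGE shown
  // schematically)
  //
  //   0: FORK 3                  // to tail1
  //   1: CONSUME_RANGE 'a' 'a'
  //   2: JMP 5                   // to end
  //   3: CONSUME_RANGE 'b' 'b'   // tail1:
  //   4: CONSUME_RANGE 'c' 'c'
  //   5: ...                     // end: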
  void* VisitDisjunction(RegExpDisjunction* node, void*) override {
    ZoneList<RegExpTree*>& alts = *node->alternatives();
    CompileDisjunction(alts.length(),
                       [&](int i) { alts[i]->Accept(this, nullptr); });
    return nullptr;
  }

  void* VisitAlternative(RegExpAlternative* node, void*) override {
    for (RegExpTree* child : *node->nodes()) {
      child->Accept(this, nullptr);
    }
    return nullptr;
  }

  void* VisitAssertion(RegExpAssertion* node, void*) override {
    code_.Add(RegExpInstruction::Assertion(node->assertion_type()), zone_);
    return nullptr;
  }

  void* VisitCharacterClass(RegExpCharacterClass* node, void*) override {
    // A character class is compiled as a disjunction over its
    // `CharacterRange`s.
    ZoneList<CharacterRange>* ranges = node->ranges(zone_);
    CharacterRange::Canonicalize(ranges);
    if (node->is_negated()) {
      // The complement of a disjoint, non-adjacent (i.e. `Canonicalize`d)
      // union of k intervals is a union of at most k + 1 intervals.
      ZoneList<CharacterRange>* negated =
          zone_->New<ZoneList<CharacterRange>>(ranges->length() + 1, zone_);
      CharacterRange::Negate(ranges, negated, zone_);
      DCHECK_LE(negated->length(), ranges->length() + 1);
      ranges = negated;
    }

    CompileDisjunction(ranges->length(), [&](int i) {
      // We don't support utf16 for now, so we can only deal with ranges that
      // can be specified by (complements of) ranges with uc16 bounds.
      STATIC_ASSERT(kMaxSupportedCodepoint <=
                    std::numeric_limits<uc16>::max());

      uc32 from = (*ranges)[i].from();
      DCHECK_LE(from, kMaxSupportedCodepoint);
      uc16 from_uc16 = static_cast<uc16>(from);

      uc32 to = (*ranges)[i].to();
      DCHECK_IMPLIES(to > kMaxSupportedCodepoint,
                     to == String::kMaxCodePoint);
      uc16 to_uc16 = static_cast<uc16>(std::min(to, kMaxSupportedCodepoint));

      RegExpInstruction::Uc16Range range{from_uc16, to_uc16};
      code_.Add(RegExpInstruction::ConsumeRange(range), zone_);
    });
    return nullptr;
  }

  void* VisitAtom(RegExpAtom* node, void*) override {
    for (uc16 c : node->data()) {
      code_.Add(
          RegExpInstruction::ConsumeRange(RegExpInstruction::Uc16Range{c, c}),
          zone_);
    }
    return nullptr;
  }

  void ClearRegisters(Interval indices) {
    if (indices.is_empty()) return;
    DCHECK_EQ(indices.from() % 2, 0);
    DCHECK_EQ(indices.to() % 2, 1);
    for (int i = indices.from(); i <= indices.to(); i += 2) {
      // It suffices to clear the register containing the `begin` of a
      // capture because this indicates that the capture is undefined,
      // regardless of the value in the `end` register.
      code_.Add(RegExpInstruction::ClearRegister(i), zone_);
    }
  }

  // Emit bytecode corresponding to /<emit_body>*/.
  template <class F>
  void CompileGreedyStar(F&& emit_body) {
    // This is compiled into
    //
    //   begin:
    //     FORK end
    //     <body>
    //     JMP begin
    //   end:
    //     ...
    //
    // This is greedy because a forked thread has lower priority than the
    // thread that spawned it.
    Label begin(code_.length());
    DeferredLabel end;

    AddForkTo(end, code_, zone_);
    emit_body();
    AddJmpTo(begin, code_, zone_);

    std::move(end).Bind(code_);
  }

  // Emit bytecode corresponding to /<emit_body>*?/.
  template <class F>
  void CompileNonGreedyStar(F&& emit_body) {
    // This is compiled into
    //
    //     FORK body
    //     JMP end
    //   body:
    //     <body>
    //     FORK body
    //   end:
    //     ...

    Label body(code_.length() + 2);
    DeferredLabel end;

    AddForkTo(body, code_, zone_);
    AddJmpTo(end, code_, zone_);

    DCHECK_EQ(body.index(), code_.length());
    emit_body();
    AddForkTo(body, code_, zone_);

    std::move(end).Bind(code_);
  }
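  // Illustrative example (not part of the original comments): with a body
  // consisting of the single instruction CONSUME_RANGE 'a' 'a', and indices
  // relative to the start of the fragment, CompileGreedyStar emits
  //
  //   0: FORK 3                  // begin: (end == 3)
  //   1: CONSUME_RANGE 'a' 'a'
  //   2: JMP 0                   // back to begin
  //
  // while CompileNonGreedyStar emits
  //
  //   0: FORK 2                  // into the body, at lower priority
  //   1: JMP 4                   // to end
  //   2: CONSUME_RANGE 'a' 'a'   // body:
  //   3: FORK 2                  // loop, again at lower priority
  //
  // so the greedy version prefers staying in the loop and the non-greedy
  // version prefers leaving it.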
  // Emit bytecode corresponding to /<emit_body>{0, max_repetition_num}/.
  template <class F>
  void CompileGreedyRepetition(F&& emit_body, int max_repetition_num) {
    // This is compiled into
    //
    //     FORK end
    //     <body>
    //     FORK end
    //     <body>
    //     ...
    //     ...
    //     FORK end
    //     <body>
    //   end:
    //     ...
    DeferredLabel end;
    for (int i = 0; i != max_repetition_num; ++i) {
      AddForkTo(end, code_, zone_);
      emit_body();
    }
    std::move(end).Bind(code_);
  }

  // Emit bytecode corresponding to /<emit_body>{0, max_repetition_num}?/.
  template <class F>
  void CompileNonGreedyRepetition(F&& emit_body, int max_repetition_num) {
    // This is compiled into
    //
    //     FORK body0
    //     JMP end
    //   body0:
    //     <body>
    //     FORK body1
    //     JMP end
    //   body1:
    //     <body>
    //     ...
    //     ...
    //   body{max_repetition_num - 1}:
    //     <body>
    //   end:
    //     ...
    DeferredLabel end;
    for (int i = 0; i != max_repetition_num; ++i) {
      Label body(code_.length() + 2);
      AddForkTo(body, code_, zone_);
      AddJmpTo(end, code_, zone_);

      DCHECK_EQ(body.index(), code_.length());
      emit_body();
    }
    std::move(end).Bind(code_);
  }

  void* VisitQuantifier(RegExpQuantifier* node, void*) override {
    // Emit the body, but clear registers occurring in the body first.
    //
    // TODO(mbid,v8:10765): It's not always necessary to a) capture registers
    // and b) clear them.  For example, we don't have to capture anything for
    // the first 4 repetitions if node->min() >= 5, and then we don't have to
    // clear registers in the first node->min() repetitions.
    // Also, if node->min() == 0, we don't have to clear registers before the
    // first optional repetition.
    Interval body_registers = node->body()->CaptureRegisters();
    auto emit_body = [&]() {
      ClearRegisters(body_registers);
      node->body()->Accept(this, nullptr);
    };

    // First repeat the body `min()` times.
    for (int i = 0; i != node->min(); ++i) emit_body();

    switch (node->quantifier_type()) {
      case RegExpQuantifier::POSSESSIVE:
        UNREACHABLE();
      case RegExpQuantifier::GREEDY: {
        if (node->max() == RegExpTree::kInfinity) {
          CompileGreedyStar(emit_body);
        } else {
          DCHECK_NE(node->max(), RegExpTree::kInfinity);
          CompileGreedyRepetition(emit_body, node->max() - node->min());
        }
        break;
      }
      case RegExpQuantifier::NON_GREEDY: {
        if (node->max() == RegExpTree::kInfinity) {
          CompileNonGreedyStar(emit_body);
        } else {
          DCHECK_NE(node->max(), RegExpTree::kInfinity);
          CompileNonGreedyRepetition(emit_body, node->max() - node->min());
        }
      }
    }
    return nullptr;
  }

  void* VisitCapture(RegExpCapture* node, void*) override {
    int index = node->index();
    int start_register = RegExpCapture::StartRegister(index);
    int end_register = RegExpCapture::EndRegister(index);

    code_.Add(RegExpInstruction::SetRegisterToCp(start_register), zone_);
    node->body()->Accept(this, nullptr);
    code_.Add(RegExpInstruction::SetRegisterToCp(end_register), zone_);

    return nullptr;
  }

  void* VisitGroup(RegExpGroup* node, void*) override {
    node->body()->Accept(this, nullptr);
    return nullptr;
  }

  void* VisitLookaround(RegExpLookaround* node, void*) override {
    // TODO(mbid,v8:10765): Support this case.
    UNREACHABLE();
  }

  void* VisitBackReference(RegExpBackReference* node, void*) override {
    UNREACHABLE();
  }

  void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; }

  void* VisitText(RegExpText* node, void*) override {
    for (TextElement& text_el : *node->elements()) {
      text_el.tree()->Accept(this, nullptr);
    }
    return nullptr;
  }

 private:
  Zone* zone_;
  ZoneList<RegExpInstruction> code_;
};

}  // namespace experimental

ZoneList<RegExpInstruction> ExperimentalRegExpCompiler::Compile(
    RegExpTree* tree, JSRegExp::Flags flags, Zone* zone) {
  return experimental::CompileVisitor::Compile(tree, flags, zone);
}

}  // namespace internal
}  // namespace v8