diff --git a/builtin-functions/kphp-light/regex.txt b/builtin-functions/kphp-light/regex.txt index 50dd282fa1..877ff804ca 100644 --- a/builtin-functions/kphp-light/regex.txt +++ b/builtin-functions/kphp-light/regex.txt @@ -23,13 +23,24 @@ function preg_match_all ($regex ::: string, $subject ::: string, &$matches ::: m function preg_quote ($str ::: string, $delimiter ::: string = ''): string; -function preg_replace ($regex ::: string | array, $replace_val ::: string | array, $subject ::: string | array, $limit ::: int = -1, &$replace_count ::: int = TODO): string | ^3 | false | null; +function preg_replace( + $regex ::: string | array, + $replace_val ::: string | array, + $subject ::: string | array, + $limit ::: int = -1, + &$replace_count ::: int = TODO): string | ^3 | false | null; + +/** @kphp-extern-func-info interruptible */ +function preg_replace_callback( + $regex ::: string | array, + callable(string[] $x):string $callback, + $subject ::: string | array, + $limit ::: int = -1, + &$replace_count ::: int = TODO, + $flags ::: int = 0): string | ^3 | null; // ===== UNSUPPORTED ===== - -function preg_replace_callback ($regex ::: regexp, callable(string[] $x):string $callback, $subject, $limit ::: int = -1, &$replace_count ::: int = TODO) ::: ^3|string|null; - /** @kphp-extern-func-info generate-stub */ function preg_last_error() ::: int; diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index adfd41a91a..10a144414d 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -53,10 +53,11 @@ struct RegexInfo final { // vector of group names regex_pcre2_group_names_t group_names; - int32_t match_count{}; - size_t match_options{PCRE2_NO_UTF_CHECK}; + int64_t match_count{}; + uint32_t match_options{PCRE2_NO_UTF_CHECK}; int64_t replace_count{}; + uint32_t replace_options{PCRE2_SUBSTITUTE_UNKNOWN_UNSET | PCRE2_SUBSTITUTE_UNSET_EMPTY}; // contains a string after replacements if replace_count > 0, nullopt otherwise std::optional opt_replace_result; @@ -68,17 +69,9 @@ struct RegexInfo final { , replacement(replacement_) {} }; -bool valid_preg_replace_mixed(const mixed ¶m) noexcept { - if (!param.is_array() && !param.is_string()) [[unlikely]] { - php_warning("invalid parameter: expected to be string or array"); - return false; - } - return true; -} - template requires((std::is_same_v && ...) && sizeof...(Args) > 0) bool valid_regex_flags(int64_t flags, Args... supported_flags) noexcept { - const bool valid{(flags & ~(supported_flags | ...)) == regex::PREG_NO_FLAGS}; + const bool valid{(flags & ~(supported_flags | ...)) == kphp::regex::PREG_NO_FLAGS}; if (!valid) [[unlikely]] { php_warning("invalid flags: %" PRIi64, flags); } @@ -334,8 +327,8 @@ PCRE2_SIZE set_matches(const RegexInfo ®ex_info, int64_t flags, mixed &matche const auto ®ex_state{RegexInstanceState::get()}; - const auto offset_capture{static_cast(flags & regex::PREG_OFFSET_CAPTURE)}; - const auto unmatched_as_null{static_cast(flags & regex::PREG_UNMATCHED_AS_NULL)}; + const auto offset_capture{static_cast(flags & kphp::regex::PREG_OFFSET_CAPTURE)}; + const auto unmatched_as_null{static_cast(flags & kphp::regex::PREG_UNMATCHED_AS_NULL)}; // get the ouput vector from the match data const auto *ovector{pcre2_get_ovector_pointer_8(regex_state.regex_pcre2_match_data.get())}; // calculate last matched group @@ -389,7 +382,7 @@ PCRE2_SIZE set_matches(const RegexInfo ®ex_info, int64_t flags, mixed &matche // *** importrant *** // in case of a pattern order all_matches must already contain all groups as empty arrays before the first call to set_all_matches PCRE2_SIZE set_all_matches(const RegexInfo ®ex_info, int64_t flags, mixed &all_matches) noexcept { - const auto pattern_order{!static_cast(flags & regex::PREG_SET_ORDER)}; + const auto pattern_order{!static_cast(flags & kphp::regex::PREG_SET_ORDER)}; mixed matches; PCRE2_SIZE offset{set_matches(regex_info, flags, matches, pattern_order ? trailing_unmatch::include : trailing_unmatch::skip)}; @@ -427,10 +420,10 @@ bool replace_regex(RegexInfo ®ex_info, uint64_t limit) noexcept { // replace all occurences if (limit == std::numeric_limits::max()) [[likely]] { - regex_info.replace_count = - pcre2_substitute_8(regex_info.regex_code, reinterpret_cast(regex_info.subject.data()), regex_info.subject.size(), 0, PCRE2_SUBSTITUTE_GLOBAL, - nullptr, regex_state.match_context.get(), reinterpret_cast(regex_info.replacement.data()), regex_info.replacement.size(), - reinterpret_cast(runtime_ctx.static_SB.buffer()), std::addressof(output_length)); + regex_info.replace_count = pcre2_substitute_8(regex_info.regex_code, reinterpret_cast(regex_info.subject.data()), regex_info.subject.size(), 0, + regex_info.replace_options | PCRE2_SUBSTITUTE_GLOBAL, nullptr, regex_state.match_context.get(), + reinterpret_cast(regex_info.replacement.data()), regex_info.replacement.size(), + reinterpret_cast(runtime_ctx.static_SB.buffer()), std::addressof(output_length)); if (regex_info.replace_count < 0) [[unlikely]] { std::array buffer{}; @@ -459,7 +452,7 @@ bool replace_regex(RegexInfo ®ex_info, uint64_t limit) noexcept { length_after_replace = buffer_length; if (auto replace_one{pcre2_substitute_8(regex_info.regex_code, reinterpret_cast(str_after_replace.c_str()), str_after_replace.size(), - substitute_offset, 0, nullptr, regex_state.match_context.get(), + substitute_offset, regex_info.replace_options, nullptr, regex_state.match_context.get(), reinterpret_cast(regex_info.replacement.data()), regex_info.replacement.size(), reinterpret_cast(runtime_ctx.static_SB.buffer()), std::addressof(length_after_replace))}; replace_one != 1) [[unlikely]] { @@ -490,7 +483,7 @@ Optional f$preg_match(const string &pattern, const string &subject, mix matches = array{}; RegexInfo regex_info{{pattern.c_str(), pattern.size()}, {subject.c_str(), subject.size()}, {}}; - bool success{valid_regex_flags(flags, regex::PREG_NO_FLAGS, regex::PREG_OFFSET_CAPTURE, regex::PREG_UNMATCHED_AS_NULL)}; + bool success{valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_OFFSET_CAPTURE, kphp::regex::PREG_UNMATCHED_AS_NULL)}; success &= correct_offset(offset, regex_info.subject); success &= parse_regex(regex_info); success &= compile_regex(regex_info); @@ -509,15 +502,15 @@ Optional f$preg_match_all(const string &pattern, const string &subject, int64_t entire_match_count{}; RegexInfo regex_info{{pattern.c_str(), pattern.size()}, {subject.c_str(), subject.size()}, {}}; - bool success{valid_regex_flags(flags, regex::PREG_NO_FLAGS, regex::PREG_PATTERN_ORDER, regex::PREG_SET_ORDER, regex::PREG_OFFSET_CAPTURE, - regex::PREG_UNMATCHED_AS_NULL)}; + bool success{valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_PATTERN_ORDER, kphp::regex::PREG_SET_ORDER, + kphp::regex::PREG_OFFSET_CAPTURE, kphp::regex::PREG_UNMATCHED_AS_NULL)}; success &= correct_offset(offset, regex_info.subject); success &= parse_regex(regex_info); success &= compile_regex(regex_info); success &= collect_group_names(regex_info); // pre-init matches in case of pattern order - if (success && !static_cast(flags & regex::PREG_SET_ORDER)) [[likely]] { + if (success && !static_cast(flags & kphp::regex::PREG_SET_ORDER)) [[likely]] { const array init_val{}; for (const auto *group_name : regex_info.group_names) { if (group_name != nullptr) { @@ -552,7 +545,7 @@ Optional f$preg_match_all(const string &pattern, const string &subject, Optional f$preg_replace(const string &pattern, const string &replacement, const string &subject, int64_t limit, int64_t &count) noexcept { count = 0; - if (limit < 0 && limit != regex::PREG_REPLACE_NOLIMIT) [[unlikely]] { + if (limit < 0 && limit != kphp::regex::PREG_REPLACE_NOLIMIT) [[unlikely]] { php_warning("invalid limit %" PRIi64 " in preg_replace", limit); return {}; } @@ -577,7 +570,7 @@ Optional f$preg_replace(const string &pattern, const string &replacement bool success{parse_regex(regex_info)}; success &= compile_regex(regex_info); - success &= replace_regex(regex_info, limit == regex::PREG_REPLACE_NOLIMIT ? std::numeric_limits::max() : static_cast(limit)); + success &= replace_regex(regex_info, limit == kphp::regex::PREG_REPLACE_NOLIMIT ? std::numeric_limits::max() : static_cast(limit)); if (!success) [[unlikely]] { return {}; } @@ -587,7 +580,7 @@ Optional f$preg_replace(const string &pattern, const string &replacement Optional f$preg_replace(const mixed &pattern, const string &replacement, const string &subject, int64_t limit, int64_t &count) noexcept { count = 0; - if (!valid_preg_replace_mixed(pattern)) [[unlikely]] { + if (!regex_impl_::valid_preg_replace_mixed(pattern)) [[unlikely]] { return {}; } @@ -613,7 +606,7 @@ Optional f$preg_replace(const mixed &pattern, const string &replacement, Optional f$preg_replace(const mixed &pattern, const mixed &replacement, const string &subject, int64_t limit, int64_t &count) noexcept { count = 0; - if (!valid_preg_replace_mixed(pattern) || !valid_preg_replace_mixed(replacement)) [[unlikely]] { + if (!regex_impl_::valid_preg_replace_mixed(pattern) || !regex_impl_::valid_preg_replace_mixed(replacement)) [[unlikely]] { return {}; } @@ -652,7 +645,8 @@ Optional f$preg_replace(const mixed &pattern, const mixed &replacement, mixed f$preg_replace(const mixed &pattern, const mixed &replacement, const mixed &subject, int64_t limit, int64_t &count) noexcept { count = 0; - if (!valid_preg_replace_mixed(pattern) || !valid_preg_replace_mixed(replacement) || !valid_preg_replace_mixed(subject)) [[unlikely]] { + if (!regex_impl_::valid_preg_replace_mixed(pattern) || !regex_impl_::valid_preg_replace_mixed(replacement) || !regex_impl_::valid_preg_replace_mixed(subject)) + [[unlikely]] { return {}; } diff --git a/runtime-light/stdlib/string/regex-functions.h b/runtime-light/stdlib/string/regex-functions.h index 71acb2843e..a04826528e 100644 --- a/runtime-light/stdlib/string/regex-functions.h +++ b/runtime-light/stdlib/string/regex-functions.h @@ -4,12 +4,16 @@ #pragma once +#include #include +#include +#include #include "runtime-common/core/runtime-core.h" +#include "runtime-light/coroutine/task.h" #include "runtime-light/stdlib/string/regex-state.h" -namespace regex { +namespace kphp::regex { inline constexpr int64_t PREG_NO_ERROR = 0; inline constexpr int64_t PREG_INTERNAL_ERROR = 1; @@ -29,68 +33,155 @@ inline constexpr auto PREG_UNMATCHED_AS_NULL = static_cast(1U << 6U); inline constexpr int64_t PREG_REPLACE_NOLIMIT = -1; -} // namespace regex +} // namespace kphp::regex + +namespace regex_impl_ { + +inline bool valid_preg_replace_mixed(const mixed ¶m) noexcept { + if (!param.is_array() && !param.is_string()) [[unlikely]] { + php_warning("invalid parameter: expected to be string or array"); + return false; + } + return true; +} + +} // namespace regex_impl_ using regexp = string; +// === preg_match ================================================================================= + Optional f$preg_match(const string &pattern, const string &subject, mixed &matches = RegexInstanceState::get().default_matches, - int64_t flags = regex::PREG_NO_FLAGS, int64_t offset = 0) noexcept; + int64_t flags = kphp::regex::PREG_NO_FLAGS, int64_t offset = 0) noexcept; + +// === preg_match_all ============================================================================= Optional f$preg_match_all(const string &pattern, const string &subject, mixed &matches = RegexInstanceState::get().default_matches, - int64_t flags = regex::PREG_NO_FLAGS, int64_t offset = 0) noexcept; - -/* - * PHP's implementation of preg_replace doesn't replace some errors. For example, consider replacement containing - * back reference $123. It cannot be found in the pattern, but PHP doesn't report it as error, it just treats such - * back reference as an empty string. Our implementation warns user about such error and returns null. - */ -Optional f$preg_replace(const string &pattern, const string &replacement, const string &subject, int64_t limit = regex::PREG_REPLACE_NOLIMIT, + int64_t flags = kphp::regex::PREG_NO_FLAGS, int64_t offset = 0) noexcept; + +// === preg_replace =============================================================================== + +Optional f$preg_replace(const string &pattern, const string &replacement, const string &subject, int64_t limit = kphp::regex::PREG_REPLACE_NOLIMIT, int64_t &count = RegexInstanceState::get().default_preg_replace_count) noexcept; -Optional f$preg_replace(const mixed &pattern, const string &replacement, const string &subject, int64_t limit = regex::PREG_REPLACE_NOLIMIT, +Optional f$preg_replace(const mixed &pattern, const string &replacement, const string &subject, int64_t limit = kphp::regex::PREG_REPLACE_NOLIMIT, int64_t &count = RegexInstanceState::get().default_preg_replace_count) noexcept; -Optional f$preg_replace(const mixed &pattern, const mixed &replacement, const string &subject, int64_t limit = regex::PREG_REPLACE_NOLIMIT, +Optional f$preg_replace(const mixed &pattern, const mixed &replacement, const string &subject, int64_t limit = kphp::regex::PREG_REPLACE_NOLIMIT, int64_t &count = RegexInstanceState::get().default_preg_replace_count) noexcept; -mixed f$preg_replace(const mixed &pattern, const mixed &replacement, const mixed &subject, int64_t limit = regex::PREG_REPLACE_NOLIMIT, +mixed f$preg_replace(const mixed &pattern, const mixed &replacement, const mixed &subject, int64_t limit = kphp::regex::PREG_REPLACE_NOLIMIT, int64_t &count = RegexInstanceState::get().default_preg_replace_count) noexcept; template> -auto f$preg_replace(const T1 ®ex, const T2 &replace_val, const T3 &subject, int64_t limit = regex::PREG_REPLACE_NOLIMIT, +auto f$preg_replace(const T1 ®ex, const T2 &replace_val, const T3 &subject, int64_t limit = kphp::regex::PREG_REPLACE_NOLIMIT, int64_t &count = RegexInstanceState::get().default_preg_replace_count) noexcept { return f$preg_replace(regex, replace_val, subject.val(), limit, count); } -template> -auto f$preg_replace_callback(const T1 ®ex, const T2 &replace_val, const T3 &subject, int64_t limit = -1, - int64_t &replace_count = RegexInstanceState::get().default_preg_replace_count) { - return f$preg_replace_callback(regex, replace_val, subject.val(), limit, replace_count); +// === preg_replace_callback ====================================================================== + +template> F> +task_t> f$preg_replace_callback(string pattern, F callback, string subject, int64_t limit = kphp::regex::PREG_REPLACE_NOLIMIT, + int64_t &count = RegexInstanceState::get().default_preg_replace_count, + int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept { + static_assert(std::same_as>, string>); + // the performance of this function can be enhanced: + // 1. don't use public f$preg_match and f$preg_replace; + // 2. use match_regex and replace_regex directly; + // 3. reuse match_data from match_regex in replace_regex. + array matches{}; + { // fill matches array or early return + mixed mixed_matches{}; + const auto match_result{f$preg_match(pattern, subject, mixed_matches, flags, 0)}; + if (!match_result.has_value()) [[unlikely]] { + co_return Optional{}; + } else if (match_result.val() == 0) { // no matches, so just return the subject + co_return std::move(subject); + } + + matches = array{mixed_matches.as_array().size()}; + for (auto &elem : mixed_matches.as_array()) { + matches.set_value(elem.get_key(), std::move(elem.get_value().as_string())); + } + } + + string replacement{}; + if constexpr (is_async_function_v>) { + replacement = co_await std::invoke(callback, std::move(matches)); + } else { + replacement = std::invoke(callback, std::move(matches)); + } + + co_return f$preg_replace(pattern, replacement, subject, limit, count); } -template -Optional f$preg_replace_callback(const regexp & /*unused*/, const T & /*unused*/, const string & /*unused*/, int64_t /*unused*/ = -1, - int64_t & /*unused*/ = RegexInstanceState::get().default_preg_replace_count) { - php_critical_error("call to unsupported function"); +template +task_t> f$preg_replace_callback(mixed pattern, F &&callback, string subject, int64_t limit = kphp::regex::PREG_REPLACE_NOLIMIT, + int64_t &count = RegexInstanceState::get().default_preg_replace_count, + int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept { + if (!regex_impl_::valid_preg_replace_mixed(pattern)) [[unlikely]] { + co_return Optional{}; + } + + if (pattern.is_string()) { + co_return co_await f$preg_replace_callback(std::move(pattern.as_string()), std::forward(callback), std::move(subject), limit, count, flags); + } + + string result{subject}; + const auto &pattern_arr{pattern.as_array()}; + for (const auto &it : pattern_arr) { + int64_t replace_one_count{}; + if (auto replace_result{co_await f$preg_replace_callback(it.get_value().to_string(), callback, std::move(result), limit, replace_one_count, flags)}; + replace_result.has_value()) [[likely]] { + count += replace_one_count; + result = std::move(replace_result.val()); + } else { + count = 0; + co_return Optional{}; + } + } + + co_return std::move(result); } -template -mixed f$preg_replace_callback(const regexp & /*unused*/, const T & /*unused*/, const mixed & /*unused*/, int64_t /*unused*/ = -1, - int64_t & /*unused*/ = RegexInstanceState::get().default_preg_replace_count) { - php_critical_error("call to unsupported function"); +template +task_t f$preg_replace_callback(mixed pattern, F &&callback, mixed subject, int64_t limit = kphp::regex::PREG_REPLACE_NOLIMIT, + int64_t &count = RegexInstanceState::get().default_preg_replace_count, + int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept { + if (!regex_impl_::valid_preg_replace_mixed(pattern) || !regex_impl_::valid_preg_replace_mixed(subject)) [[unlikely]] { + co_return mixed{}; + } + + if (subject.is_string()) { + co_return co_await f$preg_replace_callback(std::move(pattern), std::forward(callback), std::move(subject.as_string()), limit, count, flags); + } + + const auto &subject_arr{subject.as_array()}; + array result{subject_arr.size()}; + for (const auto &it : subject_arr) { + int64_t replace_one_count{}; + if (auto replace_result{co_await f$preg_replace_callback(pattern, callback, it.get_value().to_string(), limit, replace_one_count, flags)}; + replace_result.has_value()) [[likely]] { + count += replace_one_count; + result.set_value(it.get_key(), std::move(replace_result.val())); + } else { + count = 0; + co_return mixed{}; + } + } + + co_return std::move(result); } -template -Optional f$preg_replace_callback(const mixed & /*unused*/, const T & /*unused*/, const string & /*unused*/, int64_t /*unused*/ = -1, - int64_t & /*unused*/ = RegexInstanceState::get().default_preg_replace_count) { - php_critical_error("call to unsupported function"); +template> +auto f$preg_replace_callback(T1 &&pattern, T2 &&callback, T3 &&subject, int64_t limit = kphp::regex::PREG_REPLACE_NOLIMIT, + int64_t &count = RegexInstanceState::get().default_preg_replace_count, int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept + -> decltype(f$preg_replace_callback(std::forward(pattern), std::forward(callback), std::forward(subject).val(), limit, count, flags)) { + co_return co_await f$preg_replace_callback(std::forward(pattern), std::forward(callback), std::forward(subject).val(), limit, count, flags); } -template -mixed f$preg_replace_callback(const mixed & /*unused*/, const T & /*unused*/, const mixed & /*unused*/, int64_t /*unused*/ = -1, - int64_t & /*unused*/ = RegexInstanceState::get().default_preg_replace_count) { - php_critical_error("call to unsupported function"); -} +// === preg_split ================================================================================= inline Optional> f$preg_split(const string & /*unused*/, const string & /*unused*/, int64_t /*unused*/ = -1, int64_t /*unused*/ = 0) { php_critical_error("call to unsupported function"); diff --git a/tests/phpt/regexp/001_string_result.php b/tests/phpt/regexp/001_string_result.php index 9cd7b310c5..18fd67e5c0 100644 --- a/tests/phpt/regexp/001_string_result.php +++ b/tests/phpt/regexp/001_string_result.php @@ -1,4 +1,4 @@ -@ok k2_skip +@ok