Skip to content

Commit

Permalink
fix: Incorrect results of CA conversion functions for multibyte UTF-8…
Browse files Browse the repository at this point in the history
… codepoints
  • Loading branch information
slavek-kucera authored Jan 23, 2024
1 parent 0b4f46d commit ad91834
Show file tree
Hide file tree
Showing 7 changed files with 57 additions and 39 deletions.
1 change: 1 addition & 0 deletions clients/vscode-hlasmplugin/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#### Fixed
- Inconsistent completion list with implicitly defined private CSECT
- Incorrect results of string comparison in CA expressions
- Incorrect results of CA conversion functions for multibyte UTF-8 codepoints

## [1.11.1](https://github.com/eclipse-che4z/che-che4z-lsp-for-hlasm/compare/1.11.0...1.11.1) (2023-12-04)

Expand Down
17 changes: 9 additions & 8 deletions parser_library/src/ebcdic_encoding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,26 +16,26 @@

namespace hlasm_plugin::parser_library {

std::pair<unsigned char, const char*> ebcdic_encoding::to_ebcdic_multibyte(const char* c) noexcept
std::pair<unsigned char, const char*> ebcdic_encoding::to_ebcdic_multibyte(const char* c, const char* const ce) noexcept
{
const unsigned char first_byte = *(c + 0);
const unsigned char second_byte = *(c + 1);
if (second_byte == 0)
if (c + 1 == ce)
{
return { EBCDIC_SUB, c + 1 };
}
const unsigned char second_byte = *(c + 1);

if ((first_byte & 0xE0) == 0xC0) // 110xxxxx 10xxxxxx
{
const auto value = ((first_byte & 0x1F) << 6) | (second_byte & 0x3F);
return { value < std::ssize(a2e) ? a2e[value] : EBCDIC_SUB, c + 2 };
}

const unsigned char third_byte = *(c + 2);
if (third_byte == 0)
if (c + 2 == ce)
{
return { EBCDIC_SUB, c + 2 };
}
const unsigned char third_byte = *(c + 2);

if (first_byte == (0b11100000 | ebcdic_encoding::unicode_private >> 4)
&& (second_byte & 0b11111100) == (0x80 | (ebcdic_encoding::unicode_private & 0xF) << 2)
Expand All @@ -50,7 +50,7 @@ std::pair<unsigned char, const char*> ebcdic_encoding::to_ebcdic_multibyte(const
return { EBCDIC_SUB, c + 3 };
}

if (const unsigned char fourth_byte = *(c + 2); fourth_byte == 0)
if (c + 3 == ce)
{
return { EBCDIC_SUB, c + 3 };
}
Expand All @@ -76,9 +76,10 @@ std::string ebcdic_encoding::to_ebcdic(const std::string& s)
{
std::string a;
a.reserve(s.length());
for (const char* i = s.c_str(); *i != 0;)
const auto end = std::to_address(s.end());
for (const char* i = s.data(); i != end;)
{
const auto [ch, newi] = to_ebcdic(i);
const auto [ch, newi] = to_ebcdic(i, end);
a.push_back(ch);
i = newi;
}
Expand Down
6 changes: 3 additions & 3 deletions parser_library/src/ebcdic_encoding.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ namespace hlasm_plugin::parser_library {
// Class that provides support for EBCDIC encoding.
class ebcdic_encoding
{
static std::pair<unsigned char, const char*> to_ebcdic_multibyte(const char* c) noexcept;
static std::pair<unsigned char, const char*> to_ebcdic_multibyte(const char* c, const char* ce) noexcept;

// clang-format off
// IBM037 - CR,LF need to be handled separately via private plane
Expand Down Expand Up @@ -74,14 +74,14 @@ class ebcdic_encoding
// Converts an ASCII character to EBCDIC character.
static constexpr unsigned char to_ebcdic(unsigned char c) noexcept { return a2e[c]; }
// Converts an UTF-8 character to EBCDIC character.
static constexpr std::pair<unsigned char, const char*> to_ebcdic(const char* c) noexcept
static constexpr std::pair<unsigned char, const char*> to_ebcdic(const char* c, const char* const ce) noexcept
{
if (const unsigned char first = *c; first < 0x80) [[likely]] // 0xxxxxxx
{
return { a2e[first], c + 1 };
}
else
return to_ebcdic_multibyte(c);
return to_ebcdic_multibyte(c, ce);
}

// Converts UTF-8 string to EBCDIC string.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -303,8 +303,8 @@ std::strong_ordering ca_function_binary_operator::compare_string(

while (l != le && r != re)
{
const auto [lc, newl] = ebcdic_encoding::to_ebcdic(l);
const auto [rc, newr] = ebcdic_encoding::to_ebcdic(r);
const auto [lc, newl] = ebcdic_encoding::to_ebcdic(l, le);
const auto [rc, newr] = ebcdic_encoding::to_ebcdic(r, re);

const auto left_update = !right_smaller && lc < rc;
const auto right_update = !left_smaller && lc > rc;
Expand Down Expand Up @@ -335,8 +335,8 @@ bool ca_function_binary_operator::equal_string(const context::C_t& lhs, const co

while (l != le && r != re)
{
const auto [lc, newl] = ebcdic_encoding::to_ebcdic(l);
const auto [rc, newr] = ebcdic_encoding::to_ebcdic(r);
const auto [lc, newl] = ebcdic_encoding::to_ebcdic(l, le);
const auto [rc, newr] = ebcdic_encoding::to_ebcdic(r, re);

if (lc != rc)
return false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -230,22 +230,21 @@ context::SET_t ca_function::C2A(std::string_view param, diagnostic_adder& add_di
if (param.empty())
return 0;

static constexpr const size_t limit = 4;
if (param.size() > limit)
RET_ERRPARM;

char buffer[limit + 1] = {};
std::memcpy(buffer, param.data(), param.size());
const char* c = std::to_address(param.cbegin());
const auto ce = std::to_address(param.cend());

context::A_t ret = 0;
for (const char* c = buffer; c != buffer + param.size();)
for (auto n = 0; n < 4 && c != ce; ++n)
{
ret <<= 8;
const auto [ch, newc] = ebcdic_encoding::to_ebcdic(c);
const auto [ch, newc] = ebcdic_encoding::to_ebcdic(c, ce);
ret += ch;
c = newc;
}

if (c != ce)
RET_ERRPARM;

return ret;
}

Expand Down Expand Up @@ -496,17 +495,21 @@ context::SET_t ca_function::C2B(const context::C_t& param, diagnostic_adder& add
if (param.empty())
return "";

if (param.size() * 8 > ca_string::MAX_STR_SIZE)
RET_ERRPARM;
const char* c = std::to_address(param.cbegin());
const auto ce = std::to_address(param.cend());

std::string ret;
ret.reserve(param.size() * 8);
for (const char* c = param.c_str(); c != param.c_str() + param.size();)
ret.reserve(std::min(param.size() * 8, ca_string::MAX_STR_SIZE));
while (c != ce && ret.size() + 8 <= ca_string::MAX_STR_SIZE)
{
const auto [value, newc] = ebcdic_encoding::to_ebcdic(c);
const auto [value, newc] = ebcdic_encoding::to_ebcdic(c, ce);
ret.append(std::bitset<8>(value).to_string());
c = newc;
}

if (c != ce)
RET_ERRPARM;

return ret;
}

Expand All @@ -523,20 +526,24 @@ context::SET_t ca_function::C2X(const context::C_t& param, diagnostic_adder& add
if (param.empty())
return "";

if (param.size() * 2 > ca_string::MAX_STR_SIZE)
RET_ERRPARM;
const char* c = std::to_address(param.cbegin());
const auto ce = std::to_address(param.cend());

std::string ret;
ret.reserve(param.size() * 2);
for (const char* c = param.c_str(); c != param.c_str() + param.size();)
ret.reserve(std::min(param.size() * 2, ca_string::MAX_STR_SIZE));
while (c != ce && ret.size() + 2 <= ca_string::MAX_STR_SIZE)
{
const auto [value, newc] = ebcdic_encoding::to_ebcdic(c);
const auto [value, newc] = ebcdic_encoding::to_ebcdic(c, ce);

ret.push_back("0123456789ABCDEF"[value >> 4]);
ret.push_back("0123456789ABCDEF"[value & 0xf]);

c = newc;
}

if (c != ce)
RET_ERRPARM;

return ret;
}

Expand Down
7 changes: 7 additions & 0 deletions parser_library/test/expressions/ca_function_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,13 @@ INSTANTIATE_TEST_SUITE_P(func_parameters_suite,
func_test_param { ca_expr_funcs::C2A, { "+" }, 78, false, "C2A_padding" },
func_test_param { ca_expr_funcs::C2A, { "0000" }, -252645136, false, "C2A_whole" },
func_test_param { ca_expr_funcs::C2A, { "00001" }, {}, true, "C2A_exceeds" },
func_test_param {
ca_expr_funcs::C2A,
{ (const char*)u8"\U0001F600\U0001F600\U0001F600\U0001F600" },
0x3F3F3F3F,
false,
"C2A_multibyte",
},

func_test_param { ca_expr_funcs::D2A, { "" }, 0, false, "D2A_empty" },
func_test_param { ca_expr_funcs::D2A, { "000" }, 0, false, "D2A_zeros" },
Expand Down
14 changes: 8 additions & 6 deletions parser_library/test/utf_conv_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
* Broadcom, Inc. - initial API and implementation
*/

#include <memory>
#include <string>

#include "gtest/gtest.h"
Expand Down Expand Up @@ -75,25 +76,26 @@ TEST(ebcdic_encoding, unicode)

u8.insert(u8.end(), (unsigned char)0x41);

//ä
// ä
u8.insert(u8.end(), (unsigned char)0xC3);
u8.insert(u8.end(), (unsigned char)0xA4);

auto begin = u8.c_str();
const auto end = std::to_address(u8.end());

EXPECT_EQ(ebcdic_encoding::to_ebcdic(begin), std::pair(ebcdic_encoding::EBCDIC_SUB, begin + 4));
EXPECT_EQ(ebcdic_encoding::to_ebcdic(begin, end), std::pair(ebcdic_encoding::EBCDIC_SUB, begin + 4));
begin += 4;

EXPECT_EQ(ebcdic_encoding::to_ebcdic(begin), std::pair(ebcdic_encoding::EBCDIC_SUB, begin + 3));
EXPECT_EQ(ebcdic_encoding::to_ebcdic(begin, end), std::pair(ebcdic_encoding::EBCDIC_SUB, begin + 3));
begin += 3;

EXPECT_EQ(ebcdic_encoding::to_ebcdic(begin), std::pair(ebcdic_encoding::EBCDIC_SUB, begin + 2));
EXPECT_EQ(ebcdic_encoding::to_ebcdic(begin, end), std::pair(ebcdic_encoding::EBCDIC_SUB, begin + 2));
begin += 2;

EXPECT_EQ(ebcdic_encoding::to_ebcdic(begin), std::pair((unsigned char)0xC1, begin + 1));
EXPECT_EQ(ebcdic_encoding::to_ebcdic(begin, end), std::pair((unsigned char)0xC1, begin + 1));
begin += 1;

EXPECT_EQ(ebcdic_encoding::to_ebcdic(begin), std::pair((unsigned char)0x43, begin + 2));
EXPECT_EQ(ebcdic_encoding::to_ebcdic(begin, end), std::pair((unsigned char)0x43, begin + 2));
begin += 2;

EXPECT_EQ(begin, std::to_address(u8.end()));
Expand Down

0 comments on commit ad91834

Please sign in to comment.