86 lines
2.5 KiB
C++
86 lines
2.5 KiB
C++
//===--- BuildConfusableTable.cpp - clang-tidy---------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "llvm/ADT/STLExtras.h"
|
|
#include "llvm/ADT/StringExtras.h"
|
|
#include "llvm/Support/ConvertUTF.h"
|
|
#include "llvm/Support/MemoryBuffer.h"
|
|
#include "llvm/Support/raw_ostream.h"
|
|
|
|
using namespace llvm;
|
|
|
|
int main(int argc, char *argv[]) {
|
|
auto ErrorOrBuffer = MemoryBuffer::getFile(argv[1], true);
|
|
if (!ErrorOrBuffer)
|
|
return 1;
|
|
std::unique_ptr<MemoryBuffer> Buffer = std::move(ErrorOrBuffer.get());
|
|
StringRef Content = Buffer->getBuffer();
|
|
Content = Content.drop_until([](char c) { return c == '#'; });
|
|
SmallVector<StringRef> Lines;
|
|
SplitString(Content, Lines, "\r\n");
|
|
|
|
std::vector<std::pair<llvm::UTF32, SmallVector<llvm::UTF32>>> Entries;
|
|
SmallVector<StringRef> Values;
|
|
for (StringRef Line : Lines) {
|
|
if (Line.startswith("#"))
|
|
continue;
|
|
|
|
Values.clear();
|
|
Line.split(Values, ';');
|
|
if (Values.size() < 2) {
|
|
errs() << "Failed to parse: " << Line << "\n";
|
|
return 2;
|
|
}
|
|
|
|
llvm::StringRef From = Values[0].trim();
|
|
llvm::UTF32 CodePoint;
|
|
From.getAsInteger(16, CodePoint);
|
|
|
|
SmallVector<llvm::UTF32> To;
|
|
SmallVector<StringRef> ToN;
|
|
Values[1].split(ToN, ' ', -1, false);
|
|
for (StringRef To_ : ToN) {
|
|
llvm::UTF32 ToCodePoint;
|
|
To_.trim().getAsInteger(16, ToCodePoint);
|
|
To.push_back(ToCodePoint);
|
|
}
|
|
// Sentinel
|
|
To.push_back(0);
|
|
|
|
Entries.emplace_back(CodePoint, To);
|
|
}
|
|
llvm::sort(Entries);
|
|
|
|
unsigned LargestValue =
|
|
std::max_element(Entries.begin(), Entries.end(),
|
|
[](const auto &Entry0, const auto &Entry1) {
|
|
return Entry0.second.size() < Entry1.second.size();
|
|
})
|
|
->second.size();
|
|
|
|
std::error_code ec;
|
|
llvm::raw_fd_ostream os(argv[2], ec);
|
|
|
|
// FIXME: If memory consumption and/or lookup time becomes a constraint, it
|
|
// maybe worth using a more elaborate data structure.
|
|
os << "struct {llvm::UTF32 codepoint; llvm::UTF32 values[" << LargestValue
|
|
<< "];} "
|
|
"ConfusableEntries[] = {\n";
|
|
for (const auto &Values : Entries) {
|
|
os << " { ";
|
|
os << Values.first;
|
|
os << ", {";
|
|
for (auto CP : Values.second)
|
|
os << CP << ", ";
|
|
|
|
os << "}},\n";
|
|
}
|
|
os << "};\n";
|
|
return 0;
|
|
}
|