feat: initial commit
@@ -0,0 +1,25 @@
root = true

[*.cs]
charset = utf-8
indent_style = space
indent_size = 4
end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true

[*.{csproj,props,slnx}]
charset = utf-8
indent_style = space
indent_size = 2
end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true

[*.{ts,tsx,js,json}]
charset = utf-8
indent_style = space
indent_size = 2
end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true
@@ -0,0 +1,41 @@
name: CI

on:
  push:
  pull_request:

jobs:
  ci:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - name: Setup pnpm
        uses: pnpm/action-setup@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: 24
          cache: pnpm

      - name: Setup .NET
        uses: actions/setup-dotnet@v4
        with:
          dotnet-version: 10.0.x

      - name: Install dependencies
        run: pnpm install --frozen-lockfile

      - name: Typecheck
        run: pnpm typecheck

      - name: Lint
        run: pnpm lint

      - name: Test (TypeScript)
        run: pnpm test

      - name: Test (.NET)
        run: pnpm test:dotnet
@@ -0,0 +1,141 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*

# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage
*.lcov

# nyc test coverage
.nyc_output

# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt

# Bower dependency directory (https://bower.io/)
bower_components

# node-waf configuration
.lock-wscript

# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release

# Dependency directories
node_modules/
jspm_packages/

# Snowpack dependency directory (https://snowpack.dev/)
web_modules/

# TypeScript cache
*.tsbuildinfo

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Optional stylelint cache
.stylelintcache

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variable files
.env
.env.*
!.env.example

# parcel-bundler cache (https://parceljs.org/)
.cache
.parcel-cache

# Next.js build output
.next
out

# Nuxt.js build / generate output
.nuxt
dist
.output

# Gatsby files
.cache/
# Comment in the public line in if your project uses Gatsby and not Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public

# vuepress build output
.vuepress/dist

# vuepress v2.x temp and cache directory
.temp
.cache

# Sveltekit cache directory
.svelte-kit/

# vitepress build output
**/.vitepress/dist

# vitepress cache directory
**/.vitepress/cache

# Docusaurus cache and generated files
.docusaurus

# Serverless directories
.serverless/

# FuseBox cache
.fusebox/

# DynamoDB Local files
.dynamodb/

# Firebase cache directory
.firebase/

# TernJS port file
.tern-port

# Stores VSCode versions used for testing VSCode extensions
.vscode-test

# yarn v3
.pnp.*
.yarn/*
!.yarn/patches
!.yarn/plugins
!.yarn/releases
!.yarn/sdks
!.yarn/versions

# Vite files
vite.config.js.timestamp-*
vite.config.ts.timestamp-*
.vite/
@@ -0,0 +1,18 @@
{
  "editor.tabSize": 4,
  "dotnet.defaultSolution": "dotnet/MaigoLabs.NeedLe.slnx",
  "files.associations": {
    "*.slnx": "xml"
  },
  "eslint.useFlatConfig": true,
  "editor.codeActionsOnSave": {
    "source.fixAll.eslint": "explicit"
  },
  "eslint.rules.customizations": [
    {
      "rule": "*",
      "severity": "warn"
    }
  ],
  "eslint.validate": ["javascript", "javascriptreact", "typescript", "typescriptreact", "vue"]
}
@@ -0,0 +1,661 @@
                    GNU AFFERO GENERAL PUBLIC LICENSE
                       Version 3, 19 November 2007

 Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.

                            Preamble

  The GNU Affero General Public License is a free, copyleft license for
software and other kinds of works, specifically designed to ensure
cooperation with the community in the case of network server software.

  The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast,
our General Public Licenses are intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users.

  When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.

  Developers that use our General Public Licenses protect your rights
with two steps: (1) assert copyright on the software, and (2) offer
you this License which gives you legal permission to copy, distribute
and/or modify the software.

  A secondary benefit of defending all users' freedom is that
improvements made in alternate versions of the program, if they
receive widespread use, become available for other developers to
incorporate. Many developers of free software are heartened and
encouraged by the resulting cooperation. However, in the case of
software used on network servers, this result may fail to come about.
The GNU General Public License permits making a modified version and
letting the public access it on a server without ever releasing its
source code to the public.

  The GNU Affero General Public License is designed specifically to
ensure that, in such cases, the modified source code becomes available
to the community. It requires the operator of a network server to
provide the source code of the modified version running there to the
users of that server. Therefore, public use of a modified version, on
a publicly accessible server, gives the public access to the source
code of the modified version.

  An older license, called the Affero General Public License and
published by Affero, was designed to accomplish similar goals. This is
a different license, not a version of the Affero GPL, but Affero has
released a new version of the Affero GPL which permits relicensing under
this license.

  The precise terms and conditions for copying, distribution and
modification follow.

                       TERMS AND CONDITIONS

  0. Definitions.

  "This License" refers to version 3 of the GNU Affero General Public License.

  "Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.

  "The Program" refers to any copyrightable work licensed under this
License. Each licensee is addressed as "you". "Licensees" and
"recipients" may be individuals or organizations.

  To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy. The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.

  A "covered work" means either the unmodified Program or a work based
on the Program.

  To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy. Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.

  To "convey" a work means any kind of propagation that enables other
parties to make or receive copies. Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.

  An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License. If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.

  1. Source Code.

  The "source code" for a work means the preferred form of the work
for making modifications to it. "Object code" means any non-source
form of a work.

  A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.

  The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form. A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.

  The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities. However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work. For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.

  The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.

  The Corresponding Source for a work in source code form is that
same work.

  2. Basic Permissions.

  All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met. This License explicitly affirms your unlimited
permission to run the unmodified Program. The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work. This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.

  You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force. You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright. Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.

  Conveying under any other circumstances is permitted solely under
the conditions stated below. Sublicensing is not allowed; section 10
makes it unnecessary.

  3. Protecting Users' Legal Rights From Anti-Circumvention Law.

  No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.

  When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.

  4. Conveying Verbatim Copies.

  You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.

  You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.

  5. Conveying Modified Source Versions.

  You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:

    a) The work must carry prominent notices stating that you modified
    it, and giving a relevant date.

    b) The work must carry prominent notices stating that it is
    released under this License and any conditions added under section
    7. This requirement modifies the requirement in section 4 to
    "keep intact all notices".

    c) You must license the entire work, as a whole, under this
    License to anyone who comes into possession of a copy. This
    License will therefore apply, along with any applicable section 7
    additional terms, to the whole of the work, and all its parts,
    regardless of how they are packaged. This License gives no
    permission to license the work in any other way, but it does not
    invalidate such permission if you have separately received it.

    d) If the work has interactive user interfaces, each must display
    Appropriate Legal Notices; however, if the Program has interactive
    interfaces that do not display Appropriate Legal Notices, your
    work need not make them do so.

  A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit. Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.

  6. Conveying Non-Source Forms.

  You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:

    a) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by the
    Corresponding Source fixed on a durable physical medium
    customarily used for software interchange.

    b) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by a
    written offer, valid for at least three years and valid for as
    long as you offer spare parts or customer support for that product
    model, to give anyone who possesses the object code either (1) a
    copy of the Corresponding Source for all the software in the
    product that is covered by this License, on a durable physical
    medium customarily used for software interchange, for a price no
    more than your reasonable cost of physically performing this
    conveying of source, or (2) access to copy the
    Corresponding Source from a network server at no charge.

    c) Convey individual copies of the object code with a copy of the
    written offer to provide the Corresponding Source. This
    alternative is allowed only occasionally and noncommercially, and
    only if you received the object code with such an offer, in accord
    with subsection 6b.

    d) Convey the object code by offering access from a designated
    place (gratis or for a charge), and offer equivalent access to the
    Corresponding Source in the same way through the same place at no
    further charge. You need not require recipients to copy the
    Corresponding Source along with the object code. If the place to
    copy the object code is a network server, the Corresponding Source
    may be on a different server (operated by you or a third party)
    that supports equivalent copying facilities, provided you maintain
    clear directions next to the object code saying where to find the
    Corresponding Source. Regardless of what server hosts the
    Corresponding Source, you remain obligated to ensure that it is
    available for as long as needed to satisfy these requirements.

    e) Convey the object code using peer-to-peer transmission, provided
    you inform other peers where the object code and Corresponding
    Source of the work are being offered to the general public at no
    charge under subsection 6d.

  A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.

  A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling. In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage. For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product. A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.

  "Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source. The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.

  If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information. But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).

  The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed. Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.

  Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.

  7. Additional Terms.

  "Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law. If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.

  When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it. (Additional permissions may be written to require their own
removal in certain cases when you modify the work.) You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.

  Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:

    a) Disclaiming warranty or limiting liability differently from the
    terms of sections 15 and 16 of this License; or

    b) Requiring preservation of specified reasonable legal notices or
    author attributions in that material or in the Appropriate Legal
    Notices displayed by works containing it; or

    c) Prohibiting misrepresentation of the origin of that material, or
    requiring that modified versions of such material be marked in
    reasonable ways as different from the original version; or

    d) Limiting the use for publicity purposes of names of licensors or
    authors of the material; or

    e) Declining to grant rights under trademark law for use of some
    trade names, trademarks, or service marks; or

    f) Requiring indemnification of licensors and authors of that
    material by anyone who conveys the material (or modified versions of
    it) with contractual assumptions of liability to the recipient, for
    any liability that these contractual assumptions directly impose on
    those licensors and authors.

  All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10. If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term. If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.

  If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.

  Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.

  8. Termination.

  You may not propagate or modify a covered work except as expressly
provided under this License. Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).

  However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.

  Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.

  Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License. If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.

  9. Acceptance Not Required for Having Copies.

  You are not required to accept this License in order to receive or
run a copy of the Program. Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance. However,
nothing other than this License grants you permission to propagate or
modify any covered work. These actions infringe copyright if you do
not accept this License. Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.

  10. Automatic Licensing of Downstream Recipients.

  Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License. You are not responsible
for enforcing compliance by third parties with this License.

  An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations. If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.

  You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License. For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.

  11. Patents.

  A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based. The
work thus licensed is called the contributor's "contributor version".

  A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version. For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.

  Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.

  In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement). To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.

  If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients. "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.

  If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.

  A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License. You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.

  Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.

  12. No Surrender of Others' Freedom.

  If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all. For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.

  13. Remote Network Interaction; Use with the GNU General Public License.

  Notwithstanding any other provision of this License, if you modify the
Program, your modified version must prominently offer all users
interacting with it remotely through a computer network (if your version
supports such interaction) an opportunity to receive the Corresponding
Source of your version by providing access to the Corresponding Source
from a network server at no charge, through some standard or customary
means of facilitating copying of software. This Corresponding Source
shall include the Corresponding Source for any work covered by version 3
of the GNU General Public License that is incorporated pursuant to the
following paragraph.

  Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU General Public License into a single
combined work, and to convey the resulting work. The terms of this
License will continue to apply to the part which is the covered work,
but the work with which it is combined will remain governed by version
3 of the GNU General Public License.

  14. Revised Versions of this License.

  The Free Software Foundation may publish revised and/or new versions of
the GNU Affero General Public License from time to time. Such new versions
will be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.

  Each version is given a distinguishing version number. If the
Program specifies that a certain numbered version of the GNU Affero General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation. If the Program does not specify a version number of the
GNU Affero General Public License, you may choose any version ever published
by the Free Software Foundation.

  If the Program specifies that a proxy can decide which future
versions of the GNU Affero General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.

  Later license versions may give you additional or different
permissions. However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.

  15. Disclaimer of Warranty.

  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.

  16. Limitation of Liability.

  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.

  17. Interpretation of Sections 15 and 16.

  If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.

                     END OF TERMS AND CONDITIONS

            How to Apply These Terms to Your New Programs

  If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.

  To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.

    <one line to give the program's name and a brief idea of what it does.>
    Copyright (C) <year> <name of author>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published
    by the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program. If not, see <https://www.gnu.org/licenses/>.

Also add information on how to contact you by electronic and paper mail.

  If your software can interact with users remotely through a computer
network, you should also make sure that it provides a way for users to
get its source. For example, if your program is a web application, its
interface could display a "Source" link that leads users to an archive
of the code. There are many ways you could offer source, and different
solutions will be better for different programs; see section 13 for the
specific requirements.

  You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU AGPL, see
<https://www.gnu.org/licenses/>.
@@ -0,0 +1,20 @@
# needLe

Fuzzy search engine for small text pieces, with Chinese/Japanese pronunciation support.

Available in [TypeScript](./packages/needle) and [C#](./dotnet). Follow the links for detailed documentation.

See also the [in-browser demo](https://needle.maigo.dev).

## Packages

| Platform | Package | Install |
|:--------:|:-------:|:-------:|
| Node.js / Browser | [@maigolabs/needle](https://www.npmjs.com/package/@maigolabs/needle) | `pnpm add @maigolabs/needle` |
| .NET Standard 2.0 | [MaigoLabs.NeedLe](https://www.nuget.org/packages/MaigoLabs.NeedLe) | `dotnet add package MaigoLabs.NeedLe` |
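
## Quick Start

A minimal TypeScript sketch assembled from the demo app in this repository; the kuromoji dictionary loader and CDN URL below are what the demo uses, not requirements of the library. See the package docs linked above for the full API.

```ts
import { TokenizerBuilder } from '@patdx/kuromoji';
import { buildInvertedIndex } from '@maigolabs/needle/indexer';
import { loadInvertedIndex, searchInvertedIndex } from '@maigolabs/needle/searcher';

// Build a kuromoji tokenizer (needed for indexing only, not for searching).
const kuromoji = await new TokenizerBuilder({
  loader: {
    loadArrayBuffer: async (url: string) => {
      url = `https://cdn.jsdelivr.net/npm/@aiktb/kuromoji@1.0.2/dict/${url.replace('.gz', '')}`;
      const res = await fetch(url);
      if (!res.ok) throw new Error(`Failed to fetch ${url}`);
      return await res.arrayBuffer();
    },
  },
}).build();

// Index a few small documents, then load the compressed index and search it.
const compressed = buildInvertedIndex(['needLe', 'Tell Your World'], { kuromoji });
const invertedIndex = loadInvertedIndex(compressed);
const results = searchInvertedIndex(invertedIndex, 'needle');
```

Index building loads kuromoji, OpenCC, and pinyin-pro; searching a prebuilt index requires no external library or dictionary.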

## The Name

The word "needle" comes from the phrase [Needle in a Haystack](https://en.wikipedia.org/wiki/Needle_in_a_haystack). Search tasks normally involve finding a small string (the "needle") in a large string (the "haystack"). This project, however, is designed for searching in small strings (specifically, music names) rather than large ones. We are finding needles in needles.

The capitalized "L" comes from the music name [needLe](https://projectsekai.fandom.com/wiki/NeedLe).
@@ -0,0 +1,12 @@
<!doctype html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>MaigoLabs :: needLe</title>
  </head>
  <body>
    <div id="root"></div>
    <script type="module" src="/src/main.tsx"></script>
  </body>
</html>
@@ -0,0 +1,28 @@
{
  "name": "@maigolabs/needle-demo",
  "version": "1.0.0",
  "type": "module",
  "scripts": {
    "typecheck": "tsc",
    "dev": "vite --port 5172",
    "build": "tsc -b && vite build"
  },
  "license": "AGPL-3.0",
  "packageManager": "pnpm@10.20.0",
  "private": true,
  "dependencies": {
    "@maigolabs/needle": "workspace:*",
    "react": "^19.2.0",
    "react-dom": "^19.2.0"
  },
  "devDependencies": {
    "@iconify-json/svg-spinners": "^1.2.4",
    "@types/node": "^24.10.1",
    "@types/react": "^19.2.5",
    "@types/react-dom": "^19.2.3",
    "@vitejs/plugin-react": "^5.1.1",
    "unocss": "^66.5.12",
    "vite": "^7.2.4",
    "vite-plugin-top-level-await": "^1.6.0"
  }
}
@@ -0,0 +1 @@
/fonts
@@ -0,0 +1,168 @@
import { TokenType } from '@maigolabs/needle/common';
import {
  searchInvertedIndex,
  highlightSearchResult,
  type SearchResult,
} from '@maigolabs/needle/searcher';
import { useState, type FunctionComponent } from 'react';

type Tab = 'search' | 'tokenize';

type AppData = typeof import('./data');

export const Layout: FunctionComponent<{ dataPromise: Promise<AppData> }> = ({ dataPromise }) => {
  const [appData, setAppData] = useState<AppData | null>(null);
  const [error, setError] = useState<string | null>(null);
  // Resolve the dynamically imported data module; the loading state renders until it settles.
  void dataPromise
    .then(props => setAppData(props))
    .catch(error => setError((error instanceof Error ? error.stack : undefined) ?? String(error)));
  return (
    <div className="min-h-screen bg-[#f9f2e0] text-[#8b7355] font-mono selection:bg-[#d4c4b0]/70">
      <div className="max-w-200 mx-auto px-4 pt-8 pb-6">
        <header className="mb-8">
          <h1 className="pb-3 text-2xl text-[#a08060]">MaigoLabs :: needLe</h1>
          <div className="pb-4 text-sm">
            <p>Fuzzy search engine for small text pieces, with Chinese/Japanese pronunciation support</p>
            <p>(Available in TypeScript and C#)</p>
          </div>
          <div className="flex gap-4 text-sm">
            <a href="https://github.com/MaigoLabs/needLe" target="_blank" rel="noopener" className="text-[#b8a890] hover:text-[#8b7355]">[GitHub]</a>
            <a href="https://www.npmjs.com/package/@maigolabs/needle" target="_blank" rel="noopener" className="text-[#b8a890] hover:text-[#8b7355]">[NPM]</a>
            <a href="https://www.nuget.org/packages/MaigoLabs.NeedLe" target="_blank" rel="noopener" className="text-[#b8a890] hover:text-[#8b7355]">[NuGet]</a>
          </div>
        </header>

        {
          appData
            ? <App appData={appData} />
            : error
              ? <div className="text-sm bg-[#efe5d0] px-4 py-3 rounded-lg whitespace-pre-wrap">{error}</div>
              : <div>
                  <div className="flex flex-row items-center gap-2"><div className="i-svg-spinners:ring-resize" /> Loading...</div>
                  <div className="mt-6 text-sm bg-[#efe5d0] px-4 py-3 rounded-lg">
                    <div className="font-bold mb-2">Tips:</div>
                    <div>This demo loads Kuromoji/OpenCC/pinyin-pro for tokenization and index building.</div>
                    <div>However, searching on a prebuilt index doesn't require loading any external library/dictionary.</div>
                  </div>
                </div>
        }
      </div>
    </div>
  );
};

interface AppProps {
  appData: AppData;
}

export const App: FunctionComponent<AppProps> = ({ appData: { kuromoji, createTokenizer, invertedIndex } }) => {
  const [input, setInput] = useState('');
  const [tab, setTab] = useState<Tab>('search');

  const searchResults = tab === 'search' && input.trim()
    ? searchInvertedIndex(invertedIndex, input).slice(0, 50)
    : [];

  const tokenizeResults = tab === 'tokenize' && input.trim()
    ? (() => {
        const tokenizer = createTokenizer({ kuromoji });
        const tokens = tokenizer.tokenize(input);
        const tokenDefs = tokenizer.tokens;
        const codePoints = [...input];
        // Resolve each token id back to its definition and the original input slice.
        return tokens.map(t => {
          const def = [...tokenDefs.values()].find(d => d.id === t.id)!;
          const original = codePoints.slice(t.start, t.end).join('');
          return { ...t, type: def.type, text: def.text, original };
        });
      })()
    : [];

  return (
    <>
      <input
        type="text"
        value={input}
        onChange={e => setInput(e.target.value)}
        placeholder={`Type something to ${tab}...`}
        className="w-full bg-[#efe5d0] text-[#6b5a48] px-3 py-2 mb-2 outline-none placeholder-[#b8a890] rounded-lg"
      />

      <div className="flex gap-4 mb-6 text-sm">
        <button
          onClick={() => setTab('search')}
          className={`bg-transparent border-none cursor-pointer ${tab === 'search' ? 'text-[#6b5a48]' : 'text-[#c0b0a0]'}`}
        >
          Search
        </button>
        <button
          onClick={() => setTab('tokenize')}
          className={`bg-transparent border-none cursor-pointer ${tab === 'tokenize' ? 'text-[#6b5a48]' : 'text-[#c0b0a0]'}`}
        >
          Tokenize
        </button>
      </div>

      <div className="space-y-2">
        {tab === 'search' && searchResults.map((result, i) => (
          <SearchResultItem key={i} result={result} input={input} />
        ))}

        {tab === 'tokenize' && tokenizeResults.length > 0 && (
          <div className="grid grid-cols-[repeat(auto-fill,minmax(280px,1fr))] gap-1">
            {tokenizeResults.map((token, i) => (
              <div key={i} className="bg-[#efe5d0] px-3 py-2 text-sm truncate rounded-lg">
                <span className="text-[#a08060]">{TokenType[token.type]}: </span>
                <span className="text-[#6b5a48]">{JSON.stringify(token.text)}</span>
                <span className="text-[#c0b0a0]">{' <- '}</span>
                <span className="text-[#8b7355]">{JSON.stringify(token.original)}</span>
                <span className="text-[#c8bba8]">{` [${token.start}, ${token.end}]`}</span>
              </div>
            ))}
          </div>
        )}

        {input.trim() && tab === 'search' && searchResults.length === 0 && (
          <div className="text-[#b8a890] text-sm">No results.</div>
        )}
      </div>
    </>
  );
};

const SearchResultItem: FunctionComponent<{ result: SearchResult; input: string }> = ({ result, input }) => {
  const highlighted = highlightSearchResult(result);
  const inputCodePoints = [...input];

  const stats = [
    `${result.rangeCount} range(s)`,
    `${Math.round(result.matchRatio * 100)}%`,
    result.prefixMatchCount > 0 ? `${result.prefixMatchCount} prefix` : null,
  ].filter(Boolean).join(', ');

  return (
    <div className="bg-[#efe5d0] px-3 py-2 text-sm rounded-lg">
      <div className="flex gap-2">
        <div className="flex-1 truncate">
          {highlighted.map((part, i) =>
            typeof part === 'string'
              ? <span key={i} className="text-[#b8a890]">{part}</span>
              : <span key={i} className="text-[#5a4a38]">{part.highlight}</span>)}
        </div>
        <div className="text-[#c8bba8] shrink-0">{stats}</div>
      </div>

      <div className="grid grid-cols-[repeat(auto-fill,minmax(200px,1fr))] gap-x-2 mt-1">
        {result.tokens.map((token, i) => {
          const inputText = inputCodePoints.slice(token.inputOffset.start, token.inputOffset.end).join('');
          const docText = result.documentCodePoints.slice(token.documentOffset.start, token.documentOffset.end).join('');
          return (
            <div key={i} className="text-[11px] truncate">
              <span className="text-[#b8a890]">{TokenType[token.definition.type]}: </span>
              <span className="text-[#8b7355]">{JSON.stringify(inputText)}</span>
              <span className="text-[#c8bba8]">{' -> '}</span>
              <span className="text-[#6b5a48]">{JSON.stringify(docText)}</span>
              {token.isTokenPrefixMatching && <span className="text-[#b8a890]">{' (prefix)'}</span>}
            </div>
          );
        })}
      </div>
    </div>
  );
};
@@ -0,0 +1,26 @@
import { buildInvertedIndex } from '@maigolabs/needle/indexer';
import { loadInvertedIndex } from '@maigolabs/needle/searcher';
import { TokenizerBuilder } from '@patdx/kuromoji';

// The indexer loads OpenCC and pinyin-pro, which are large, so they are kept in data.ts for dynamic importing.
export { createTokenizer } from '@maigolabs/needle/indexer';

const musicNames: string[] = [...new Set(
  Object.values(
    await (await fetch('https://sekai-world.github.io/sekai-master-db-diff/musics.json')).json(),
  ).map(music => (music as { title: string }).title),
)];

export const kuromoji = await new TokenizerBuilder({
  loader: {
    loadArrayBuffer: async (url: string) => {
      url = `https://cdn.jsdelivr.net/npm/@aiktb/kuromoji@1.0.2/dict/${url.replace('.gz', '')}`;
      const res = await fetch(url);
      if (!res.ok) throw new Error(`Failed to fetch ${url}`);
      return await res.arrayBuffer();
    },
  },
}).build();

export const compressed = buildInvertedIndex(musicNames, { kuromoji });
export const invertedIndex = loadInvertedIndex(compressed);
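// Usage sketch (not part of the original file): the exports above feed the playground UI,
// and the same index can be queried directly, e.g.:
//   import { searchInvertedIndex } from '@maigolabs/needle/searcher';
//   const results = searchInvertedIndex(invertedIndex, 'senbonzakura');
// The query string 'senbonzakura' is only an illustrative assumption.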
@@ -0,0 +1,12 @@
import { StrictMode } from 'react';
import { createRoot } from 'react-dom/client';

import { Layout } from './App';
import 'virtual:uno.css';
import '@unocss/reset/tailwind.css';

createRoot(document.getElementById('root')!).render(
  <StrictMode>
    <Layout dataPromise={import('./data')} />
  </StrictMode>,
);
@@ -0,0 +1,24 @@
{
  "compilerOptions": {
    "target": "ESNext",
    "jsx": "preserve",
    "lib": ["DOM", "DOM.Iterable", "ESNext", "WebWorker"],
    "types": ["vite/client"],
    "module": "ESNext",
    "moduleResolution": "Bundler",
    "noUncheckedIndexedAccess": true,
    "resolveJsonModule": true,
    "allowJs": true,
    "strict": true,
    "strictNullChecks": true,
    "noEmit": true,
    "esModuleInterop": true,
    "forceConsistentCasingInFileNames": true,
    "isolatedModules": true,
    "skipLibCheck": true,
    "rootDir": ".",
    "outDir": "dist"
  },
  "include": ["src/**/*.ts", "src/**/*.tsx"],
  "exclude": ["dist", "node_modules"]
}
@@ -0,0 +1,30 @@
import { createLocalFontProcessor } from '@unocss/preset-web-fonts/local';
import { defineConfig, presetWind3, presetTypography, presetWebFonts, transformerVariantGroup, transformerDirectives, presetIcons } from 'unocss';

export default defineConfig({
  presets: [
    presetWind3(),
    presetTypography(),
    presetIcons({
      scale: 1.2,
      warn: true,
    }),
    presetWebFonts({
      fonts: {
        mono: {
          name: 'Maple Mono',
          provider: 'fontsource',
        },
      },
      processors: createLocalFontProcessor({
        cacheDir: 'node_modules/.cache/unocss/fonts',
        fontAssetsDir: 'public/assets/fonts/cache',
        fontServeBaseUrl: '/assets/fonts/cache',
      }),
    }),
  ],
  transformers: [
    transformerDirectives(),
    transformerVariantGroup(),
  ],
});
@@ -0,0 +1,13 @@
import { defineConfig } from 'vite'
import UnoCSS from 'unocss/vite'
import react from '@vitejs/plugin-react'
import topLevelAwait from 'vite-plugin-top-level-await'

// https://vite.dev/config/
export default defineConfig({
  plugins: [react(), UnoCSS(), topLevelAwait()],
  build: {
    assetsInlineLimit: 0,
    minify: true
  },
})
@@ -0,0 +1,19 @@
{
  "name": "@maigolabs/needle-playground-bot",
  "version": "1.0.0",
  "type": "module",
  "scripts": {
    "start": "tsx src/index.ts",
    "typecheck": "tsc"
  },
  "license": "AGPL-3.0",
  "packageManager": "pnpm@10.20.0",
  "private": true,
  "dependencies": {
    "@maigolabs/needle": "workspace:*",
    "telegraf": "^4.16.3"
  },
  "devDependencies": {
    "@types/node": "^24.10.4"
  }
}
@@ -0,0 +1,78 @@
import fs from 'node:fs';
import path from 'node:path';
import url from 'node:url';

import { TokenType } from '@maigolabs/needle/common';
import { buildInvertedIndex, createTokenizer } from '@maigolabs/needle/indexer';
import { loadInvertedIndex, inspectSearchResult, searchInvertedIndex } from '@maigolabs/needle/searcher';
import { TokenizerBuilder } from '@patdx/kuromoji';
import NodeDictionaryLoader from '@patdx/kuromoji/node';
import { Telegraf } from 'telegraf';

const botToken = process.env.TELEGRAM_BOT_TOKEN!;
const targetChatId = parseInt(process.env.TARGET_CHAT_ID!);
if (!botToken || isNaN(targetChatId)) throw new Error('Missing environment variables TELEGRAM_BOT_TOKEN or TARGET_CHAT_ID');

const bot = new Telegraf(botToken);

const escapeHtml = (s: string) => s.replaceAll('&', '&amp;').replaceAll('<', '&lt;').replaceAll('>', '&gt;');

const commands = await (async () => {
  const kuromojiDictPath = path.resolve(url.fileURLToPath(import.meta.resolve('@patdx/kuromoji')), '..', '..', 'dict');
  const kuromoji = await new TokenizerBuilder({ loader: new NodeDictionaryLoader({ dic_path: kuromojiDictPath }) }).build();

  const documents = (await fs.promises.readFile('../../example.txt', 'utf-8')).split('\n').filter(line => line.length > 0);
  const startBuildInvertedIndex = performance.now();
  const compressed = buildInvertedIndex(documents, { kuromoji });
  const endBuildInvertedIndex = performance.now();
  console.log(`Built inverted index in ${endBuildInvertedIndex - startBuildInvertedIndex}ms`);

  const startLoadInvertedIndex = performance.now();
  const invertedIndex = loadInvertedIndex(compressed);
  const endLoadInvertedIndex = performance.now();
  console.log(`Loaded inverted index in ${endLoadInvertedIndex - startLoadInvertedIndex}ms`);

  const codify = (text: string) => `<code>${escapeHtml(text)}</code>`;
  return {
    needle: (text: string) => {
      const startSearch = performance.now();
      const results = searchInvertedIndex(invertedIndex, text);
      const endSearch = performance.now();
      const searchDuration = (endSearch - startSearch).toFixed(3);
      const showingResults = results.slice(0, 5);
      return results.length === 0 ? codify(`No results found after ${searchDuration}ms`) : [
        codify(`Search completed in ${searchDuration}ms, showing ${showingResults.length}/${results.length} results:\n`),
        ...showingResults.map(result => inspectSearchResult(result, true)),
      ].join('\n').trimEnd();
    },
    tokenize: (text: string) => {
      const startTokenize = performance.now();
      const tokenizer = createTokenizer({ kuromoji });
      const tokens = tokenizer.tokenize(text);
      const tokenDefinitions = [...tokenizer.tokens.values()];
      const endTokenize = performance.now();
      const tokenizeDuration = (endTokenize - startTokenize).toFixed(3);
      return codify(tokens.length === 0 ? `No tokens emitted after ${tokenizeDuration}ms` : [
        `Tokenization completed in ${tokenizeDuration}ms, emitted ${tokens.length} tokens:`,
        ...tokens
          .map(token => [tokenDefinitions[token.id]!, token, [...text].slice(token.start, token.end).join('')] as const)
          .map(([token, { start, end }, originalPhrase]) => ` ${TokenType[token.type]}: ${JSON.stringify(token.text)} <- ${JSON.stringify(originalPhrase)} [${start}, ${end}]`),
      ].join('\n'));
    },
  };
})();

bot.on('message', async ctx => {
  const text = 'text' in ctx.message ? ctx.message.text : undefined;
  console.log(`${ctx.chat.id ?? 'N/A'}:${ctx.from!.id} ${JSON.stringify(text)}`);
  if (ctx.chat.id === targetChatId) {
    if (text?.startsWith('/needle ')) {
      await ctx.reply(commands.needle(text.slice('/needle '.length)), { parse_mode: 'HTML' });
    } else if (text?.startsWith('/tokenize ')) {
      await ctx.reply(commands.tokenize(text.slice('/tokenize '.length)), { parse_mode: 'HTML' });
    }
  }
});

await bot.launch();
void bot.telegram.getMe().then(me => console.log(`Bot logged in as ${me.first_name} (@${me.username})`));
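// Protocol sketch (not part of the original file): in the target chat, "/needle <query>"
// replies with up to 5 inspected search results and "/tokenize <text>" dumps the emitted
// tokens; both replies are wrapped in HTML <code> blocks via codify() above.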
@@ -0,0 +1,23 @@
{
  "compilerOptions": {
    "target": "ESNext",
    "jsx": "preserve",
    "lib": ["DOM", "DOM.Iterable", "ESNext", "WebWorker"],
    "module": "ESNext",
    "moduleResolution": "Bundler",
    "noUncheckedIndexedAccess": true,
    "resolveJsonModule": true,
    "allowJs": true,
    "strict": true,
    "strictNullChecks": true,
    "noEmit": true,
    "esModuleInterop": true,
    "forceConsistentCasingInFileNames": true,
    "isolatedModules": true,
    "skipLibCheck": true,
    "rootDir": ".",
    "outDir": "dist"
  },
  "include": ["src/**/*.ts"],
  "exclude": ["dist", "node_modules"]
}
@@ -0,0 +1,371 @@

# Created by https://www.toptal.com/developers/gitignore/api/git,visualstudio
# Edit at https://www.toptal.com/developers/gitignore?templates=git,visualstudio

### Git ###
# Created by git for backups. To disable backups in Git:
# $ git config --global mergetool.keepBackup false
*.orig

# Created by git when using merge tools for conflicts
*.BACKUP.*
*.BASE.*
*.LOCAL.*
*.REMOTE.*
*_BACKUP_*.txt
*_BASE_*.txt
*_LOCAL_*.txt
*_REMOTE_*.txt

### VisualStudio ###
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore

# User-specific files
*.rsuser
*.suo
*.user
*.userosscache
*.sln.docstates

# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs

# Mono auto generated files
mono_crash.*

# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
[Aa][Rr][Mm]/
[Aa][Rr][Mm]64/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
[Ll]ogs/

# Visual Studio 2015/2017 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/

# Visual Studio 2017 auto generated files
Generated\ Files/

# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*

# NUnit
*.VisualState.xml
TestResult.xml
nunit-*.xml

# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c

# Benchmark Results
BenchmarkDotNet.Artifacts/

# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/

# StyleCop
StyleCopReport.xml

# Files built by Visual Studio
*_i.c
*_p.c
*_h.h
*.ilk
*.meta
*.obj
*.iobj
*.pch
*.pdb
*.ipdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*_wpftmp.csproj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc

# Chutzpah Test files
_Chutzpah*

# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb

# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap

# Visual Studio Trace Files
*.e2e

# TFS 2012 Local Workspace
$tf/

# Guidance Automation Toolkit
*.gpState

# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user

# TeamCity is a build add-in
_TeamCity*

# DotCover is a Code Coverage Tool
*.dotCover

# AxoCover is a Code Coverage Tool
.axoCover/*
!.axoCover/settings.json

# Coverlet is a free, cross platform Code Coverage Tool
coverage*[.json, .xml, .info]

# Visual Studio code coverage results
*.coverage
*.coveragexml

# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*

# MightyMoose
*.mm.*
AutoTest.Net/

# Web workbench (sass)
.sass-cache/

# Installshield output folder
[Ee]xpress/

# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html

# Click-Once directory
publish/

# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# Note: Comment the next line if you want to checkin your web deploy settings,
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj

# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/

# NuGet Packages
*.nupkg
# NuGet Symbol Packages
*.snupkg
# Uncomment if necessary however generally it will be regenerated when needed
#!**/[Pp]ackages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets

# Microsoft Azure Build Output
csx/
*.build.csdef

# Microsoft Azure Emulator
ecf/
rcf/

# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
*.appx
*.appxbundle
*.appxupload

# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!?*.[Cc]ache/

# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs

# Including strong name files can present a security risk
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
#*.snk

# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/

# RIA/Silverlight projects
Generated_Code/

# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
ServiceFabricBackup/
*.rptproj.bak

# SQL Server files
*.mdf
*.ldf
*.ndf

# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
*.rptproj.rsuser
*- [Bb]ackup.rdl
*- [Bb]ackup ([0-9]).rdl
*- [Bb]ackup ([0-9][0-9]).rdl

# Microsoft Fakes
FakesAssemblies/

# GhostDoc plugin setting file
*.GhostDoc.xml

# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/

# Visual Studio 6 build log
*.plg

# Visual Studio 6 workspace options file
*.opt

# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw

# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions

# Paket dependency manager
.paket/paket.exe
paket-files/

# FAKE - F# Make
.fake/

# CodeRush personal settings
.cr/personal

# Python Tools for Visual Studio (PTVS)
__pycache__/
*.pyc

# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config

# Tabs Studio
*.tss

# Telerik's JustMock configuration file
*.jmconfig

# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs

# OpenCover UI analysis results
OpenCover/

# Azure Stream Analytics local run output
ASALocalRun/

# MSBuild Binary and Structured Log
*.binlog

# NVidia Nsight GPU debugger configuration file
*.nvuser

# MFractors (Xamarin productivity tool) working folder
.mfractor/

# Local History for Visual Studio
.localhistory/

# BeatPulse healthcheck temp database
healthchecksdb

# Backup folder for Package Reference Convert tool in Visual Studio 2017
MigrationBackup/

# Ionide (cross platform F# VS Code tools) working folder
.ionide/

# End of https://www.toptal.com/developers/gitignore/api/git,visualstudio
@@ -0,0 +1,31 @@
<Project>

  <PropertyGroup>
    <TargetFramework>netstandard2.0</TargetFramework>
    <PreserveCompilationContext>true</PreserveCompilationContext>
    <LangVersion>14</LangVersion>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
    <NoWarn>CA1822</NoWarn>
    <ProjectName>MaigoLabs.NeedLe</ProjectName>
    <VSTestLogger>console%3Bverbosity=detailed</VSTestLogger>
  </PropertyGroup>

  <PropertyGroup>
    <IsPackable>false</IsPackable>
    <Version>1.0.0</Version>
    <Authors>Menci</Authors>
    <Description>Fuzzy search engine for small text pieces, with Chinese/Japanese pronunciation support</Description>
    <PackageLicenseExpression>AGPL-3.0-only</PackageLicenseExpression>
    <RepositoryUrl>https://github.com/MaigoLabs/needLe</RepositoryUrl>
    <RepositoryType>git</RepositoryType>
    <PackageProjectUrl>https://github.com/MaigoLabs/needLe</PackageProjectUrl>
    <PackageTags>search;fuzzy;cjk;chinese;japanese;pinyin;romaji</PackageTags>
    <PackageReadmeFile>README.md</PackageReadmeFile>
  </PropertyGroup>

  <ItemGroup>
    <None Include="..\README.md" Pack="true" PackagePath="\" />
  </ItemGroup>

</Project>
@@ -0,0 +1,19 @@
<Project>
  <PropertyGroup>
    <ManagePackageVersionsCentrally>true</ManagePackageVersionsCentrally>
  </PropertyGroup>
  <ItemGroup>
    <PackageVersion Include="Microsoft.NET.Test.Sdk" Version="18.0.1" />
    <PackageVersion Include="coverlet.collector" Version="6.0.4" />
    <PackageVersion Include="xunit" Version="2.9.3" />
    <PackageVersion Include="xunit.runner.visualstudio" Version="3.1.5" />
  </ItemGroup>
  <ItemGroup>
    <PackageVersion Include="DotNetCampus.LatestCSharpFeatures" Version="13.0.1" />
    <PackageVersion Include="hyjiacan.pinyin4net" Version="4.1.1" />
    <PackageVersion Include="MeCab.DotNet" Version="1.2.0" />
    <PackageVersion Include="MyNihongo.KanaConverter" Version="1.0.5" />
    <PackageVersion Include="OpenccNetLib" Version="1.4.0" />
    <PackageVersion Include="Telegram.Bot" Version="22.5.0" />
  </ItemGroup>
</Project>
@@ -0,0 +1,45 @@
namespace MaigoLabs.NeedLe.Common;

// This is for global normalization of any input and documents.
public static class CommonNormalization
{
    public static int NormalizeCodePoint(int codePoint)
    {
        // Fullwidth ASCII -> Halfwidth ASCII
        if (codePoint >= 0xFF01 && codePoint <= 0xFF5E) return ToLowerCaseAscii(codePoint - 0xFEE0);
        // Fullwidth space -> Halfwidth space
        else if (codePoint == /* '　' */ 0x3000) return ' ';
        // Halfwidth kana (U+FF66 - U+FF9D) -> Fullwidth kana
        else if (codePoint >= 0xFF66 && codePoint <= 0xFF9D) return HALF_TO_FULL_KANA.TryGetValue(codePoint, out var value) ? value : codePoint;
        else if (codePoint == /* '｡' */ 0xFF61) return '。';
        else if (codePoint == /* '｢' */ 0xFF62) return '「';
        else if (codePoint == /* '｣' */ 0xFF63) return '」';
        else if (codePoint == /* '､' */ 0xFF64) return '、';
        else if (codePoint == /* '･' */ 0xFF65) return '・';
        else if (codePoint == /* 'ﾞ' */ 0xFF9E || codePoint == /* '゛' */ 0x309B) return 0x3099; // -> COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
        else if (codePoint == /* 'ﾟ' */ 0xFF9F || codePoint == /* '゜' */ 0x309C) return 0x309A; // -> COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
        else return ToLowerCaseAscii(codePoint);
    }

    private static readonly Dictionary<int, int> HALF_TO_FULL_KANA = new Dictionary<int, int> {
        ['ｦ'] = 'ヲ', ['ｧ'] = 'ァ', ['ｨ'] = 'ィ', ['ｩ'] = 'ゥ', ['ｪ'] = 'ェ', ['ｫ'] = 'ォ',
        ['ｬ'] = 'ャ', ['ｭ'] = 'ュ', ['ｮ'] = 'ョ', ['ｯ'] = 'ッ',
        ['ｰ'] = 'ー',
        ['ｱ'] = 'ア', ['ｲ'] = 'イ', ['ｳ'] = 'ウ', ['ｴ'] = 'エ', ['ｵ'] = 'オ',
        ['ｶ'] = 'カ', ['ｷ'] = 'キ', ['ｸ'] = 'ク', ['ｹ'] = 'ケ', ['ｺ'] = 'コ',
        ['ｻ'] = 'サ', ['ｼ'] = 'シ', ['ｽ'] = 'ス', ['ｾ'] = 'セ', ['ｿ'] = 'ソ',
        ['ﾀ'] = 'タ', ['ﾁ'] = 'チ', ['ﾂ'] = 'ツ', ['ﾃ'] = 'テ', ['ﾄ'] = 'ト',
        ['ﾅ'] = 'ナ', ['ﾆ'] = 'ニ', ['ﾇ'] = 'ヌ', ['ﾈ'] = 'ネ', ['ﾉ'] = 'ノ',
        ['ﾊ'] = 'ハ', ['ﾋ'] = 'ヒ', ['ﾌ'] = 'フ', ['ﾍ'] = 'ヘ', ['ﾎ'] = 'ホ',
        ['ﾏ'] = 'マ', ['ﾐ'] = 'ミ', ['ﾑ'] = 'ム', ['ﾒ'] = 'メ', ['ﾓ'] = 'モ',
        ['ﾔ'] = 'ヤ', ['ﾕ'] = 'ユ', ['ﾖ'] = 'ヨ',
        ['ﾗ'] = 'ラ', ['ﾘ'] = 'リ', ['ﾙ'] = 'ル', ['ﾚ'] = 'レ', ['ﾛ'] = 'ロ',
        ['ﾜ'] = 'ワ', ['ﾝ'] = 'ン',
    };

    public static int ToLowerCaseAscii(int codePoint) => codePoint >= 0x41 && codePoint <= 0x5A ? codePoint + 0x20 : codePoint;

    public static bool IsHiraganaRange(int codePoint) => (codePoint >= 0x3041 && codePoint <= 0x3096) || (codePoint >= 0x309D && codePoint <= 0x309E);
    public static int ToKatakana(int codePoint) => IsHiraganaRange(codePoint) ? codePoint + 0x60 : codePoint;
    public static string ToKatakana(string text) => string.Concat(text.Select(c => (char)ToKatakana(c)));
}
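// A few illustrative mappings (derived from the rules above; not in the original file):
//   NormalizeCodePoint('Ａ') == 'a'   (fullwidth ASCII shifted down by 0xFEE0, then lowercased)
//   NormalizeCodePoint('ｶ') == 'カ'   (halfwidth kana mapped to fullwidth via HALF_TO_FULL_KANA)
//   ToKatakana("すき") == "スキ"      (hiragana shifted by 0x60 into the katakana block)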
@@ -0,0 +1,21 @@
namespace MaigoLabs.NeedLe.Common;

public static class CommonUtils
{
    public static bool IsWhitespace(int codePoint) =>
        codePoint == 0x0009 /* \t */ ||
        codePoint == 0x000A /* \n */ ||
        codePoint == 0x000B /* Vertical Tab */ ||
        codePoint == 0x000C /* \f */ ||
        codePoint == 0x000D /* \r */ ||
        codePoint == 0x0020 /* Space */ ||
        codePoint == 0x0085 /* Next Line (NEL) */ ||
        codePoint == 0x00A0 /* No-Break Space */ ||
        codePoint == 0x1680 /* Ogham Space Mark */ ||
        codePoint >= 0x2000 && codePoint <= 0x200A ||
        codePoint == 0x2028 /* Line Separator */ ||
        codePoint == 0x2029 /* Paragraph Separator */ ||
        codePoint == 0x202F /* Narrow No-Break Space */ ||
        codePoint == 0x205F /* Medium Mathematical Space */ ||
        codePoint == 0x3000 /* Ideographic Space */;
}
@@ -0,0 +1,25 @@
using System.Text;

namespace MaigoLabs.NeedLe.Common.Extensions;

public static class UnicodeExtensions
{
    public static IEnumerable<int> ToCodePoints(this string s)
    {
        for (int i = 0; i < s.Length; i++)
        {
            int codePoint = char.ConvertToUtf32(s, i);
            if (codePoint > 0xffff) i++;
            yield return codePoint;
        }
    }

    public static StringBuilder ToUtf32StringBuilder(this IEnumerable<int> codePoints)
    {
        var sb = new StringBuilder();
        foreach (var codePoint in codePoints) sb.Append(char.ConvertFromUtf32(codePoint));
        return sb;
    }

    public static string ToUtf32String(this IEnumerable<int> codePoints) => ToUtf32StringBuilder(codePoints).ToString();
}
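// Round-trip sketch (illustrative, not in the original file): "a𝄞b".ToCodePoints() yields
// [0x61, 0x1D11E, 0x62]; the astral code point is read as a surrogate pair (hence the i++
// above), and ToUtf32String() reassembles the identical string.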
@@ -0,0 +1,19 @@
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <TargetFramework>netstandard2.0</TargetFramework>
    <OutputType>Library</OutputType>
    <RootNamespace>$(ProjectName).Common</RootNamespace>
    <AssemblyName>$(RootNamespace)</AssemblyName>
  </PropertyGroup>

  <PropertyGroup>
    <IsPackable>true</IsPackable>
    <PackageId>$(RootNamespace)</PackageId>
  </PropertyGroup>

  <ItemGroup>
    <PackageReference Include="DotNetCampus.LatestCSharpFeatures" PrivateAssets="all" />
  </ItemGroup>

</Project>
@@ -0,0 +1,33 @@
namespace MaigoLabs.NeedLe.Common;

public class TrieNode
{
    public required TrieNode? Parent { get; set; }
    public required Dictionary<int, TrieNode> Children { get; set; } // Unicode code point -> child node
    public required List<int> TokenIds { get; set; }
    public required List<int> SubTreeTokenIds { get; set; } // Empty on root.
}

public static class TrieNodeExtensions
{
    public static TrieNode? TraverseStep(this TrieNode? node, int codePoint, bool isIgnorable = false) =>
        (node?.Children.TryGetValue(codePoint, out var child) ?? false)
            ? child
            : isIgnorable ? node : null;

    public static TrieNode? Traverse(this TrieNode? node, int[] codePoints, bool isIgnorable = false)
    {
        if (node == null) return null;
        foreach (var codePoint in codePoints)
        {
            node = node?.TraverseStep(codePoint, isIgnorable);
            if (node == null) return null;
        }
        return node;
    }

    public static List<int> GetTokenIds(this TrieNode? node, bool includeSubTree = false) =>
        (includeSubTree ? node?.SubTreeTokenIds : node?.TokenIds) ?? [];

    public static bool IsTokenExactMatch(this TrieNode? node, int tokenId) => node?.TokenIds.Contains(tokenId) ?? false;
}
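// Traversal sketch (illustrative): with isIgnorable = false, a code point missing from
// Children fails the whole lookup (returns null); with isIgnorable = true the cursor
// stays on the current node, so unknown characters are skipped over.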
@@ -0,0 +1,20 @@
namespace MaigoLabs.NeedLe.Common.Types;

#pragma warning disable IDE1006 // Naming rule violation

// For compatibility with TypeScript, we use camelCase property names here.

public class CompressedInvertedIndex
{
    public required string[] documents { get; set; }
    public required int[] tokenTypes { get; set; } // Use int values here instead of TokenType enum to avoid JSON serialization issues.
    public required List<int[]>[] tokenReferences { get; set; } // tokenId -> [documentId, start1, end1, start2, end2, ...]
    public required CompressedInvertedIndexTries tries { get; set; }
}

public class CompressedInvertedIndexTries
{
    public required int[] romaji { get; set; }
    public required int[] kana { get; set; }
    public required int[] other { get; set; }
}
@@ -0,0 +1,9 @@
namespace MaigoLabs.NeedLe.Common.Types;

public class OffsetSpan
{
    public required int Start { get; init; }
    public required int End { get; init; }

    public int Length => End - Start;
}
@@ -0,0 +1,9 @@
namespace MaigoLabs.NeedLe.Common.Types;

public class TokenDefinition
{
    public required int Id { get; set; }
    public required TokenType Type { get; set; }
    public required string Text { get; set; }
    public required int CodePointLength { get; set; }
}
@@ -0,0 +1,10 @@
namespace MaigoLabs.NeedLe.Common.Types;

public enum TokenType
{
    Raw,
    Kana,
    Romaji,
    Han,
    Pinyin,
}
@@ -0,0 +1,80 @@
using MaigoLabs.NeedLe.Common.Extensions;
using OpenccNetLib;

namespace MaigoLabs.NeedLe.Indexer.Han;

public class HanVariantProvider
{
    private readonly Dictionary<int, int[]> EXCHANGE_MAP;
    public HanVariantProvider(DictWithMaxLength[]? dicts = null)
    {
        dicts ??=
        [
            DictionaryLib.Provider.hk_variants,
            DictionaryLib.Provider.hk_variants_rev,
            DictionaryLib.Provider.jp_variants,
            DictionaryLib.Provider.jp_variants_rev,
            DictionaryLib.Provider.st_characters,
            DictionaryLib.Provider.ts_characters,
            DictionaryLib.Provider.tw_variants,
            DictionaryLib.Provider.tw_variants_rev,
        ];
        EXCHANGE_MAP = BuildHanExchangeMap(dicts);
    }

    private Dictionary<int, int[]> BuildHanExchangeMap(DictWithMaxLength[] dicts)
    {
        var unionFindSet = new UnionFindSet();
        foreach (var dict in dicts) foreach (var item in dict.Dict)
        {
            var from = item.Key.ToCodePoints().ToArray();
            var to = item.Value.ToCodePoints().ToArray();
            if (from.Length != 1 || to.Length != 1) continue;
            unionFindSet.Union(from[0], to[0]);
        }
        var variants = new Dictionary<int, List<int>>();
        foreach (var x in unionFindSet.Keys)
        {
            var parent = unionFindSet.Find(x);
            if (!variants.TryGetValue(parent, out var list)) variants[parent] = list = [];
            if (x != parent) variants[x] = list;
            list.Add(x);
        }
        return variants.ToDictionary(item => item.Key, item => item.Value.OrderBy(x => x).ToArray());
    }

    // https://github.com/google/re2/blob/e7aec5985072c1dbe735add802653ef4b36c231a/re2/unicode_groups.cc#L5590-L5615
    private static readonly (int Min, int Max)[] RE2_SCRIPT_HAN_RANGES =
    [
        // Han_range16
        (11904, 11929),
        (11931, 12019),
        (12032, 12245),
        (12293, 12293),
        (12295, 12295),
        (12321, 12329),
        (12344, 12347),
        (13312, 19903),
        (19968, 40959),
        (63744, 64109),
        (64112, 64217),
        // Han_range32
        (94178, 94179),
        (94192, 94193),
        (131072, 173791),
        (173824, 177977),
        (177984, 178205),
        (178208, 183969),
        (183984, 191456),
        (191472, 192093),
        (194560, 195101),
        (196608, 201546),
        (201552, 205743),
    ];

    public static bool IsHanCharacter(int codePoint) => RE2_SCRIPT_HAN_RANGES.Any(range => codePoint >= range.Min && codePoint <= range.Max);

    public int[] GetHanVariants(int codePoint) => EXCHANGE_MAP.TryGetValue(codePoint, out var variants)
        ? variants
        : IsHanCharacter(codePoint) ? [codePoint] : [];
}
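// Grouping sketch (assumes the OpenCC character tables relate 体 and 體): the union-find
// set places both code points in one variant class, so GetHanVariants('体') returns the
// sorted pair, while a non-Han code point not in any dictionary yields an empty array.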
@@ -0,0 +1,19 @@
using hyjiacan.py4n;

namespace MaigoLabs.NeedLe.Indexer.Han;

public static class PinyinHelper
{
    private static readonly string[] PINYIN_INITIALS = ["b", "p", "m", "f", "d", "t", "n", "l", "g", "k", "h", "j", "q", "x", "zh", "ch", "sh", "r", "z", "c", "s", "y", "w"];
    private static readonly Dictionary<string, string> PINYIN_FINALS_FUZZY_MAP = new() { ["ang"] = "an", ["eng"] = "en", ["ing"] = "in" };

    public static IEnumerable<string> GetPinyinCandidates(int codePoint) => codePoint < char.MinValue || codePoint > char.MaxValue || !PinyinUtil.IsHanzi((char)codePoint) ? [] :
        Pinyin4Net.GetPinyin((char)codePoint, PinyinFormat.LOWERCASE | PinyinFormat.WITHOUT_TONE).Where(pinyin => pinyin.Length > 0).SelectMany(pinyin =>
        {
            var initial = PINYIN_INITIALS.FirstOrDefault(initial => pinyin.StartsWith(initial));
            var initialAlphabet = initial != null ? initial[..1] : pinyin[..1];
            var fuzzySuffix = pinyin.Length < 3 ? null : pinyin[^3..];
            var fuzzyPinyin = fuzzySuffix != null && PINYIN_FINALS_FUZZY_MAP.TryGetValue(fuzzySuffix, out var fuzzySuffixTarget) ? pinyin[..^3] + fuzzySuffixTarget : null;
            return new string?[] { pinyin, initial, initialAlphabet, fuzzyPinyin }.OfType<string>();
        }).Distinct();
}
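// Candidate sketch (assumes pinyin4net reads 星 as "xing"): GetPinyinCandidates('星')
// yields "xing" (full pinyin), "x" (the initial, which here equals its first letter and
// is deduplicated) and "xin" (the "ing" -> "in" fuzzy form from PINYIN_FINALS_FUZZY_MAP).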
@@ -0,0 +1,33 @@
namespace MaigoLabs.NeedLe.Indexer.Han;

public class UnionFindSet
{
    private Dictionary<int, int> Parent { get; set; } = [];
    private Dictionary<int, int> Rank { get; set; } = [];

    public IEnumerable<int> Keys => Parent.Keys;

    public int Find(int x)
    {
        if (!Parent.TryGetValue(x, out var parent)) return Parent[x] = x;
        else if (x == parent) return x;
        else return Parent[x] = Find(parent);
    }

    public void Union(int x, int y)
    {
        x = Find(x);
        y = Find(y);
        if (x == y) return;
        int rankX = GetRank(x), rankY = GetRank(y);
        if (rankX < rankY) Parent[x] = y;
        else if (rankX > rankY) Parent[y] = x;
        else
        {
            Parent[y] = x;
            Rank[x] = rankX + 1;
        }
    }

    private int GetRank(int x) => !Rank.TryGetValue(x, out var rank) ? 0 : rank;
}
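// Usage sketch (illustrative): after Union(1, 2) and Union(2, 3), Find(1) == Find(3);
// Find compresses paths as it recurses, and Union attaches the lower-rank root to the
// higher-rank one, keeping the trees shallow.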
@@ -0,0 +1,57 @@
using MaigoLabs.NeedLe.Common;
using MaigoLabs.NeedLe.Common.Extensions;
using MaigoLabs.NeedLe.Common.Types;
using MaigoLabs.NeedLe.Indexer.Japanese;
using MaigoLabs.NeedLe.Indexer.Trie;

namespace MaigoLabs.NeedLe.Indexer;

public static class InvertedIndexBuilder
{
    private static TrieNode BuildTypedTrie(IEnumerable<TokenDefinition> tokenDefinitions, Func<TokenType, bool> typePredicate) =>
        TrieBuilder.BuildTrie(tokenDefinitions
            .Where(token => typePredicate(token.Type))
            .Select(token => (token.Id, CodePoints: token.Text.ToCodePoints())));

    public static CompressedInvertedIndex BuildInvertedIndex(string[] documents, TokenizerOptions? tokenizerOptions = null)
    {
        var tokenizer = new Tokenizer(tokenizerOptions);
        var documentTokens = documents.Select(tokenizer.Tokenize).ToArray();

        var tokenDefinitions = tokenizer.Tokens.Values;
        var romajiRoot = BuildTypedTrie(tokenDefinitions, type => type == TokenType.Romaji);
        var kanaRoot = BuildTypedTrie(tokenDefinitions, type => type == TokenType.Kana);
        var otherRoot = BuildTypedTrie(tokenDefinitions, type => type != TokenType.Romaji && type != TokenType.Kana);
        TrieBuilder.GraftTriePaths(romajiRoot, JapaneseNormalization.NORMALIZE_RULES_ROMAJI_CODEPOINTS);
        TrieBuilder.GraftTriePaths(kanaRoot, JapaneseNormalization.NORMALIZE_RULES_KANA_DAKUTEN_CODEPOINTS);

        var invertedIndex = new CompressedInvertedIndex
        {
            documents = documents,
            tokenTypes = [.. tokenDefinitions.Select(token => (int)token.Type)],
            tokenReferences = [.. tokenDefinitions.Select(_ => new List<int[]>())],
            tries = new CompressedInvertedIndexTries
            {
                romaji = TrieSerializer.Serialize(romajiRoot),
                kana = TrieSerializer.Serialize(kanaRoot),
                other = TrieSerializer.Serialize(otherRoot),
            },
        };
        for (var documentId = 0; documentId < documents.Length; documentId++)
        {
            var tokens = documentTokens[documentId];
            var tokenOccurrences = new Dictionary<int, List<int>>();
            foreach (var token in tokens)
            {
                if (!tokenOccurrences.TryGetValue(token.Id, out var occurrences)) tokenOccurrences[token.Id] = occurrences = [];
                occurrences.Add(token.Start);
                occurrences.Add(token.End);
            }
            foreach (var (tokenId, occurrences) in tokenOccurrences)
            {
                invertedIndex.tokenReferences[tokenId].Add([documentId, .. occurrences]);
            }
        }
        return invertedIndex;
    }
}
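// Layout sketch (illustrative): if the token with id 7 occurs in document 3 at [0, 2)
// and [5, 7), the loop above appends the packed entry [3, 0, 2, 5, 7] to
// tokenReferences[7], matching the comment in CompressedInvertedIndex.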
@@ -0,0 +1,69 @@
using MaigoLabs.NeedLe.Common.Extensions;

namespace MaigoLabs.NeedLe.Indexer.Japanese;

public static class JapaneseNormalization
{
    public delegate string Normalizer(string text);

    public static Normalizer CreateNormalizer(Dictionary<string, string> rules) => text =>
    {
        while (true)
        {
            var beforeCurrentIteration = text;
            foreach (var (from, to) in rules) text = text.Replace(from, to);
            if (text == beforeCurrentIteration) break;
        }
        return text;
    };

    public static IEnumerable<(int[] From, int[] To)> ToCodePointPairs(Dictionary<string, string> rules) =>
        rules.Select(rule => (From: rule.Key.ToCodePoints().ToArray(), To: rule.Value.ToCodePoints().ToArray()));

    public static readonly Dictionary<string, string> NORMALIZE_RULES_ROMAJI = new()
    {
        // Remove all long vowels (sa-ba- -> saba)
        ["-"] = "",
        // Collapse consecutive vowels
        ["aa"] = "a",
        ["ii"] = "i",
        ["uu"] = "u",
        ["ee"] = "e",
        ["oo"] = "o",
        ["ou"] = "o",
        // mb/mp/mm -> nb/np/nm (shimbun -> shinbun)
        ["mb"] = "nb",
        ["mp"] = "np",
        ["mm"] = "nm",
        // Others
        ["sha"] = "sya",
        ["tsu"] = "tu",
        ["chi"] = "ti",
        ["shi"] = "si",
        ["ji"] = "zi",
    };
    public static readonly IEnumerable<(int[] From, int[] To)> NORMALIZE_RULES_ROMAJI_CODEPOINTS = ToCodePointPairs(NORMALIZE_RULES_ROMAJI);
    public static readonly Normalizer NormalizeRomaji = CreateNormalizer(NORMALIZE_RULES_ROMAJI);

    public static readonly Dictionary<string, string> NORMALIZE_RULES_KANA_DAKUTEN = new()
    {
        ["う\u3099"] = "ゔ",
        ["か\u3099"] = "が", ["き\u3099"] = "ぎ", ["く\u3099"] = "ぐ", ["け\u3099"] = "げ", ["こ\u3099"] = "ご",
        ["さ\u3099"] = "ざ", ["し\u3099"] = "じ", ["す\u3099"] = "ず", ["せ\u3099"] = "ぜ", ["そ\u3099"] = "ぞ",
        ["た\u3099"] = "だ", ["ち\u3099"] = "ぢ", ["つ\u3099"] = "づ", ["て\u3099"] = "で", ["と\u3099"] = "ど",
        ["は\u3099"] = "ば", ["ひ\u3099"] = "び", ["ふ\u3099"] = "ぶ", ["へ\u3099"] = "べ", ["ほ\u3099"] = "ぼ",
        ["は\u309A"] = "ぱ", ["ひ\u309A"] = "ぴ", ["ふ\u309A"] = "ぷ", ["へ\u309A"] = "ぺ", ["ほ\u309A"] = "ぽ",
        ["ゝ\u3099"] = "ゞ",

        ["ウ\u3099"] = "ヴ",
        ["カ\u3099"] = "ガ", ["キ\u3099"] = "ギ", ["ク\u3099"] = "グ", ["ケ\u3099"] = "ゲ", ["コ\u3099"] = "ゴ",
        ["サ\u3099"] = "ザ", ["シ\u3099"] = "ジ", ["ス\u3099"] = "ズ", ["セ\u3099"] = "ゼ", ["ソ\u3099"] = "ゾ",
        ["タ\u3099"] = "ダ", ["チ\u3099"] = "ヂ", ["ツ\u3099"] = "ヅ", ["テ\u3099"] = "デ", ["ト\u3099"] = "ド",
        ["ハ\u3099"] = "バ", ["ヒ\u3099"] = "ビ", ["フ\u3099"] = "ブ", ["ヘ\u3099"] = "ベ", ["ホ\u3099"] = "ボ",
        ["ハ\u309A"] = "パ", ["ヒ\u309A"] = "ピ", ["フ\u309A"] = "プ", ["ヘ\u309A"] = "ペ", ["ホ\u309A"] = "ポ",
        ["ワ\u3099"] = "ヷ", ["ヰ\u3099"] = "ヸ", ["ヱ\u3099"] = "ヹ", ["ヲ\u3099"] = "ヺ",
        ["ヽ\u3099"] = "ヾ",
    };
    public static readonly IEnumerable<(int[] From, int[] To)> NORMALIZE_RULES_KANA_DAKUTEN_CODEPOINTS = ToCodePointPairs(NORMALIZE_RULES_KANA_DAKUTEN);
    public static readonly Normalizer NormalizeKanaDakuten = CreateNormalizer(NORMALIZE_RULES_KANA_DAKUTEN);
}
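// e.g. NormalizeRomaji("shimbun") -> "sinbun" (illustrative): the first pass applies
// "mb" -> "nb" and "shi" -> "si", and the loop repeats the rule set until no rule
// changes the text anymore (a fixed point).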
@@ -0,0 +1,52 @@
using MaigoLabs.NeedLe.Indexer.Han;
using MyNihongo.KanaConverter;

namespace MaigoLabs.NeedLe.Indexer.Japanese;

public static class JapaneseUtils
{
    public static bool IsMaybeJapanese(int codePoint) =>
        HanVariantProvider.IsHanCharacter(codePoint) ||
        IsKana(codePoint) ||
        IsJapaneseSoundMark(codePoint) ||
        codePoint == 0x3005 || codePoint == 0x3006 || codePoint == 0x30FC;

    // See also Common/Normalization.cs
    public static bool IsJapaneseSoundMark(int codePoint) => codePoint == 0x3099 || codePoint == 0x309A;
    public static string StripJapaneseSoundMarks(string text) => string.Concat(text.Where(codePoint => !IsJapaneseSoundMark(codePoint)));

    public static bool IsKana(int codePoint) => (codePoint >= 0x3041 && codePoint <= 0x309F) || (codePoint >= 0x30A0 && codePoint <= 0x30FF);

    private static readonly int[] KANAS_CANNOT_BE_FIRST =
    [
        'ァ', 'ィ', 'ゥ', 'ェ', 'ォ',
        'ぁ', 'ぃ', 'ぅ', 'ぇ', 'ぉ',
        'ャ', 'ュ', 'ョ',
        'ゃ', 'ゅ', 'ょ',
        'ヮ', 'ゎ',
        'ㇰ', 'ㇱ', 'ㇲ', 'ㇳ', 'ㇴ', 'ㇵ', 'ㇶ', 'ㇷ', 'ㇸ', 'ㇹ', 'ㇺ', 'ㇻ', 'ㇼ', 'ㇽ', 'ㇾ', 'ㇿ',
        'ー',
    ];

    private static readonly int[] KANAS_CANNOT_BE_LAST =
    [
        'ッ', 'っ'
    ];

    public static string ToRomajiStrictly(string kanaText)
    {
        if (kanaText.Length == 0) return "";
        if (KANAS_CANNOT_BE_FIRST.Contains(kanaText[0])) return "";
        if (KANAS_CANNOT_BE_LAST.Contains(kanaText[^1])) return "";
        string romaji;
        try { romaji = kanaText.ToRomaji(); }
        catch { return ""; }
        if (!romaji.All(c => c is >= 'a' and <= 'z')) return "";
        return romaji;
    }

    public static bool IsValidJapanesePhrase(ReadOnlySpan<int> codePoints, int start, int length) =>
        // Skip splittings that would place a sound mark at the start of a phrase, or strand one right after it
        !IsJapaneseSoundMark(codePoints[start]) && (start + length == codePoints.Length || !IsJapaneseSoundMark(codePoints[start + length]));
    public static bool IsValidJapanesePhrase(ReadOnlyMemory<int> codePoints, int start, int length) => IsValidJapanesePhrase(codePoints.Span, start, length);
}
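// e.g. (assuming MyNihongo's converter maps サバ to "saba"): ToRomajiStrictly("サバ")
// returns "saba", while ToRomajiStrictly("ッテ") returns "" because a small tsu cannot
// begin a phrase, and any output containing non-[a-z] characters is likewise rejected.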
@@ -0,0 +1,105 @@
using System.Runtime.InteropServices;
using MaigoLabs.NeedLe.Common;
using MaigoLabs.NeedLe.Common.Extensions;
using MeCab;
using MeCab.Core;

namespace MaigoLabs.NeedLe.Indexer.Japanese;

public class Transcription
{
    public required int Start { get; set; }
    public required int Length { get; set; }
    public required string[] Transcriptions { get; set; }
}

public delegate IEnumerable<Transcription> TranscriptionEnumerator(ReadOnlyMemory<int> codePoints);
public delegate bool IsValidPhraseDelegate(ReadOnlyMemory<int> codePoints, int start, int length);
public delegate HashSet<string> GetAllTranscriptionsDelegate(string phrase);

public class TranscriptionProvider
{
    public MeCabDictionary[] Dictionaries { get; set; }

    public TranscriptionProvider(MeCabDictionary[]? dictionaries = null)
    {
        if (dictionaries == null)
        {
            var param = new MeCabParam();
            param.LoadDicRC();
            var dictionary = new MeCabDictionary();
            dictionary.Open(Path.Combine(param.DicDir, "sys.dic"));
            dictionaries = [dictionary];
        }
        Dictionaries = dictionaries;
    }

    public static TranscriptionEnumerator CreateTranscriptionEnumerator(IsValidPhraseDelegate isValidPhrase, GetAllTranscriptionsDelegate getAllTranscriptions) => codePoints =>
    {
        var resultMap = new Dictionary<(int Start, int Length), Transcription>();
        for (int phraseLength = 1; phraseLength <= codePoints.Length; phraseLength++) for (int start = 0; start + phraseLength <= codePoints.Length; start++)
        {
            if (!isValidPhrase(codePoints, start, phraseLength)) continue;
            var phrase = MemoryMarshal.ToEnumerable(codePoints.Slice(start, phraseLength)).ToUtf32String();
            var atomicTranscriptions = getAllTranscriptions(phrase).Where(transcription => transcription != null).Where(candidateTranscription =>
            {
                if (candidateTranscription.Length == 0) return false;
                // Ensure the transcription is atomic, i.e. it cannot be built by concatenating transcriptions of shorter sub-phrases split at any midpoints
                var visitedStates = new HashSet<(int PhrasePosition, int TranscriptionPosition)>();
                var queue = new Queue<(int PhrasePosition, int TranscriptionPosition)>();
                queue.Enqueue((0, 0));
                while (queue.Count > 0)
                {
                    var (phrasePosition, transcriptionPosition) = queue.Dequeue();
                    for (int prefixLength = 1; prefixLength <= phraseLength - phrasePosition; prefixLength++)
                    {
                        if (!resultMap.TryGetValue((start + phrasePosition, prefixLength), out var prefixResult)) continue;
                        foreach (var transcription in prefixResult.Transcriptions) if (string.Compare(candidateTranscription, transcriptionPosition, transcription, 0, transcription.Length) == 0)
                        {
                            var nextState = (PhrasePosition: phrasePosition + prefixLength, TranscriptionPosition: transcriptionPosition + transcription.Length);
                            if (nextState.PhrasePosition == phraseLength && nextState.TranscriptionPosition == candidateTranscription.Length) return false; // Found a valid combination
                            if (visitedStates.Contains(nextState)) continue;
                            visitedStates.Add(nextState);
                            queue.Enqueue(nextState);
                        }
                    }
                }
                return true;
            }).ToArray();
            if (atomicTranscriptions.Length > 0) resultMap[(start, phraseLength)] = new() { Start = start, Length = phraseLength, Transcriptions = atomicTranscriptions };
        }
        return resultMap.Values;
    };

    public HashSet<string> GetAllKanaReadings(string phrase)
    {
        var result = new HashSet<string>();
        var isKana = phrase.All(ch => JapaneseUtils.IsKana(ch));
        if (isKana) result.Add(CommonNormalization.ToKatakana(phrase));
        if (isKana && phrase.Length == 1) return result;

        foreach (var dictionary in Dictionaries)
        {
            var searchResult = dictionary.ExactMatchSearch(phrase);
            if (searchResult.Value == -1) continue;
            var tokens = dictionary.GetToken(searchResult);
            foreach (var token in tokens)
            {
                var feature = dictionary.GetFeature(token.Feature);
                var parts = feature.Split(',');
                if (parts.Length > 7) result.Add(CommonNormalization.ToKatakana(parts[7]));
            }
        }
        return result;
    }

    public HashSet<string> GetAllKanaReadingsWithNormalization(string phrase) =>
        GetAllKanaReadings(JapaneseUtils.StripJapaneseSoundMarks(JapaneseNormalization.NormalizeKanaDakuten(phrase)));

    public TranscriptionEnumerator EnumerateKanaTranscriptions => CreateTranscriptionEnumerator(
        JapaneseUtils.IsValidJapanesePhrase,
        GetAllKanaReadingsWithNormalization);
    public TranscriptionEnumerator EnumerateRomajiTranscriptions => CreateTranscriptionEnumerator(
        JapaneseUtils.IsValidJapanesePhrase,
        phrase => [.. GetAllKanaReadingsWithNormalization(phrase).Select(kana => JapaneseNormalization.NormalizeRomaji(JapaneseUtils.ToRomajiStrictly(kana)))]);
}
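// Atomicity sketch (assuming a dictionary that reads 日 as ニチ, 本 as ホン, and 日本 as
// ニホン, ニッポン and ニチホン): for the phrase 日本 the candidate ニチホン is rejected,
// because the BFS over (phrase position, reading position) states finds the decomposition
// 日=ニチ followed by 本=ホン; ニホン and ニッポン survive as atomic readings.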
@@ -0,0 +1,29 @@
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <TargetFramework>netstandard2.0</TargetFramework>
    <OutputType>Library</OutputType>
    <RootNamespace>$(ProjectName).Indexer</RootNamespace>
    <AssemblyName>$(RootNamespace)</AssemblyName>
  </PropertyGroup>

  <PropertyGroup>
    <IsPackable>true</IsPackable>
    <PackageId>$(RootNamespace)</PackageId>
    <!-- Don't include MeCab dictionaries in this package; let MeCab.DotNet provide them to end users -->
    <MeCabUseDefaultDictionary>False</MeCabUseDefaultDictionary>
  </PropertyGroup>

  <ItemGroup>
    <ProjectReference Include="..\MaigoLabs.NeedLe.Common\MaigoLabs.NeedLe.Common.csproj" />
  </ItemGroup>

  <ItemGroup>
    <PackageReference Include="DotNetCampus.LatestCSharpFeatures" PrivateAssets="all" />
    <PackageReference Include="hyjiacan.pinyin4net" />
    <PackageReference Include="MeCab.DotNet" PrivateAssets="analyzers" />
    <PackageReference Include="MyNihongo.KanaConverter" />
    <PackageReference Include="OpenccNetLib" PrivateAssets="analyzers" />
  </ItemGroup>

</Project>
@@ -0,0 +1,104 @@
|
|||||||
|
using MaigoLabs.NeedLe.Common;
|
||||||
|
using MaigoLabs.NeedLe.Common.Extensions;
|
||||||
|
using MaigoLabs.NeedLe.Common.Types;
|
||||||
|
using MaigoLabs.NeedLe.Indexer.Han;
|
||||||
|
using MaigoLabs.NeedLe.Indexer.Japanese;
|
||||||
|
|
||||||
|
namespace MaigoLabs.NeedLe.Indexer;
|
||||||
|
|
||||||
|
public class TokenizerOptions
|
||||||
|
{
|
||||||
|
public HanVariantProvider? HanVariantProvider { get; set; }
|
||||||
|
public TranscriptionProvider? TranscriptionProvider { get; set; }
|
||||||
|
}
|
||||||
|
|
||||||
|
public class Tokenizer(TokenizerOptions? options = null)
|
||||||
|
{
|
||||||
|
public HanVariantProvider HanVariantProvider { get; set; } = options?.HanVariantProvider ?? new HanVariantProvider();
|
||||||
|
public TranscriptionProvider TranscriptionProvider { get; set; } = options?.TranscriptionProvider ?? new TranscriptionProvider();
|
||||||
|
|
||||||
|
public class Token
|
||||||
|
{
|
||||||
|
public required int Id { get; set; }
|
||||||
|
public required int Start { get; set; }
|
||||||
|
public required int End { get; set; }
|
||||||
|
}
|
||||||
|
|
||||||
|
public Dictionary<(TokenType Type, string Text), TokenDefinition> Tokens { get; } = [];
|
||||||
|
private TokenDefinition EnsureToken(TokenType type, string text)
|
||||||
|
{
|
||||||
|
var key = (type, text);
|
||||||
|
if (Tokens.TryGetValue(key, out var tokenDefinition)) return tokenDefinition;
|
||||||
|
tokenDefinition = new TokenDefinition { Id = Tokens.Count, Type = type, Text = text, CodePointLength = text.ToCodePoints().Count() };
|
||||||
|
Tokens.Add(key, tokenDefinition);
|
||||||
|
return tokenDefinition;
|
||||||
|
}
|
||||||
|
|
||||||
|
    public List<Token> Tokenize(string text)
    {
        var codePoints = text.ToCodePoints().Select(CommonNormalization.NormalizeCodePoint).ToArray();
        var results = new List<Token>();
        Action<TokenType /* tokenType */, string /* text */> Emitter(int start, int end) =>
            (tokenType, tokenText) => results.Add(new Token { Id = EnsureToken(tokenType, tokenText).Id, Start = start, End = end });

        void EmitMaybeJapanese(ReadOnlyMemory<int> japaneseCodePoints, int offset)
        {
            foreach (var combination in TranscriptionProvider.EnumerateKanaTranscriptions(japaneseCodePoints))
            {
                var emit = Emitter(offset + combination.Start, offset + combination.Start + combination.Length);
                foreach (var transcription in combination.Transcriptions) emit(TokenType.Kana, transcription);
            }
            foreach (var combination in TranscriptionProvider.EnumerateRomajiTranscriptions(japaneseCodePoints))
            {
                var emit = Emitter(offset + combination.Start, offset + combination.Start + combination.Length);
                foreach (var transcription in combination.Transcriptions) emit(TokenType.Romaji, transcription);
            }
            for (int i = 0; i < japaneseCodePoints.Length; i++)
            {
                // A single character may have not only kana readings, but also Chinese pronunciations or Simplified/Traditional/Japanese variants.
                var hanAlternates = HanVariantProvider.GetHanVariants(japaneseCodePoints.Span[i]); // All possible variant characters (Simplified/Traditional/Japanese)
                var pinyinAlternates = hanAlternates.SelectMany(PinyinHelper.GetPinyinCandidates).Distinct();
                var emit = Emitter(offset + i, offset + i + 1);
                foreach (var han in hanAlternates) emit(TokenType.Han, char.ConvertFromUtf32(han));
                foreach (var pinyin in pinyinAlternates) emit(TokenType.Pinyin, pinyin);
            }
        }

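        // Each entry pairs a character-class predicate with an emitter for a maximal run
        // of that class; currently only Japanese-looking runs get dedicated handling.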
        var consecutiveCharsets = new (Func<int, bool> Is, Action<ReadOnlyMemory<int>, int> Emit)[]
        {
            (Is: JapaneseUtils.IsMaybeJapanese, Emit: EmitMaybeJapanese),
        };

        void EmitRaw(int codePoint, int offset) => Emitter(offset, offset + 1)(TokenType.Raw, char.ConvertFromUtf32(codePoint));

        for (int start = 0; start < codePoints.Length; )
        {
            var codePoint = codePoints[start];
            var emitted = false;
            foreach (var (Is, Emit) in consecutiveCharsets)
            {
                var length = 0;
                while (start + length < codePoints.Length && Is(codePoints[start + length])) length++;
                if (length > 0)
                {
                    Emit(new Memory<int>(codePoints, start, length), start);
                    start += length;
                    emitted = true;
                    break;
                }
            }
            if (emitted) continue;

            // Skip whitespace
            if (CommonUtils.IsWhitespace(codePoint))
            {
                start++;
                continue;
            }

            EmitRaw(codePoint, start);
            start++;
        }
        return results;
    }
}
@@ -0,0 +1,93 @@
using MaigoLabs.NeedLe.Common;

namespace MaigoLabs.NeedLe.Indexer.Trie;

public static class TrieBuilder
{
    private static TrieNode NewNode(TrieNode? parent) => new() { Parent = parent, Children = [], TokenIds = [], SubTreeTokenIds = [] };

    public static TrieNode BuildTrie(IEnumerable<(int Id, IEnumerable<int> CodePoints)> tokens)
    {
        var root = NewNode(null);
        foreach (var (id, codePoints) in tokens)
        {
            var node = root;
            foreach (var codePoint in codePoints)
            {
                node.Children.TryGetValue(codePoint, out var childNode);
                if (childNode == null) node.Children[codePoint] = childNode = NewNode(node);
                node = childNode;
                node.SubTreeTokenIds.Add(id);
            }
            node.TokenIds.Add(id);
        }
        return root;
    }

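    // A graft rule (From, To) links the spelling `From` to the node already reachable
    // via `To`, so both spellings resolve to the same subtree of tokens. Rules are
    // re-applied to nodes created by earlier grafts until no new edge appears.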
    public static void GraftTriePaths(TrieNode root, IEnumerable<(int[] From, int[] To)> rules)
    {
        foreach (var (inputPhrase, graftTo) in rules)
            if (graftTo.Length > inputPhrase.Length)
                throw new ArgumentException($"Graft rule [{string.Join(", ", inputPhrase)}] -> [{string.Join(", ", graftTo)}] maps to a longer string and may cause an infinite loop");
        var visitedNodes = new HashSet<TrieNode>();
        void GraftFromNode(TrieNode node, bool recursiveChildren)
        {
            if (!visitedNodes.Add(node)) return;
            if (recursiveChildren) foreach (var child in node.Children.Values) GraftFromNode(child, true);
            while (true)
            {
                var nodesWithNewGraftedChildren = new Dictionary<TrieNode, /* depth from initial node */ int>();
                foreach (var (inputPhrase, graftTo) in rules)
                {
                    var targetNode = node.Traverse(graftTo);
                    if (targetNode == null) continue;
                    var graftedPath = new TrieNode[inputPhrase.Length - 1];
                    var isGrafted = false;
                    var currentNode = node;
                    for (var i = 0; i < inputPhrase.Length; i++)
                    {
                        var codePoint = inputPhrase[i];
                        currentNode.Children.TryGetValue(codePoint, out var childNode);
                        if (i == inputPhrase.Length - 1)
                        {
                            if (childNode != null)
                            {
                                if (childNode != targetNode) throw new ArgumentException($"Grafted path [{string.Join(", ", inputPhrase)}] conflicts with an existing path");
                                // Already grafted
                            }
                            else
                            {
                                currentNode.Children[codePoint] = childNode = targetNode;
                                isGrafted = true;
                            }
                        }
                        else
                        {
                            if (childNode == null)
                            {
                                childNode = NewNode(currentNode);
                                childNode.SubTreeTokenIds = targetNode.SubTreeTokenIds;
                                currentNode.Children[codePoint] = childNode;
                            }
                            else
                            {
                                // Part of another grafted path?
                                childNode.SubTreeTokenIds = new HashSet<int>(childNode.SubTreeTokenIds.Concat(targetNode.SubTreeTokenIds)).ToList();
                            }
                            graftedPath[i] = currentNode = childNode;
                        }
                    }
                    if (isGrafted) for (var i = 0; i < graftedPath.Length; i++) nodesWithNewGraftedChildren[graftedPath[i]] = i + 1;
                }
                if (nodesWithNewGraftedChildren.Count > 0)
                {
                    // Re-check graft rules on the newly grafted path:
                    // 1. No need to recurse into other children (not on this path) since their children are not affected.
                    // 2. No need to consider ancestors of this node since they're handled later (we run in DFS order).
                    var sortedNodes = nodesWithNewGraftedChildren.OrderByDescending(x => x.Value);
                    foreach (var (changedNode, _) in sortedNodes) GraftFromNode(changedNode, false);
                }
                else break; // No new grafts applied
            }
        }
        GraftFromNode(root, true);
    }
}
@@ -0,0 +1,41 @@
using MaigoLabs.NeedLe.Common;

namespace MaigoLabs.NeedLe.Indexer.Trie;

public static class TrieSerializer
{
    private class NodeEntry
    {
        public int Id { get; set; }
        public bool Visited { get; set; }
        public int[]? Data { get; set; }
    }

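    // Serialized layout, per node (nodes concatenated in Id order; Ids are 1-based):
    //   [parentId, childCodePoint*, childNodeId*, (-(tokenId + 1))* or a single 0]
    // Code points and node ids are strictly positive, so a value <= 0 terminates the
    // children list. For example, a root (no parent) whose only child 'a' (97) is
    // node 2 and which carries no token ids serializes as: 0, 97, 2, 0.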
    public static int[] Serialize(TrieNode root)
    {
        var nodeEntries = new Dictionary<TrieNode, NodeEntry>();
        var currentId = 0;
        NodeEntry GetNodeEntry(TrieNode node) => nodeEntries.TryGetValue(node, out var nodeEntry) ? nodeEntry :
            nodeEntries[node] = new NodeEntry { Id = ++currentId, Visited = false, Data = null };
        int SerializeNode(TrieNode node)
        {
            var entry = GetNodeEntry(node);
            if (entry.Visited) return entry.Id;
            entry.Visited = true;
            var children = node.Children.Select(child => (CodePoint: child.Key, ChildId: SerializeNode(child.Value))).ToArray();
            entry.Data =
            [
                node.Parent != null ? GetNodeEntry(node.Parent).Id : 0,
                .. children.Select(child => child.CodePoint),
                .. children.Select(child => child.ChildId),
                // End of children list (values <= 0 are neither valid code points nor node IDs)
                .. node.TokenIds.Count > 0
                    ? node.TokenIds.Select(tokenId => -(tokenId + 1)) // Use the negative value of (tokenId + 1)
                    : [0], // End of children list, no token IDs (token IDs are encoded as negative values)
            ];
            return entry.Id;
        }
        SerializeNode(root);
        return nodeEntries.Values.OrderBy(entry => entry.Id).SelectMany(entry => entry.Data ?? []).ToArray();
    }
}
@@ -0,0 +1,18 @@
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <TargetFramework>net10.0</TargetFramework>
    <OutputType>Exe</OutputType>
    <RootNamespace>$(ProjectName).Playground</RootNamespace>
    <AssemblyName>$(RootNamespace)</AssemblyName>
  </PropertyGroup>

  <ItemGroup>
    <PackageReference Include="Telegram.Bot" />
  </ItemGroup>

  <ItemGroup>
    <ProjectReference Include="..\MaigoLabs.NeedLe\MaigoLabs.NeedLe.csproj" />
  </ItemGroup>

</Project>
@@ -0,0 +1,162 @@
using System.Diagnostics;
using System.Text.Encodings.Web;
using System.Text.Json;
using MaigoLabs.NeedLe.Common.Extensions;
using MaigoLabs.NeedLe.Indexer;
using MaigoLabs.NeedLe.Searcher;
using Telegram.Bot;
using Telegram.Bot.Polling;
using Telegram.Bot.Types;
using Telegram.Bot.Types.Enums;

namespace MaigoLabs.NeedLe.Playground;

public class Program
{
    private static LoadedInvertedIndex _invertedIndex = null!;
    private static long _targetChatId;

    public static async Task Main(string[] args)
    {
        var botToken = Environment.GetEnvironmentVariable("TELEGRAM_BOT_TOKEN")
            ?? throw new InvalidOperationException("Missing environment variable TELEGRAM_BOT_TOKEN");
        var targetChatIdStr = Environment.GetEnvironmentVariable("TARGET_CHAT_ID")
            ?? throw new InvalidOperationException("Missing environment variable TARGET_CHAT_ID");
        _targetChatId = long.Parse(targetChatIdStr);

        // Build inverted index
        var exampleDocuments = File.ReadAllLines("../../example.txt").Where(line => line.Length > 0).ToArray();

        var startBuild = Stopwatch.GetTimestamp();
        var compressed = InvertedIndexBuilder.BuildInvertedIndex(exampleDocuments);
        var endBuild = Stopwatch.GetTimestamp();
        Console.WriteLine($"Built inverted index in {Stopwatch.GetElapsedTime(startBuild, endBuild).TotalMilliseconds}ms");

        var startLoad = Stopwatch.GetTimestamp();
        _invertedIndex = InvertedIndexLoader.Load(compressed);
        var endLoad = Stopwatch.GetTimestamp();
        Console.WriteLine($"Loaded inverted index in {Stopwatch.GetElapsedTime(startLoad, endLoad).TotalMilliseconds}ms");

        // Start bot
        var bot = new TelegramBotClient(botToken);
        var me = await bot.GetMe();
        Console.WriteLine($"Bot logged in as {me.FirstName} (@{me.Username})");

        using var cts = new CancellationTokenSource();
        Console.CancelKeyPress += (_, e) => { e.Cancel = true; cts.Cancel(); };

        bot.StartReceiving(
            updateHandler: HandleUpdateAsync,
            errorHandler: HandleErrorAsync,
            receiverOptions: new ReceiverOptions { AllowedUpdates = [UpdateType.Message] },
            cancellationToken: cts.Token
        );
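        // Block until Ctrl+C: the infinite delay is canceled through cts, and the
        // empty continuation swallows the resulting cancellation exception.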
        await Task.Delay(-1, cts.Token).ContinueWith(_ => { });
    }

    private static async Task HandleUpdateAsync(ITelegramBotClient bot, Update update, CancellationToken ct)
    {
        if (update.Message is not { Text: { } text, Chat.Id: var chatId, From: { } from }) return;

        Console.WriteLine($"{chatId}:{from.Id} {JsonSerializer.Serialize(text, JsonSerializerOptions)}");

        if (chatId != _targetChatId) return;

        if (text.StartsWith("/needle "))
        {
            var query = text["/needle ".Length..];
            var response = HandleNeedleCommand(query);
            await bot.SendMessage(chatId, response, parseMode: ParseMode.Html, cancellationToken: ct);
        }
        else if (text.StartsWith("/tokenize "))
        {
            var query = text["/tokenize ".Length..];
            var response = HandleTokenizeCommand(query);
            await bot.SendMessage(chatId, response, parseMode: ParseMode.Html, cancellationToken: ct);
        }
    }

    private static Task HandleErrorAsync(ITelegramBotClient bot, Exception exception, HandleErrorSource source, CancellationToken ct)
    {
        Console.WriteLine($"Error: {exception.Message}");
        return Task.CompletedTask;
    }

    private static string HandleNeedleCommand(string query)
    {
        var startSearch = Stopwatch.GetTimestamp();
        var results = InvertedIndexSearcher.Search(_invertedIndex, query);
        var endSearch = Stopwatch.GetTimestamp();
        var searchDuration = Stopwatch.GetElapsedTime(startSearch, endSearch).TotalMilliseconds.ToString("F3");

        if (results.Length == 0)
            return Codify($"No results found after {searchDuration}ms");

        var showingResults = results.Take(5).ToArray();
        return string.Join('\n',
        [
            Codify($"Search completed in {searchDuration}ms, showing {showingResults.Length}/{results.Length} results:\n"),
            .. showingResults.Select(result => InspectSearchResult(result, true))
        ]).TrimEnd();
    }

    private static string HandleTokenizeCommand(string query)
    {
        var tokenizer = new Tokenizer();
        var startTokenize = Stopwatch.GetTimestamp();
        var tokens = tokenizer.Tokenize(query);
        var tokenDefinitions = tokenizer.Tokens.Values.ToArray();
        var endTokenize = Stopwatch.GetTimestamp();
        var tokenizeDuration = Stopwatch.GetElapsedTime(startTokenize, endTokenize).TotalMilliseconds.ToString("F3");
        if (tokens.Count == 0) return Codify($"No tokens emitted after {tokenizeDuration}ms");

        var codePoints = query.ToCodePoints().ToArray();
        var lines = new List<string>
        {
            $"Tokenization completed in {tokenizeDuration}ms, emitted {tokens.Count} tokens:"
        };
        foreach (var token in tokens)
        {
            var tokenDef = tokenDefinitions[token.Id];
            var originalPhrase = codePoints.Skip(token.Start).Take(token.End - token.Start).ToUtf32String();
            lines.Add($"  {tokenDef.Type}: {JsonSerializer.Serialize(tokenDef.Text, JsonSerializerOptions)} <- {JsonSerializer.Serialize(originalPhrase, JsonSerializerOptions)} [{token.Start}, {token.End}]");
        }
        return Codify(string.Join('\n', lines));
    }

    private static string InspectSearchResult(SearchResult result, bool htmlHighlight)
    {
        var documentText = result.DocumentText;
        var documentCodePoints = result.DocumentCodePoints;
        var tokens = result.Tokens;
        var rangeCount = result.RangeCount;
        var matchRatio = result.MatchRatio;
        var matchRatioLevel = result.MatchRatioLevel;

        var resultText = htmlHighlight
            ? string.Join("", SearchResultHighlighter.Highlight(result).Select(part => !part.IsHighlighted ? EscapeHtml(part.Text) : $"<u><b>{EscapeHtml(part.Text)}</b></u>"))
            : documentText;
        var description = $" ({rangeCount} ranges, {Math.Round(matchRatio * 10000) / 10000} => L{matchRatioLevel})";
        return string.Join('\n',
        [
            resultText + (htmlHighlight ? $"<code>{description}</code>" : description),
            .. tokens.Select(token =>
            {
                var escapedTokenText = JsonSerializer.Serialize(token.Definition.Text, JsonSerializerOptions);
                var escapedDocumentText = JsonSerializer.Serialize(documentCodePoints.Skip(token.DocumentOffset.Start).Take(token.DocumentOffset.Length).ToUtf32String(), JsonSerializerOptions);
                if (htmlHighlight)
                {
                    escapedTokenText = EscapeHtml(escapedTokenText);
                    escapedDocumentText = EscapeHtml(escapedDocumentText);
                }
                var line = $"  {token.Definition.Type}: {escapedTokenText} -> {escapedDocumentText}" + (token.IsTokenPrefixMatching ? " (prefix match)" : "");
                return htmlHighlight ? $"<code>{line}</code>" : line;
            }),
            "",
        ]);
    }

    private static string Codify(string text) => $"<code>{EscapeHtml(text)}</code>";
    private static JsonSerializerOptions JsonSerializerOptions => new() { Encoder = JavaScriptEncoder.UnsafeRelaxedJsonEscaping };
    private static string EscapeHtml(string text) => text.Replace("&", "&amp;").Replace("<", "&lt;").Replace(">", "&gt;");
}
@@ -0,0 +1,72 @@
using MaigoLabs.NeedLe.Common;
using MaigoLabs.NeedLe.Common.Extensions;
using MaigoLabs.NeedLe.Common.Types;
using MaigoLabs.NeedLe.Searcher.Trie;

namespace MaigoLabs.NeedLe.Searcher;

public class LoadedInvertedIndex
{
    public class TokenDocumentReference
    {
        public required int DocumentId { get; set; }
        public required OffsetSpan[] Offsets { get; set; }
    }

    public class TokenDefinitionExtended : TokenDefinition
    {
        public required TokenDocumentReference[] References { get; set; }
    }

    public class TypedTries
    {
        public required TrieNode Romaji { get; set; }
        public required TrieNode Kana { get; set; }
        public required TrieNode Other { get; set; }
    }

    public required string[] Documents { get; set; }
    public required int[][] DocumentCodePoints { get; set; }
    public required TokenDefinitionExtended[] TokenDefinitions { get; set; }
    public required TypedTries Tries { get; set; }
}

public class InvertedIndexLoader
{
    public static LoadedInvertedIndex Load(CompressedInvertedIndex compressed)
    {
        var documents = compressed.documents;
        var documentCodePoints = documents.Select(document => document.ToCodePoints().ToArray()).ToArray();

        var romajiTrie = TrieDeserializer.Deserialize(compressed.tries.romaji);
        var kanaTrie = TrieDeserializer.Deserialize(compressed.tries.kana);
        var otherTrie = TrieDeserializer.Deserialize(compressed.tries.other);

        var tokenCodePoints = romajiTrie.TokenCodePoints.Concat(kanaTrie.TokenCodePoints).Concat(otherTrie.TokenCodePoints)
            .ToDictionary(entry => entry.Key, entry => entry.Value);
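        // Each compressed token reference is a flat int array shaped like
        // [documentId, start1, end1, start2, end2, ...], which unpacks into one
        // TokenDocumentReference carrying (data.Length / 2) offset spans.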
        var tokenDefinitions = compressed.tokenTypes.Select((type, index) => new LoadedInvertedIndex.TokenDefinitionExtended
        {
            Id = index, Type = (TokenType)type, Text = tokenCodePoints[index].ToUtf32String(),
            CodePointLength = tokenCodePoints[index].Length,
            References = compressed.tokenReferences[index].Select(data => new LoadedInvertedIndex.TokenDocumentReference
            {
                DocumentId = data[0],
                Offsets = Enumerable.Range(0, data.Length / 2)
                    .Select(i => new OffsetSpan { Start = data[i * 2 + 1], End = data[i * 2 + 2] }).ToArray(),
            }).ToArray(),
        }).ToArray();

        return new LoadedInvertedIndex
        {
            Documents = documents,
            DocumentCodePoints = documentCodePoints,
            TokenDefinitions = tokenDefinitions,
            Tries = new LoadedInvertedIndex.TypedTries
            {
                Romaji = romajiTrie.Root,
                Kana = kanaTrie.Root,
                Other = otherTrie.Root,
            },
        };
    }
}
@@ -0,0 +1,270 @@
using MaigoLabs.NeedLe.Common;
using MaigoLabs.NeedLe.Common.Extensions;
using MaigoLabs.NeedLe.Common.Types;

namespace MaigoLabs.NeedLe.Searcher;

public class SearchResultToken
{
    public required TokenDefinition Definition { get; set; }
    public required OffsetSpan DocumentOffset { get; set; }
    public required OffsetSpan InputOffset { get; set; }
    public required bool IsTokenPrefixMatching { get; set; }
}

public class SearchResult
{
    public required int DocumentId { get; set; }
    public required string DocumentText { get; set; }
    public required int[] DocumentCodePoints { get; set; }
    public required SearchResultToken[] Tokens { get; set; }
    public required int PrefixMatchCount { get; set; }
    public required int RangeCount { get; set; }
    public required double MatchRatio { get; set; }
    public required int MatchRatioLevel { get; set; }
}

public static class InvertedIndexSearcher
{
    public abstract class ComparableStateBase<T> : IComparable<T>
        where T : ComparableStateBase<T>
    {
        protected abstract int GetRangeCount();
        protected abstract int GetPrefixMatchCount();
        protected abstract OffsetSpan GetFirstTokenDocumentOffset();
        protected abstract OffsetSpan GetLastTokenDocumentOffset();
        protected virtual SearchResultToken? GetLastToken() => null; // Not available on intermediate results
        protected virtual int? GetMatchRatioLevel() => null; // Not available on intermediate/candidate results
        protected abstract double GetMatchRatio();
        protected virtual int FallbackCompareTo(T other) => 0; // Called when all other comparisons are equal

        public int CompareTo(T other)
        {
            // Prefer matches that do not rely on end-of-input loose matching (full match over prefix match)
            SearchResultToken? aLastToken = GetLastToken(), bLastToken = other.GetLastToken();
            if (aLastToken != null && bLastToken != null)
            {
                var aDidPrefixMatchByTokenType = aLastToken.IsTokenPrefixMatching && tokenTypePrefixMatchingPolicy[aLastToken.Definition.Type] == TokenTypePrefixMatchingPolicy.AllowOnlyAtInputEnd;
                var bDidPrefixMatchByTokenType = bLastToken.IsTokenPrefixMatching && tokenTypePrefixMatchingPolicy[bLastToken.Definition.Type] == TokenTypePrefixMatchingPolicy.AllowOnlyAtInputEnd;
                if (aDidPrefixMatchByTokenType != bDidPrefixMatchByTokenType) return aDidPrefixMatchByTokenType ? 1 : -1;
            }

            // Prefer results that matched fewer discontinuous ranges
            int aRangeCount = GetRangeCount(), bRangeCount = other.GetRangeCount();
            if (aRangeCount != bRangeCount) return aRangeCount - bRangeCount;

            // Prefer results whose first token matches earlier in the document
            OffsetSpan aFirstTokenDocumentOffset = GetFirstTokenDocumentOffset(), bFirstTokenDocumentOffset = other.GetFirstTokenDocumentOffset();
            if (aFirstTokenDocumentOffset.Start != bFirstTokenDocumentOffset.Start) return aFirstTokenDocumentOffset.Start - bFirstTokenDocumentOffset.Start;

            // Prefer results with a higher match ratio (but don't distinguish similar ratios, hence `matchRatioLevel`)
            int? aMatchRatioLevel = GetMatchRatioLevel(), bMatchRatioLevel = other.GetMatchRatioLevel();
            if (aMatchRatioLevel != null && bMatchRatioLevel != null)
            {
                if (aMatchRatioLevel.Value != bMatchRatioLevel.Value) return bMatchRatioLevel.Value - aMatchRatioLevel.Value;
            }

            // Prefer results whose last token occurs earlier (and, if tied, ends earlier) in the document
            OffsetSpan aLastTokenDocumentOffset = GetLastTokenDocumentOffset(), bLastTokenDocumentOffset = other.GetLastTokenDocumentOffset();
            if (aLastTokenDocumentOffset.Start != bLastTokenDocumentOffset.Start) return aLastTokenDocumentOffset.Start - bLastTokenDocumentOffset.Start;
            if (aLastTokenDocumentOffset.End != bLastTokenDocumentOffset.End) return aLastTokenDocumentOffset.End - bLastTokenDocumentOffset.End;

            // Prefer results with a higher match ratio (compared precisely)
            double aMatchRatio = GetMatchRatio(), bMatchRatio = other.GetMatchRatio();
            if (aMatchRatio != bMatchRatio) return bMatchRatio < aMatchRatio ? -1 : 1;

            return FallbackCompareTo(other);
        }
    }

    public class IntermediateResult : ComparableStateBase<IntermediateResult>
    {
        public required IntermediateResult? PreviousState { get; init; }
        public required OffsetSpan FirstTokenDocumentOffset { get; init; }
        public required int RangeCount { get; init; }
        public required int TokenCount { get; init; }
        public required int PrefixMatchCount { get; init; }
        public required double MatchedTokenLength { get; init; }
        public required int TokenId { get; init; }
        public required OffsetSpan DocumentOffset { get; init; }
        public required OffsetSpan InputOffset { get; init; }
        public required bool IsTokenPrefixMatching { get; init; }

        protected override int GetRangeCount() => RangeCount;
        protected override int GetPrefixMatchCount() => PrefixMatchCount;
        protected override OffsetSpan GetFirstTokenDocumentOffset() => FirstTokenDocumentOffset;
        protected override OffsetSpan GetLastTokenDocumentOffset() => DocumentOffset;
        protected override double GetMatchRatio() => MatchedTokenLength; // No need to divide by document length since intermediate results are for the same document
    }

    public class CandidateResult : ComparableStateBase<CandidateResult>
    {
        public required SearchResultToken[] Tokens { get; init; }
        public required int PrefixMatchCount { get; init; }
        public required double MatchedTokenLength { get; init; }
        public required int RangeCount { get; init; }

        protected override int GetRangeCount() => RangeCount;
        protected override int GetPrefixMatchCount() => PrefixMatchCount;
        protected override OffsetSpan GetFirstTokenDocumentOffset() => Tokens[0].DocumentOffset;
        protected override OffsetSpan GetLastTokenDocumentOffset() => Tokens[^1].DocumentOffset;
        protected override SearchResultToken? GetLastToken() => Tokens[^1];
        protected override double GetMatchRatio() => MatchedTokenLength; // No need to divide by document length since candidate results are for the same document
    }

    public class FinalResult : ComparableStateBase<FinalResult>
    {
        public required SearchResult Result { get; init; }

        protected override int GetRangeCount() => Result.RangeCount;
        protected override int GetPrefixMatchCount() => Result.PrefixMatchCount;
        protected override OffsetSpan GetFirstTokenDocumentOffset() => Result.Tokens[0].DocumentOffset;
        protected override OffsetSpan GetLastTokenDocumentOffset() => Result.Tokens[^1].DocumentOffset;
        protected override SearchResultToken? GetLastToken() => Result.Tokens[^1];
        protected override double GetMatchRatio() => Result.MatchRatio;
        protected override int? GetMatchRatioLevel() => Result.MatchRatioLevel;
        protected override int FallbackCompareTo(FinalResult other) => string.Compare(Result.DocumentText, other.Result.DocumentText, StringComparison.InvariantCulture);
    }

    private static bool IsIgnorableCodePoint(int codePoint) => CommonUtils.IsWhitespace(codePoint) || codePoint == 0x3099 || codePoint == 0x309A;

    public enum TokenTypePrefixMatchingPolicy
    {
        AlwaysAllow,
        NeverAllow,
        AllowOnlyAtInputEnd,
    }

    private static readonly Dictionary<TokenType, TokenTypePrefixMatchingPolicy> tokenTypePrefixMatchingPolicy = new()
    {
        [TokenType.Romaji] = TokenTypePrefixMatchingPolicy.NeverAllow,
        [TokenType.Kana] = TokenTypePrefixMatchingPolicy.AlwaysAllow,
        // These token types live in the "other" trie
        [TokenType.Han] = TokenTypePrefixMatchingPolicy.AllowOnlyAtInputEnd, // No effect because always 1 code point
        [TokenType.Pinyin] = TokenTypePrefixMatchingPolicy.AllowOnlyAtInputEnd,
        [TokenType.Raw] = TokenTypePrefixMatchingPolicy.AllowOnlyAtInputEnd, // No effect because always 1 code point
    };

    private static bool ShouldAllowPrefixMatching(TokenType tokenType, bool isAtInputEnd) =>
        tokenTypePrefixMatchingPolicy[tokenType] == TokenTypePrefixMatchingPolicy.AlwaysAllow ||
        (tokenTypePrefixMatchingPolicy[tokenType] != TokenTypePrefixMatchingPolicy.NeverAllow && isAtInputEnd);

    private static bool HasNonEmptyCharacters(int[] documentCodePoints, int start, int end) =>
        start != end && !documentCodePoints.Skip(start).Take(end - start).All(CommonUtils.IsWhitespace);

    public static SearchResult[] Search(LoadedInvertedIndex invertedIndex, string text)
    {
        var documents = invertedIndex.Documents;
        var documentCodePoints = invertedIndex.DocumentCodePoints;
        var tokenDefinitions = invertedIndex.TokenDefinitions;
        var tries = invertedIndex.Tries;

        var codePoints = text.ToCodePoints().Select(CommonNormalization.NormalizeCodePoint).Select(CommonNormalization.ToKatakana).ToArray();
        if (codePoints.Length == 0) return []; // Guard: dp[^1] below would throw on an empty query
        // dp[i] = docId => end => IntermediateResult, starts from dp[-1] (l == 0), ends at dp[N - 1] (r == N - 1)
        var dp = Enumerable.Range(0, codePoints.Length).Select(_ => new Dictionary<int, Dictionary<int, IntermediateResult>>()).ToArray();
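        // For every query substring [l, r], walk all three tries in lockstep; for each
        // token that substring can match, extend the best chains recorded at dp[l - 1]
        // for the same document. dp[r] then maps document => match-end offset => best
        // intermediate chain covering the input up to r.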
        for (var l = 0; l < codePoints.Length; l++)
        {
            if (l != 0 && dp[l - 1].Count == 0) continue; // No documents match the input from the beginning to this position
            var romajiNode = tries.Romaji;
            var kanaNode = tries.Kana;
            var otherNode = tries.Other;
            for (var r = l; r < codePoints.Length && (romajiNode != null || kanaNode != null || otherNode != null); r++) // [l, r]
            {
                var codePoint = codePoints[r];
                romajiNode = romajiNode.TraverseStep(codePoint, IsIgnorableCodePoint(codePoint));
                kanaNode = kanaNode.TraverseStep(codePoint, IsIgnorableCodePoint(codePoint));
                otherNode = otherNode.TraverseStep(codePoint, IsIgnorableCodePoint(codePoint));
                var reachingInputEnd = r == codePoints.Length - 1;
                HashSet<int> matchingTokenIds =
                [
                    // Allow prefix matching of romaji/other tokens only when we're at the end of the input
                    .. romajiNode.GetTokenIds(ShouldAllowPrefixMatching(TokenType.Romaji, reachingInputEnd)),
                    .. kanaNode.GetTokenIds(ShouldAllowPrefixMatching(TokenType.Kana, reachingInputEnd)),
                    .. otherNode.GetTokenIds(reachingInputEnd),
                ];
                foreach (var tokenId in matchingTokenIds) foreach (var reference in tokenDefinitions[tokenId].References)
                {
                    var isTokenPrefixMatching = !romajiNode.IsTokenExactMatch(tokenId) && !kanaNode.IsTokenExactMatch(tokenId) && !otherNode.IsTokenExactMatch(tokenId);
                    var previousMatchesOfDocument = l != 0 && dp[l - 1].TryGetValue(reference.DocumentId, out var previousMatches) ? previousMatches : null;
                    if (l != 0 && previousMatchesOfDocument == null) continue;
                    foreach (var documentOffset in reference.Offsets)
                    {
                        int currentStart = documentOffset.Start, currentEnd = documentOffset.End;
                        if (l == 0) ContributeNextMatchingState(null);
                        else foreach (var (previousEnd, previousMatch) in previousMatchesOfDocument!) if (currentStart >= previousEnd) ContributeNextMatchingState(previousMatch);
                        void ContributeNextMatchingState(IntermediateResult? previousState)
                        {
                            var nextMatchingMap = dp[r];
                            if (!nextMatchingMap.TryGetValue(reference.DocumentId, out var nextMatches)) nextMatches = nextMatchingMap[reference.DocumentId] = [];
                            var oldResult = nextMatches.TryGetValue(currentEnd, out var result) ? result : null;
                            var inputOffset = new OffsetSpan { Start = l, End = r + 1 };
                            var newResult = new IntermediateResult
                            {
                                PreviousState = previousState,
                                FirstTokenDocumentOffset = previousState?.FirstTokenDocumentOffset ?? documentOffset,
                                RangeCount = previousState == null ? 1 :
                                    previousState.RangeCount + (HasNonEmptyCharacters(documentCodePoints[reference.DocumentId], previousState.DocumentOffset.End, currentStart) ? 1 : 0),
                                TokenCount = (previousState?.TokenCount ?? 0) + 1,
                                PrefixMatchCount = (previousState?.PrefixMatchCount ?? 0) + (isTokenPrefixMatching ? 1 : 0),
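                                // A prefix match contributes only the fraction of the token the input
                                // covered (input length / token code point length), capped at 1; exact
                                // matches contribute the full matched document span length.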
                                MatchedTokenLength = (previousState?.MatchedTokenLength ?? 0) + documentOffset.Length *
                                    Math.Min(isTokenPrefixMatching ? (double)inputOffset.Length / tokenDefinitions[tokenId].CodePointLength : double.PositiveInfinity, 1),
                                TokenId = tokenId,
                                DocumentOffset = documentOffset,
                                InputOffset = inputOffset,
                                IsTokenPrefixMatching = isTokenPrefixMatching,
                            };
                            nextMatches[currentEnd] = oldResult == null || newResult.CompareTo(oldResult) < 0 ? newResult : oldResult;
                        }
                    }
                }
            }
        }

        // Build search results and sort documents
        return dp[codePoints.Length - 1].Select(entry =>
        {
            var (documentId, matches) = entry;
            var sortedMatches = matches.Values.Select(match =>
            {
                var tokens = new List<SearchResultToken>();
                // Build the token list by backtracking through previous states
                var state = match;
                while (state != null)
                {
                    tokens.Add(new SearchResultToken
                    {
                        Definition = tokenDefinitions[state.TokenId],
                        DocumentOffset = state.DocumentOffset, InputOffset = state.InputOffset,
                        IsTokenPrefixMatching = state.IsTokenPrefixMatching,
                    });
                    state = state.PreviousState;
                }
                tokens.Reverse();
                return new CandidateResult
                {
                    Tokens = tokens.ToArray(),
                    PrefixMatchCount = match.PrefixMatchCount,
                    MatchedTokenLength = match.MatchedTokenLength,
                    RangeCount = match.RangeCount,
                };
            }).OrderBy(match => match);
            var bestMatch = sortedMatches.First();
            var documentText = documents[documentId];
            var matchRatio = bestMatch.MatchedTokenLength / documentCodePoints[documentId].Length;
            var matchRatioLevel = (int)Math.Round(matchRatio * 5);
            return new FinalResult
            {
                Result = new SearchResult
                {
                    DocumentId = documentId,
                    DocumentText = documentText,
                    DocumentCodePoints = documentCodePoints[documentId],
                    Tokens = bestMatch.Tokens,
                    PrefixMatchCount = bestMatch.PrefixMatchCount,
                    RangeCount = bestMatch.RangeCount,
                    MatchRatio = matchRatio,
                    MatchRatioLevel = matchRatioLevel,
                }
            };
        }).OrderBy(result => result).Select(result => result.Result).ToArray();
    }
}
@@ -0,0 +1,23 @@
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <TargetFramework>netstandard2.0</TargetFramework>
    <OutputType>Library</OutputType>
    <RootNamespace>$(ProjectName).Searcher</RootNamespace>
    <AssemblyName>$(RootNamespace)</AssemblyName>
  </PropertyGroup>

  <PropertyGroup>
    <IsPackable>true</IsPackable>
    <PackageId>$(RootNamespace)</PackageId>
  </PropertyGroup>

  <ItemGroup>
    <ProjectReference Include="..\MaigoLabs.NeedLe.Common\MaigoLabs.NeedLe.Common.csproj" />
  </ItemGroup>

  <ItemGroup>
    <PackageReference Include="DotNetCampus.LatestCSharpFeatures" PrivateAssets="all" />
  </ItemGroup>

</Project>
@@ -0,0 +1,37 @@
using MaigoLabs.NeedLe.Common.Extensions;
using MaigoLabs.NeedLe.Common.Types;

namespace MaigoLabs.NeedLe.Searcher;

public class HighlightedTextPart
{
    public required string Text { get; init; }
    public required bool IsHighlighted { get; init; }
}

public static class SearchResultHighlighter
{
    public static List<HighlightedTextPart> Highlight(SearchResult resultDocument)
    {
        var result = new List<HighlightedTextPart>();
        var previousHighlightEnd = 0;
        foreach (var token in resultDocument.Tokens)
        {
            var notHighlightedText = resultDocument.DocumentCodePoints.Skip(previousHighlightEnd).Take(token.DocumentOffset.Start - previousHighlightEnd).ToUtf32String();
            if (notHighlightedText.Length > 0) result.Add(new HighlightedTextPart { Text = notHighlightedText, IsHighlighted = false });
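            // For a kana prefix match, highlight only the fraction of the matched span
            // that the input covered (at least one code point); exact matches highlight
            // the whole span.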
            var highlightEnd = token.IsTokenPrefixMatching && token.Definition.Type == TokenType.Kana
                ? token.DocumentOffset.Start + Math.Max(
                    1,
                    (int)Math.Round(
                        token.DocumentOffset.Length *
                        Math.Min(1, (double)token.InputOffset.Length / token.Definition.CodePointLength)
                    )
                )
                : token.DocumentOffset.End;
            result.Add(new HighlightedTextPart { Text = resultDocument.DocumentCodePoints.Skip(token.DocumentOffset.Start).Take(highlightEnd - token.DocumentOffset.Start).ToUtf32String(), IsHighlighted = true });
            previousHighlightEnd = highlightEnd;
        }
        if (previousHighlightEnd < resultDocument.DocumentCodePoints.Length) result.Add(new HighlightedTextPart { Text = resultDocument.DocumentCodePoints.Skip(previousHighlightEnd).ToUtf32String(), IsHighlighted = false });
        return result;
    }
}
@@ -0,0 +1,73 @@
using MaigoLabs.NeedLe.Common;

namespace MaigoLabs.NeedLe.Searcher.Trie;

public class DeserializedTrie
{
    public required TrieNode Root { get; set; }
    public required Dictionary<int, int[]> TokenCodePoints { get; set; }
}

public static class TrieDeserializer
{
    public static DeserializedTrie Deserialize(int[] data)
    {
        var nodes = new List<TrieNode?>();
        TrieNode GetNode(int id)
        {
            if (id > nodes.Count) nodes.AddRange(Enumerable.Repeat<TrieNode?>(null, id - nodes.Count));
            return nodes[id - 1] ??= new TrieNode { Parent = null, Children = [], TokenIds = [], SubTreeTokenIds = [] };
        }
        var currentId = 0;
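        // Each record mirrors TrieSerializer's layout: parent id, the children's code
        // points followed by their node ids (both strictly positive), then either the
        // node's token ids encoded as -(tokenId + 1) or a single 0 terminator.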
        for (var i = 0; i < data.Length; )
        {
            var node = GetNode(++currentId);
            var parentId = data[i++];
            node.Parent = parentId != 0 ? GetNode(parentId) : null;

            var endOfChildren = i;
            while (endOfChildren < data.Length && data[endOfChildren] > 0) endOfChildren++;
            var numberOfChildren = (endOfChildren - i) / 2;
            for (var j = i; j < i + numberOfChildren; j++)
            {
                var codePoint = data[j];
                var child = GetNode(data[j + numberOfChildren]);
                node.Children.Add(codePoint, child);
            }
            i = endOfChildren;

            if (data[i] == 0) i++; // No token IDs
            else while (i < data.Length && data[i] < 0) node.TokenIds.Add(-data[i++] - 1);
        }
        var root = nodes[0]!;

        // DFS to construct code point paths for each token
        var tokenCodePoints = new Dictionary<int, int[]>();
        var currentCodePoints = new List<int>();
        void DfsCodePoints(TrieNode node)
        {
            foreach (var tokenId in node.TokenIds) tokenCodePoints.Add(tokenId, [.. currentCodePoints]);
            foreach (var (codePoint, child) in node.Children)
            {
                if (child.Parent != node) continue; // Skip grafted paths as these are not the canonical representation of the tokens
                currentCodePoints.Add(codePoint);
                DfsCodePoints(child);
                currentCodePoints.RemoveAt(currentCodePoints.Count - 1);
            }
        }
        DfsCodePoints(root);

        // DFS to construct subTreeTokenIds for each node
        var visitedNodes = new HashSet<TrieNode>();
        List<int> DfsSubTreeTokenIds(TrieNode node)
        {
            if (!visitedNodes.Add(node)) return node.SubTreeTokenIds;
            node.SubTreeTokenIds = new HashSet<int>(node.TokenIds.Concat(node.Children.Values.SelectMany(DfsSubTreeTokenIds))).ToList();
            return node.SubTreeTokenIds;
        }
        DfsSubTreeTokenIds(root);

        return new DeserializedTrie { Root = root, TokenCodePoints = tokenCodePoints };
    }
}
@@ -0,0 +1,126 @@
using MaigoLabs.NeedLe.Common;

namespace MaigoLabs.NeedLe.Tests.Common;

#region ToKatakana

public sealed class ToKatakana_ConvertsHiraganaToKatakanaTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        Assert.Equal("アイウエオ", CommonNormalization.ToKatakana("あいうえお"));
        Assert.Equal("カキクケコ", CommonNormalization.ToKatakana("かきくけこ"));
        Assert.Equal("サシスセソ", CommonNormalization.ToKatakana("さしすせそ"));
    }
}

public sealed class ToKatakana_KeepsKatakanaUnchangedTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        Assert.Equal("アイウエオ", CommonNormalization.ToKatakana("アイウエオ"));
    }
}

public sealed class ToKatakana_KeepsNonKanaUnchangedTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        Assert.Equal("abc123", CommonNormalization.ToKatakana("abc123"));
        Assert.Equal("漢字", CommonNormalization.ToKatakana("漢字"));
    }
}

public sealed class ToKatakana_HandlesMixedInputTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        Assert.Equal("アアa漢", CommonNormalization.ToKatakana("あアa漢"));
    }
}

#endregion

#region NormalizeCodePoint

public sealed class NormalizeCodePoint_ConvertsFullwidthAsciiToHalfwidthLowercaseTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        Assert.Equal('a', CommonNormalization.NormalizeCodePoint('Ａ'));
        Assert.Equal('b', CommonNormalization.NormalizeCodePoint('Ｂ'));
        Assert.Equal('c', CommonNormalization.NormalizeCodePoint('Ｃ'));
        Assert.Equal('1', CommonNormalization.NormalizeCodePoint('１'));
        Assert.Equal('2', CommonNormalization.NormalizeCodePoint('２'));
        Assert.Equal('3', CommonNormalization.NormalizeCodePoint('３'));
        Assert.Equal('!', CommonNormalization.NormalizeCodePoint('！'));
    }
}

public sealed class NormalizeCodePoint_ConvertsFullwidthSpaceToHalfwidthTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        Assert.Equal(' ', CommonNormalization.NormalizeCodePoint('　'));
    }
}

public sealed class NormalizeCodePoint_ConvertsHalfwidthKanaToFullwidthTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        Assert.Equal('ア', CommonNormalization.NormalizeCodePoint('ｱ'));
        Assert.Equal('イ', CommonNormalization.NormalizeCodePoint('ｲ'));
        Assert.Equal('ウ', CommonNormalization.NormalizeCodePoint('ｳ'));
        Assert.Equal('エ', CommonNormalization.NormalizeCodePoint('ｴ'));
        Assert.Equal('オ', CommonNormalization.NormalizeCodePoint('ｵ'));
        Assert.Equal('カ', CommonNormalization.NormalizeCodePoint('ｶ'));
    }
}

public sealed class NormalizeCodePoint_NormalizesVoicedSoundMarksTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        Assert.Equal(0x3099, CommonNormalization.NormalizeCodePoint('ﾞ')); // halfwidth voiced -> combining
        Assert.Equal(0x309A, CommonNormalization.NormalizeCodePoint('ﾟ')); // halfwidth semi-voiced -> combining
        Assert.Equal(0x3099, CommonNormalization.NormalizeCodePoint('゛')); // fullwidth voiced -> combining
        Assert.Equal(0x309A, CommonNormalization.NormalizeCodePoint('゜')); // fullwidth semi-voiced -> combining
    }
}

public sealed class NormalizeCodePoint_ConvertsHalfwidthPunctuationToFullwidthTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        Assert.Equal('。', CommonNormalization.NormalizeCodePoint('｡'));
        Assert.Equal('「', CommonNormalization.NormalizeCodePoint('｢'));
        Assert.Equal('」', CommonNormalization.NormalizeCodePoint('｣'));
        Assert.Equal('、', CommonNormalization.NormalizeCodePoint('､'));
        Assert.Equal('・', CommonNormalization.NormalizeCodePoint('･'));
    }
}

public sealed class NormalizeCodePoint_LowercasesRegularAsciiTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        Assert.Equal('a', CommonNormalization.NormalizeCodePoint('A'));
        Assert.Equal('b', CommonNormalization.NormalizeCodePoint('B'));
        Assert.Equal('c', CommonNormalization.NormalizeCodePoint('C'));
    }
}

#endregion
@@ -0,0 +1,91 @@
using MaigoLabs.NeedLe.Indexer;
using MaigoLabs.NeedLe.Searcher;

namespace MaigoLabs.NeedLe.Tests.E2E;

public sealed class Search_MatchesWithMixedSearchQueryTest : NeedleTestBase
{
    private static readonly string[] TestDocuments =
    [
        "ミーティア",
        "エンドマークに希望と涙を添えて",
        "宵の鳥",
        "僕の和風本当上手",
    ];

    [Fact]
    public void Execute()
    {
        var compressed = InvertedIndexBuilder.BuildInvertedIndex(TestDocuments, TokenizerOptions);
        var invertedIndex = InvertedIndexLoader.Load(compressed);

        var results = InvertedIndexSearcher.Search(invertedIndex, "bokunoh风じょう");

        // Should have at least one result
        Assert.NotEmpty(results);

        // The first result should be "僕の和風本当上手"
        Assert.Equal("僕の和風本当上手", results[0].DocumentText);
    }
}

public sealed class Search_HighlightsSearchResultCorrectlyTest : NeedleTestBase
{
    private static readonly string[] TestDocuments =
    [
        "ミーティア",
        "エンドマークに希望と涙を添えて",
        "宵の鳥",
        "僕の和風本当上手",
    ];

    [Fact]
    public void Execute()
    {
        var compressed = InvertedIndexBuilder.BuildInvertedIndex(TestDocuments, TokenizerOptions);
        var invertedIndex = InvertedIndexLoader.Load(compressed);

        var results = InvertedIndexSearcher.Search(invertedIndex, "bokunoh风じょう");
        Assert.NotEmpty(results);

        var highlighted = SearchResultHighlighter.Highlight(results[0]);

        // Should be a list of parts
        Assert.NotEmpty(highlighted);

        // Collect highlighted text
        var highlightedTexts = highlighted.Where(p => p.IsHighlighted).Select(p => p.Text).ToList();
        var highlightedJoined = string.Join("", highlightedTexts);

        Assert.Contains("僕", highlightedJoined);
        Assert.Contains("の", highlightedJoined);
        Assert.Contains("和", highlightedJoined);
        Assert.Contains("風", highlightedJoined);
        Assert.Contains("上", highlightedJoined);
    }
}

public sealed class Search_MatchesRomajiInputToKanaDocumentsTest : NeedleTestBase
{
    private static readonly string[] TestDocuments =
    [
        "ミーティア",
        "エンドマークに希望と涙を添えて",
        "宵の鳥",
        "僕の和風本当上手",
    ];

    [Fact]
    public void Execute()
    {
        var compressed = InvertedIndexBuilder.BuildInvertedIndex(TestDocuments, TokenizerOptions);
        var invertedIndex = InvertedIndexLoader.Load(compressed);

        // Searching for "yoi" should match "宵の鳥"
        var results = InvertedIndexSearcher.Search(invertedIndex, "yoi");
        var matchedTexts = results.Select(r => r.DocumentText).ToList();

        Assert.Contains("宵の鳥", matchedTexts);
    }
}
@@ -0,0 +1,143 @@
using MaigoLabs.NeedLe.Common;
using MaigoLabs.NeedLe.Common.Extensions;
using MaigoLabs.NeedLe.Indexer.Trie;
using MaigoLabs.NeedLe.Searcher.Trie;

namespace MaigoLabs.NeedLe.Tests.E2E;

#region Trie Building

public sealed class TrieBuilding_BuildsTrieWithMultipleDifferentTokensTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        var trie = TrieBuilder.BuildTrie([
            (0, "hello".ToCodePoints()),
            (1, "help".ToCodePoints()),
            (2, "world".ToCodePoints()),
            (3, "word".ToCodePoints()),
        ]);

        // Traverse to verify structure
        var helloNode = trie.Traverse("hello".ToCodePoints().ToArray());
        var helpNode = trie.Traverse("help".ToCodePoints().ToArray());
        var worldNode = trie.Traverse("world".ToCodePoints().ToArray());
        var wordNode = trie.Traverse("word".ToCodePoints().ToArray());

        Assert.NotNull(helloNode);
        Assert.NotNull(helpNode);
        Assert.NotNull(worldNode);
        Assert.NotNull(wordNode);

        // Check token IDs
        Assert.Contains(0, helloNode!.TokenIds);
        Assert.Contains(1, helpNode!.TokenIds);
        Assert.Contains(2, worldNode!.TokenIds);
        Assert.Contains(3, wordNode!.TokenIds);

        // Check that the 'hel' prefix node has both tokens in its subtree
        var helNode = trie.Traverse("hel".ToCodePoints().ToArray());
        Assert.NotNull(helNode);
        Assert.Contains(0, helNode!.SubTreeTokenIds);
        Assert.Contains(1, helNode.SubTreeTokenIds);
    }
}

public sealed class TrieBuilding_HandlesJapaneseTextTokensTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        var trie = TrieBuilder.BuildTrie([
            (0, "さくら".ToCodePoints()),
            (1, "サクラ".ToCodePoints()),
            (2, "桜".ToCodePoints()),
        ]);

        Assert.Contains(0, trie.Traverse("さくら".ToCodePoints().ToArray())?.TokenIds ?? []);
        Assert.Contains(1, trie.Traverse("サクラ".ToCodePoints().ToArray())?.TokenIds ?? []);
        Assert.Contains(2, trie.Traverse("桜".ToCodePoints().ToArray())?.TokenIds ?? []);
    }
}

#endregion

#region Trie Serialization

public sealed class TrieSerialization_SerializesAndDeserializesCorrectlyTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        var originalTrie = TrieBuilder.BuildTrie([
            (0, "apple".ToCodePoints()),
            (1, "app".ToCodePoints()),
            (2, "banana".ToCodePoints()),
        ]);

        // Serialize
        var serialized = TrieSerializer.Serialize(originalTrie);
        Assert.True(serialized.Length > 0);

        // Deserialize
        var deserialized = TrieDeserializer.Deserialize(serialized);
        var deserializedTrie = deserialized.Root;
        var tokenCodePoints = deserialized.TokenCodePoints;

        // Verify structure is preserved
        var appleNode = deserializedTrie.Traverse("apple".ToCodePoints().ToArray());
        var appNode = deserializedTrie.Traverse("app".ToCodePoints().ToArray());
        var bananaNode = deserializedTrie.Traverse("banana".ToCodePoints().ToArray());

        Assert.NotNull(appleNode);
        Assert.NotNull(appNode);
        Assert.NotNull(bananaNode);

        Assert.Contains(0, appleNode!.TokenIds);
        Assert.Contains(1, appNode!.TokenIds);
        Assert.Contains(2, bananaNode!.TokenIds);

        // Verify tokenCodePoints map
        Assert.Equal("apple", tokenCodePoints[0].ToUtf32String());
        Assert.Equal("app", tokenCodePoints[1].ToUtf32String());
        Assert.Equal("banana", tokenCodePoints[2].ToUtf32String());

        // Verify subTreeTokenIds are reconstructed
        Assert.Contains(0, appNode.SubTreeTokenIds);
        Assert.Contains(1, appNode.SubTreeTokenIds);
    }
}

public sealed class TrieSerialization_PreservesParentReferencesAfterDeserializationTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        var originalTrie = TrieBuilder.BuildTrie([
            (0, "test".ToCodePoints()),
        ]);

        var serialized = TrieSerializer.Serialize(originalTrie);
        var deserialized = TrieDeserializer.Deserialize(serialized);
        var root = deserialized.Root;

        var testNode = root.Traverse("test".ToCodePoints().ToArray());
        Assert.NotNull(testNode);

        // Walk back to the root via parent references
        TrieNode? node = testNode;
        var depth = 0;
        while (node?.Parent != null)
        {
            node = node.Parent;
            depth++;
        }
        Assert.Equal(4, depth); // 't' -> 'e' -> 's' -> 't' -> root
        Assert.Same(root, node);
    }
}

#endregion
@@ -0,0 +1,75 @@
using MaigoLabs.NeedLe.Indexer.Han;

namespace MaigoLabs.NeedLe.Tests.Indexer.Han;

#region IsHanCharacter

public sealed class IsHanCharacter_ReturnsTrueForCjkCharactersTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        Assert.True(HanVariantProvider.IsHanCharacter('中'));
        Assert.True(HanVariantProvider.IsHanCharacter('国'));
        Assert.True(HanVariantProvider.IsHanCharacter('日'));
        Assert.True(HanVariantProvider.IsHanCharacter('本'));
    }
}

public sealed class IsHanCharacter_ReturnsFalseForNonCjkCharactersTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        Assert.False(HanVariantProvider.IsHanCharacter('a'));
        Assert.False(HanVariantProvider.IsHanCharacter('あ'));
        Assert.False(HanVariantProvider.IsHanCharacter('ア'));
        Assert.False(HanVariantProvider.IsHanCharacter('1'));
    }
}

#endregion

#region GetHanVariants

public sealed class GetHanVariants_ReturnsVariantsForSimplifiedTraditionalTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        var provider = new HanVariantProvider();
        // 国 (simplified) and 國 (traditional) should be variants of each other
        var variants1 = provider.GetHanVariants('国');
        var variants2 = provider.GetHanVariants('國');
        Assert.Contains('国', variants1);
        Assert.Contains('國', variants1);
        Assert.Contains('国', variants2);
        Assert.Contains('國', variants2);
    }
}

public sealed class GetHanVariants_ReturnsCharacterItselfForNoVariantsTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        var provider = new HanVariantProvider();
        var variants = provider.GetHanVariants('一');
        Assert.Contains('一', variants);
    }
}

public sealed class GetHanVariants_ReturnsEmptyForNonHanCharactersTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        var provider = new HanVariantProvider();
        Assert.Empty(provider.GetHanVariants('a'));
        Assert.Empty(provider.GetHanVariants('あ'));
    }
}

#endregion
@@ -0,0 +1,51 @@
using MaigoLabs.NeedLe.Indexer.Han;

namespace MaigoLabs.NeedLe.Tests.Indexer.Han;

public sealed class GetPinyinCandidates_ReturnsPinyinForHanCharacterTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        var candidates = PinyinHelper.GetPinyinCandidates('中').ToList();
        Assert.Contains("zhong", candidates);
        Assert.Contains("zh", candidates); // initial
        Assert.Contains("z", candidates); // first letter
    }
}

public sealed class GetPinyinCandidates_ReturnsMultiplePinyinForPolyphonicTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        // 行 can be "xing" or "hang"
        var candidates = PinyinHelper.GetPinyinCandidates('行').ToList();
        Assert.Contains("xing", candidates);
        Assert.Contains("hang", candidates);
    }
}

public sealed class GetPinyinCandidates_IncludesFuzzyPinyinVariantsTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        // 风 is "feng", should also have fuzzy variant "fen"
        var candidates = PinyinHelper.GetPinyinCandidates('风').ToList();
        Assert.Contains("feng", candidates);
        Assert.Contains("fen", candidates); // fuzzy: eng -> en
    }
}

public sealed class GetPinyinCandidates_ReturnsEmptyForNonHanCharactersTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        Assert.Empty(PinyinHelper.GetPinyinCandidates('a'));
        Assert.Empty(PinyinHelper.GetPinyinCandidates('あ'));
    }
}
@@ -0,0 +1,59 @@
using MaigoLabs.NeedLe.Indexer.Han;

namespace MaigoLabs.NeedLe.Tests.Indexer.Han;

public sealed class UnionFindSet_FindsSelfAsRootInitiallyTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        var ufs = new UnionFindSet();
        Assert.Equal(1, ufs.Find(1));
        Assert.Equal(2, ufs.Find(2));
    }
}

public sealed class UnionFindSet_UnionsTwoElementsTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        var ufs = new UnionFindSet();
        ufs.Union(1, 2);
        Assert.Equal(ufs.Find(1), ufs.Find(2));
    }
}

public sealed class UnionFindSet_UnionsMultipleElementsTransitivelyTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        var ufs = new UnionFindSet();
        ufs.Union(1, 2);
        ufs.Union(2, 3);
        ufs.Union(4, 5);
        Assert.Equal(ufs.Find(1), ufs.Find(3));
        Assert.NotEqual(ufs.Find(1), ufs.Find(4));
        ufs.Union(3, 4);
        Assert.Equal(ufs.Find(1), ufs.Find(5));
    }
}

public sealed class UnionFindSet_IteratesAllKeysTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        var ufs = new UnionFindSet();
        ufs.Union(1, 2);
        ufs.Union(3, 4);
        var keys = ufs.Keys.ToList();
        Assert.Contains(1, keys);
        Assert.Contains(2, keys);
        Assert.Contains(3, keys);
        Assert.Contains(4, keys);
    }
}
@@ -0,0 +1,69 @@
using MaigoLabs.NeedLe.Indexer.Japanese;

namespace MaigoLabs.NeedLe.Tests.Indexer.Japanese;

#region ToRomajiStrictly

public sealed class ToRomajiStrictly_ConvertsBasicKanaToRomajiTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        Assert.Equal("a", JapaneseUtils.ToRomajiStrictly("あ"));
        Assert.Equal("ka", JapaneseUtils.ToRomajiStrictly("か"));
        Assert.Equal("sakura", JapaneseUtils.ToRomajiStrictly("さくら"));
    }
}

public sealed class ToRomajiStrictly_ConvertsKatakanaToRomajiTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        Assert.Equal("a", JapaneseUtils.ToRomajiStrictly("ア"));
        Assert.Equal("ka", JapaneseUtils.ToRomajiStrictly("カ"));
        Assert.Equal("sakura", JapaneseUtils.ToRomajiStrictly("サクラ"));
    }
}

public sealed class ToRomajiStrictly_HandlesLongVowelsTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        Assert.Equal("ou", JapaneseUtils.ToRomajiStrictly("おう"));
        Assert.Equal("oo", JapaneseUtils.ToRomajiStrictly("おお"));
    }
}

public sealed class ToRomajiStrictly_ReturnsEmptyForInvalidFirstCharacterTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        Assert.Equal("", JapaneseUtils.ToRomajiStrictly("ー")); // prolonged sound mark cannot be first
        Assert.Equal("", JapaneseUtils.ToRomajiStrictly("ゃ")); // small ya cannot be first
    }
}

public sealed class ToRomajiStrictly_ReturnsEmptyForInvalidLastCharacterTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        Assert.Equal("", JapaneseUtils.ToRomajiStrictly("っ")); // small tsu cannot be last
    }
}

public sealed class ToRomajiStrictly_HandlesGeminationTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        Assert.Equal("katta", JapaneseUtils.ToRomajiStrictly("かった"));
    }
}

#endregion
@@ -0,0 +1,40 @@
using MaigoLabs.NeedLe.Indexer.Japanese;

namespace MaigoLabs.NeedLe.Tests.Indexer.Japanese;

public sealed class GetAllKanaReadings_ReturnsKatakanaForPureKanaInputTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        var provider = new TranscriptionProvider();
        var readings = provider.GetAllKanaReadings("あ");
        Assert.Contains("ア", readings);
    }
}

public sealed class GetAllKanaReadings_ReturnsReadingsForKanjiTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        var provider = new TranscriptionProvider();
        var readings = provider.GetAllKanaReadings("僕");
        Assert.NotEmpty(readings);
        // 僕 should have reading ボク
        Assert.Contains("ボク", readings);
    }
}

public sealed class GetAllKanaReadings_ReturnsReadingsForCompoundWordsTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        var provider = new TranscriptionProvider();
        var readings = provider.GetAllKanaReadings("和風");
        Assert.NotEmpty(readings);
    }
}
@@ -0,0 +1,165 @@
using MaigoLabs.NeedLe.Common.Types;
using MaigoLabs.NeedLe.Indexer;

namespace MaigoLabs.NeedLe.Tests.Indexer;

public sealed class Tokenizer_TokenizesMixedJapaneseTextTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        var tokenizer = new Tokenizer(TokenizerOptions);
        var tokens = tokenizer.Tokenize("僕の和風本当上手");

        var tokenDefs = tokenizer.Tokens.Values.ToList();

        // Should have tokens of various types
        var types = tokenDefs.Select(t => t.Type).ToHashSet();
        Assert.Contains(TokenType.Han, types);
        Assert.Contains(TokenType.Pinyin, types);
        Assert.Contains(TokenType.Kana, types);
        Assert.Contains(TokenType.Romaji, types);

        // Helper to get token texts at a specific position by type
        List<string> GetTokenTextsAt(int pos, TokenType type) => tokens
            .Where(t => t.Start <= pos && t.End > pos)
            .Select(t => tokenDefs.First(d => d.Id == t.Id))
            .Where(d => d.Type == type)
            .Select(d => d.Text)
            .ToList();

        // Position 0: 僕
        Assert.Contains("僕", GetTokenTextsAt(0, TokenType.Han));
        Assert.Contains("pu", GetTokenTextsAt(0, TokenType.Pinyin));
        Assert.Contains("ボク", GetTokenTextsAt(0, TokenType.Kana));
        Assert.Contains("boku", GetTokenTextsAt(0, TokenType.Romaji));

        // Position 1: の (hiragana, no Han/Pinyin)
        Assert.Empty(GetTokenTextsAt(1, TokenType.Han));
        Assert.Empty(GetTokenTextsAt(1, TokenType.Pinyin));
        Assert.Contains("ノ", GetTokenTextsAt(1, TokenType.Kana));
        Assert.Contains("no", GetTokenTextsAt(1, TokenType.Romaji));

        // Position 2: 和
        Assert.Contains("和", GetTokenTextsAt(2, TokenType.Han));
        Assert.Contains("he", GetTokenTextsAt(2, TokenType.Pinyin));
        Assert.Contains("ワ", GetTokenTextsAt(2, TokenType.Kana));
        Assert.Contains("wa", GetTokenTextsAt(2, TokenType.Romaji));

        // Position 3: 風
        Assert.Contains("風", GetTokenTextsAt(3, TokenType.Han));
        Assert.Contains("风", GetTokenTextsAt(3, TokenType.Han)); // simplified variant
        Assert.Contains("feng", GetTokenTextsAt(3, TokenType.Pinyin));
        Assert.Contains("フウ", GetTokenTextsAt(3, TokenType.Kana));
        Assert.Contains("fu", GetTokenTextsAt(3, TokenType.Romaji));

        // Position 4: 本
        Assert.Contains("本", GetTokenTextsAt(4, TokenType.Han));
        Assert.Contains("ben", GetTokenTextsAt(4, TokenType.Pinyin));
        Assert.Contains("ホン", GetTokenTextsAt(4, TokenType.Kana));
        Assert.Contains("hon", GetTokenTextsAt(4, TokenType.Romaji));

        // Position 5: 当
        Assert.Contains("当", GetTokenTextsAt(5, TokenType.Han));
        Assert.Contains("當", GetTokenTextsAt(5, TokenType.Han)); // traditional variant
        Assert.Contains("dang", GetTokenTextsAt(5, TokenType.Pinyin));
        Assert.Contains("トウ", GetTokenTextsAt(5, TokenType.Kana));
        Assert.Contains("to", GetTokenTextsAt(5, TokenType.Romaji)); // normalized: tou -> to

        // Position 6: 上
        Assert.Contains("上", GetTokenTextsAt(6, TokenType.Han));
        Assert.Contains("shang", GetTokenTextsAt(6, TokenType.Pinyin));
        Assert.Contains("ジョウ", GetTokenTextsAt(6, TokenType.Kana));
        Assert.Contains("jo", GetTokenTextsAt(6, TokenType.Romaji)); // normalized: jou -> jo

        // Position 7: 手
        Assert.Contains("手", GetTokenTextsAt(7, TokenType.Han));
        Assert.Contains("shou", GetTokenTextsAt(7, TokenType.Pinyin));
        Assert.Contains("シュ", GetTokenTextsAt(7, TokenType.Kana));
        Assert.Contains("shu", GetTokenTextsAt(7, TokenType.Romaji));
    }
}

public sealed class Tokenizer_NoDuplicateTokensTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        var tokenizer = new Tokenizer(TokenizerOptions);

        // Tokenize multiple music names that share some characters
        tokenizer.Tokenize("僕の和風本当上手");
        tokenizer.Tokenize("僕");
        tokenizer.Tokenize("和風");

        // Check that there are no duplicate tokens
        var tokenDefs = tokenizer.Tokens.Values.ToList();
        var tokenKeys = tokenDefs.Select(t => $"{t.Type}:{t.Text}").ToList();
        var uniqueKeys = tokenKeys.ToHashSet();

        Assert.Equal(uniqueKeys.Count, tokenKeys.Count);

        // Also check that IDs are unique
        var ids = tokenDefs.Select(t => t.Id).ToList();
        var uniqueIds = ids.ToHashSet();
        Assert.Equal(uniqueIds.Count, ids.Count);
    }
}

public sealed class Tokenizer_HandlesRawTokensForNonCjkTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        var tokenizer = new Tokenizer(TokenizerOptions);
        tokenizer.Tokenize("a-b");

        var tokenDefs = tokenizer.Tokens.Values.ToList();
        var rawTokenTexts = tokenDefs.Where(t => t.Type == TokenType.Raw).Select(t => t.Text).ToList();

        Assert.Contains("a", rawTokenTexts);
        Assert.Contains("-", rawTokenTexts);
        Assert.Contains("b", rawTokenTexts);
    }
}

public sealed class Tokenizer_TokenizesCompoundWordKyouTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        var tokenizer = new Tokenizer(TokenizerOptions);
        var tokens = tokenizer.Tokenize("今日");
        var tokenDefs = tokenizer.Tokens.Values.ToList();

        // Helper to get tokens with specific type and span
        List<string> GetTokensWithSpan(TokenType type, int start, int end) => tokens
            .Where(t => t.Start == start && t.End == end)
            .Select(t => tokenDefs.First(d => d.Id == t.Id))
            .Where(d => d.Type == type)
            .Select(d => d.Text)
            .ToList();

        // Individual character readings at position 0: 今
        Assert.Contains("今", GetTokensWithSpan(TokenType.Han, 0, 1));
        Assert.Contains("jin", GetTokensWithSpan(TokenType.Pinyin, 0, 1));
        Assert.Contains("コン", GetTokensWithSpan(TokenType.Kana, 0, 1));
        Assert.Contains("イマ", GetTokensWithSpan(TokenType.Kana, 0, 1));
        Assert.Contains("kon", GetTokensWithSpan(TokenType.Romaji, 0, 1));
        Assert.Contains("ima", GetTokensWithSpan(TokenType.Romaji, 0, 1));

        // Individual character readings at position 1: 日
        Assert.Contains("日", GetTokensWithSpan(TokenType.Han, 1, 2));
        Assert.Contains("ri", GetTokensWithSpan(TokenType.Pinyin, 1, 2));
        Assert.Contains("ニチ", GetTokensWithSpan(TokenType.Kana, 1, 2));
        Assert.Contains("ヒ", GetTokensWithSpan(TokenType.Kana, 1, 2));
        Assert.Contains("niti", GetTokensWithSpan(TokenType.Romaji, 1, 2));
        Assert.Contains("hi", GetTokensWithSpan(TokenType.Romaji, 1, 2));

        // Combined reading for "今日" [0, 2] - this is an indivisible compound word
        Assert.Contains("キョウ", GetTokensWithSpan(TokenType.Kana, 0, 2));
        Assert.Contains("kyo", GetTokensWithSpan(TokenType.Romaji, 0, 2)); // normalized: kyou -> kyo
    }
}
@@ -0,0 +1,66 @@
using MaigoLabs.NeedLe.Common;
using MaigoLabs.NeedLe.Common.Extensions;
using MaigoLabs.NeedLe.Indexer.Trie;

namespace MaigoLabs.NeedLe.Tests.Indexer;

#region GraftTriePaths

public sealed class GraftTriePaths_GraftsPathsAccordingToNormalizationRulesTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        // Build a trie with tokens containing normalized forms
        var trie = TrieBuilder.BuildTrie([
            (0, "sya".ToCodePoints()), // normalized form of "sha"
            (1, "tu".ToCodePoints()), // normalized form of "tsu"
        ]);

        // Graft paths so that "sha" -> "sya" and "tsu" -> "tu"
        TrieBuilder.GraftTriePaths(trie, [
            ("sha".ToCodePoints().ToArray(), "sya".ToCodePoints().ToArray()),
            ("tsu".ToCodePoints().ToArray(), "tu".ToCodePoints().ToArray()),
        ]);

        // Now we should be able to traverse using both the original and grafted paths
        var syaNode = trie.Traverse("sya".ToCodePoints().ToArray());
        var shaNode = trie.Traverse("sha".ToCodePoints().ToArray());
        Assert.NotNull(syaNode);
        Assert.NotNull(shaNode);
        Assert.Same(syaNode, shaNode); // Both paths should lead to the same node

        var tuNode = trie.Traverse("tu".ToCodePoints().ToArray());
        var tsuNode = trie.Traverse("tsu".ToCodePoints().ToArray());
        Assert.NotNull(tuNode);
        Assert.NotNull(tsuNode);
        Assert.Same(tuNode, tsuNode);
    }
}

public sealed class GraftTriePaths_HandlesChainedGraftRulesTest : NeedleTestBase
{
    [Fact]
    public void Execute()
    {
        var trie = TrieBuilder.BuildTrie([
            (0, "o".ToCodePoints()), // normalized vowel
        ]);

        // Chain: "ou" -> "o", "oo" -> "o"
        TrieBuilder.GraftTriePaths(trie, [
            ("ou".ToCodePoints().ToArray(), "o".ToCodePoints().ToArray()),
            ("oo".ToCodePoints().ToArray(), "o".ToCodePoints().ToArray()),
        ]);

        var oNode = trie.Traverse("o".ToCodePoints().ToArray());
        var ouNode = trie.Traverse("ou".ToCodePoints().ToArray());
        var ooNode = trie.Traverse("oo".ToCodePoints().ToArray());

        Assert.NotNull(oNode);
        Assert.Same(oNode, ouNode);
        Assert.Same(oNode, ooNode);
    }
}

#endregion
@@ -0,0 +1,28 @@
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <TargetFramework>net10.0</TargetFramework>
    <RootNamespace>$(ProjectName).Tests</RootNamespace>
    <AssemblyName>$(RootNamespace)</AssemblyName>
    <IsPackable>false</IsPackable>
  </PropertyGroup>

  <ItemGroup>
    <PackageReference Include="coverlet.collector" />
    <PackageReference Include="Microsoft.NET.Test.Sdk" />
    <PackageReference Include="xunit" />
    <PackageReference Include="xunit.runner.visualstudio" />
  </ItemGroup>

  <ItemGroup>
    <ProjectReference Include="..\MaigoLabs.NeedLe.Common\MaigoLabs.NeedLe.Common.csproj" />
    <ProjectReference Include="..\MaigoLabs.NeedLe.Indexer\MaigoLabs.NeedLe.Indexer.csproj" />
    <ProjectReference Include="..\MaigoLabs.NeedLe.Searcher\MaigoLabs.NeedLe.Searcher.csproj" />
  </ItemGroup>

  <ItemGroup>
    <Using Include="Xunit" />
  </ItemGroup>

</Project>
@@ -0,0 +1,12 @@
using MaigoLabs.NeedLe.Indexer;
using MaigoLabs.NeedLe.Indexer.Han;
using MaigoLabs.NeedLe.Indexer.Japanese;

namespace MaigoLabs.NeedLe.Tests;

public abstract class NeedleTestBase
{
    public static HanVariantProvider HanVariantProvider { get; set; } = new();
    public static TranscriptionProvider TranscriptionProvider { get; set; } = new();
    public static TokenizerOptions TokenizerOptions => new() { HanVariantProvider = HanVariantProvider, TranscriptionProvider = TranscriptionProvider };
}
@@ -0,0 +1,16 @@
<Solution>

  <Configurations>
    <Platform Name="Any CPU" />
    <Platform Name="x64" />
    <Platform Name="x86" />
  </Configurations>

  <Project Path="MaigoLabs.NeedLe/MaigoLabs.NeedLe.csproj" />
  <Project Path="MaigoLabs.NeedLe.Common/MaigoLabs.NeedLe.Common.csproj" />
  <Project Path="MaigoLabs.NeedLe.Indexer/MaigoLabs.NeedLe.Indexer.csproj" />
  <Project Path="MaigoLabs.NeedLe.Searcher/MaigoLabs.NeedLe.Searcher.csproj" />
  <Project Path="MaigoLabs.NeedLe.Playground/MaigoLabs.NeedLe.Playground.csproj" />
  <Project Path="MaigoLabs.NeedLe.Tests/MaigoLabs.NeedLe.Tests.csproj" />

</Solution>
@@ -0,0 +1,35 @@
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <OutputType>Library</OutputType>
    <RootNamespace>$(ProjectName)</RootNamespace>
    <AssemblyName>$(RootNamespace)</AssemblyName>
  </PropertyGroup>

  <PropertyGroup>
    <IsPackable>true</IsPackable>
    <PackageId>$(RootNamespace)</PackageId>
    <IncludeBuildOutput>false</IncludeBuildOutput>
    <IncludeContentInPack>false</IncludeContentInPack>
    <NoPackageAnalysis>true</NoPackageAnalysis>
    <MeCabUseDefaultDictionary>False</MeCabUseDefaultDictionary>
    <PackageReadmeFile></PackageReadmeFile>
  </PropertyGroup>

  <ItemGroup>
    <ProjectReference Include="..\MaigoLabs.NeedLe.Common\MaigoLabs.NeedLe.Common.csproj" PrivateAssets="none" />
    <ProjectReference Include="..\MaigoLabs.NeedLe.Indexer\MaigoLabs.NeedLe.Indexer.csproj" PrivateAssets="none" />
    <ProjectReference Include="..\MaigoLabs.NeedLe.Searcher\MaigoLabs.NeedLe.Searcher.csproj" PrivateAssets="none" />
  </ItemGroup>

  <!-- Directly add README to package files -->
  <Target Name="AddReadmeToPackage" BeforeTargets="GenerateNuspec">
    <ItemGroup>
      <_PackageFiles Include="..\README.md" PackagePath="/" />
    </ItemGroup>
    <PropertyGroup>
      <PackageReadmeFile>README.md</PackageReadmeFile>
    </PropertyGroup>
  </Target>

</Project>
@@ -0,0 +1,57 @@
# `MaigoLabs.NeedLe`

Fuzzy search engine for small text pieces, with Chinese/Japanese pronunciation support.

See also the [in-browser demo](https://needle.maigo.dev) (the TypeScript version, with the same features as the C# one).

## Install

```bash
dotnet add package MaigoLabs.NeedLe
```

Or install the sub-packages separately:

```bash
dotnet add package MaigoLabs.NeedLe.Indexer  # For building indexes
dotnet add package MaigoLabs.NeedLe.Searcher # For searching only
```

## Usage

### Indexing

Indexing requires dictionaries. These are installed as dependencies of the `MaigoLabs.NeedLe.Indexer` package:

* MeCab.DotNet
* OpenccNetLib
* hyjiacan.pinyin4net

```csharp
using System.Text.Json;

using MaigoLabs.NeedLe.Indexer;

var documents = new[] { "你好世界", "こんにちは" };
var compressedIndex = InvertedIndexBuilder.BuildInvertedIndex(documents);
// To customize dictionary paths, pass a second `TokenizerOptions` argument to `BuildInvertedIndex`.

// The built index can be stored for later use, or sent to the frontend to be loaded with the TypeScript package `@maigolabs/needle`.
// For compatibility with .NET Standard, we don't provide JSON-related methods. You can use any JSON library to serialize/deserialize the index in the way you prefer.
var json = JsonSerializer.Serialize(compressedIndex);
```

### Searching

Searching requires a prebuilt index but doesn't require dictionaries. The searcher is a lightweight package without dependencies.

```csharp
using MaigoLabs.NeedLe.Searcher;

// `compressedIndex` is the index returned by `BuildInvertedIndex`.
var index = InvertedIndexLoader.Load(compressedIndex);

var results = InvertedIndexSearcher.Search(index, "sekai");
foreach (var result in results) Console.WriteLine($"{result.DocumentText} ({result.MatchRatio:P0})");
// → 你好世界 (50%)
```

To highlight the search result, see also `SearchResultHighlighter`.
@@ -0,0 +1,157 @@
import tsParser from '@typescript-eslint/parser';
import tsPlugin from '@typescript-eslint/eslint-plugin';
import importPlugin from 'eslint-plugin-import';
import stylisticPlugin from '@stylistic/eslint-plugin';

import type { Linter } from 'eslint';

const commonConfig: Linter.Config = {
  plugins: {
    import: importPlugin,
    '@typescript-eslint': tsPlugin as any,
    stylistic: stylisticPlugin,
  },
  rules: {
    'import/order': [
      'error',
      {
        groups: ['builtin', 'external', ['internal', 'parent', 'sibling', 'index']],
        pathGroups: [
          {
            pattern: '@proj-marina/**',
            group: 'internal',
            position: 'before',
          },
          {
            pattern: '@/**',
            group: 'internal',
            position: 'before',
          },
        ],
        'newlines-between': 'always',
        distinctGroup: false,
        alphabetize: {
          order: 'asc',
          caseInsensitive: true,
        },
      },
    ],
    'import/no-duplicates': 'error',

    '@typescript-eslint/no-unused-vars': ['error', { argsIgnorePattern: '^_' }],
    'prefer-const': 'error',
    'no-var': 'error',
    'no-debugger': 'error',
    'object-shorthand': 'error',
    'prefer-template': 'error',
    eqeqeq: ['error', 'always', { null: 'ignore' }],

    '@typescript-eslint/prefer-optional-chain': 'error',
    '@typescript-eslint/prefer-nullish-coalescing': 'error',
    '@typescript-eslint/return-await': ['error', 'always'],
    '@typescript-eslint/no-floating-promises': 'error',
    '@typescript-eslint/await-thenable': 'error',
    '@typescript-eslint/no-misused-promises': ['error'],
    '@typescript-eslint/prefer-as-const': 'error',
    '@typescript-eslint/prefer-for-of': 'error',
    '@typescript-eslint/prefer-includes': 'error',
    '@typescript-eslint/prefer-string-starts-ends-with': 'error',
    '@typescript-eslint/consistent-type-imports': ['error', { disallowTypeAnnotations: false }],

    'stylistic/indent': ['error', 2, {
      'offsetTernaryExpressions': true
    }],
    'stylistic/linebreak-style': ['error', 'unix'],
    'stylistic/semi': ['error', 'always'],
    'stylistic/quotes': ['error', 'single', {
      'avoidEscape': true,
      'allowTemplateLiterals': 'avoidEscape',
    }],
    'stylistic/comma-dangle': ['error', 'always-multiline'],
    'stylistic/arrow-parens': ['error', 'as-needed'],
    'stylistic/object-curly-spacing': ['error', 'always'],
    'stylistic/array-bracket-spacing': ['error', 'never'],
    'stylistic/space-before-function-paren': ['error', {
      'anonymous': 'always',
      'named': 'never',
      'asyncArrow': 'always',
    }],
    'stylistic/space-in-parens': ['error', 'never'],
    'stylistic/comma-spacing': ['error', { 'before': false, 'after': true }],
    'stylistic/key-spacing': ['error', { 'beforeColon': false, 'afterColon': true }],
    'stylistic/keyword-spacing': ['error'],
    'stylistic/space-before-blocks': ['error', 'always'],
    'stylistic/space-infix-ops': ['error'],
    'stylistic/no-trailing-spaces': ['error'],
    'stylistic/eol-last': ['error', 'always'],
    'stylistic/no-multiple-empty-lines': ['error', { 'max': 1, 'maxEOF': 0 }],
    'stylistic/brace-style': ['error', '1tbs', { 'allowSingleLine': true }],
    'stylistic/object-curly-newline': ['error', {
      'ObjectExpression': { 'multiline': true, 'consistent': true },
      'ObjectPattern': { 'multiline': true, 'consistent': true },
      'ImportDeclaration': { 'multiline': true, 'consistent': true },
      'ExportDeclaration': { 'multiline': true, 'consistent': true }
    }],
    'stylistic/array-bracket-newline': ['error', 'consistent'],
    'stylistic/function-paren-newline': ['error', 'consistent'],
    'stylistic/member-delimiter-style': ['error', {
      'multiline': {
        'delimiter': 'semi',
        'requireLast': true
      },
      'singleline': {
        'delimiter': 'semi',
        'requireLast': false
      }
    }],
    'stylistic/type-annotation-spacing': ['error'],
    'stylistic/jsx-quotes': ['error', 'prefer-double'],
  },
  settings: {
    'import/internal-regex': '^@proj-marina/',
    'import/resolver': {
      typescript: {
        project: ['./apps/*/tsconfig.json', './packages/*/tsconfig.json'],
        noWarnOnMultipleProjects: true,
      },
    },
  },
};

const parserOptions: Linter.ParserOptions = {
  parser: tsParser,
  ecmaVersion: 'latest',
  sourceType: 'module',
  project: ['./apps/*/tsconfig.json', './packages/*/tsconfig.json'],
  noWarnOnMultipleProjects: true,
};

const config: Linter.Config[] = [
  {
    ...commonConfig,
    files: ['**/*.{ts,tsx}'],
    languageOptions: {
      parser: tsParser,
      ecmaVersion: 'latest',
      sourceType: 'module',
      parserOptions,
    },
  },
  {
    ignores: [
      '**/node_modules/**',
      // Build output
      '**/dist/**',
      '**/build/**',
      '**/coverage/**',
      'eslint.config.ts',
      '**/uno.config.ts',
      '**/vite.config.ts',
      '**/jest.config.ts',
      '**/tsdown.config.ts',
    ],
  },
];

export default config;
+1585
File diff suppressed because it is too large
@@ -0,0 +1,36 @@
{
  "name": "@maigolabs/needle-root",
  "version": "1.0.0",
  "type": "module",
  "scripts": {
    "build:packages": "pnpm -F=\"./packages/*\" run build",
    "build:demo": "pnpm -F=\"./apps/demo\" build",
    "dev:demo": "pnpm -F=\"./apps/demo\" dev",
    "typecheck": "pnpm -rF=\"./packages/*\" -F=\"./apps/*\" typecheck",
    "test": "pnpm -rF=\"./packages/*\" -F=\"./apps/*\" test",
    "test:dotnet": "cd dotnet && dotnet test",
    "lint": "eslint --cache --ext .",
    "lint:fix": "eslint --cache --ext . --fix"
  },
  "license": "AGPL-3.0",
  "packageManager": "pnpm@10.20.0",
  "private": true,
  "devDependencies": {
    "@eslint/js": "^9.39.1",
    "@stylistic/eslint-plugin": "^5.5.0",
    "@typescript-eslint/eslint-plugin": "^8.46.3",
    "@typescript-eslint/parser": "^8.46.3",
    "cross-env": "^10.1.0",
    "eslint": "^9.39.1",
    "eslint-import-resolver-typescript": "^4.4.4",
    "eslint-plugin-import": "^2.32.0",
    "jiti": "^2.6.1",
    "tsdown": "^0.18.4",
    "tsx": "^4.21.0",
    "typescript": "^5.9.3",
    "unplugin-unused": "^0.5.6"
  },
  "dependencies": {
    "@types/node": "^24.10.0"
  }
}
Symlink
+1
@@ -0,0 +1 @@
../../LICENSE
@@ -0,0 +1,72 @@
# `@maigolabs/needle`

Fuzzy search engine for small text pieces, with Chinese/Japanese pronunciation support.

See also the [in-browser demo](https://needle.maigo.dev).

## Install

Dictionaries are installed as dependencies of the package, but if you don't use the indexer, they can be tree-shaken when bundling.

```bash
pnpm install @maigolabs/needle
```

## Usage

### Indexing

NeedLe uses Kuromoji for Japanese tokenization, which loads dictionaries dynamically. You need to create a Kuromoji `TokenizerBuilder` first:

```ts
// In Node.js you can load the dictionary directly from the file system.

import { TokenizerBuilder } from '@patdx/kuromoji';
import NodeDictionaryLoader from '@patdx/kuromoji/node';

const kuromojiDictPath = path.resolve(url.fileURLToPath(import.meta.resolve('@patdx/kuromoji')), '..', '..', 'dict');
const kuromoji = await new TokenizerBuilder({ loader: new NodeDictionaryLoader({ dic_path: kuromojiDictPath }) }).build();

// In the browser you need to provide a custom loader that fetches the dictionary files with fetch().

import { TokenizerBuilder } from '@patdx/kuromoji';

// You can load dict files from a CDN (see also the README of https://github.com/patdx/kuromoji.js)
const kuromoji = await new TokenizerBuilder({
  loader: {
    loadArrayBuffer: async (url: string) => {
      url = `https://cdn.jsdelivr.net/npm/@aiktb/kuromoji@1.0.2/dict/${url.replace('.gz', '')}`;
      const res = await fetch(url);
      if (!res.ok) throw new Error(`Failed to fetch ${url}`);
      return await res.arrayBuffer();
    },
  },
}).build();
```

After creating the Kuromoji instance, you can build the inverted index:

```ts
import { buildInvertedIndex } from '@maigolabs/needle/indexer';

const documents = ['你好世界', 'こんにちは'];
const compressedIndex = buildInvertedIndex(documents, { kuromoji });

// The built index can be stored for later use.
const json = JSON.stringify(compressedIndex);
```

### Searching

If you only import the searcher in your frontend code, the indexer and dictionary-related dependencies will be tree-shaken.

```ts
import { loadInvertedIndex, searchInvertedIndex } from '@maigolabs/needle/searcher';

const loadedIndex = loadInvertedIndex(compressedIndex);
const results = searchInvertedIndex(loadedIndex, 'sekai');
for (const result of results) console.log(`${result.documentText} (${(result.matchRatio * 100).toFixed(0)}%)`);
// → 你好世界 (50%)
```

To highlight the search result, see also `highlightSearchResult`.
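It returns an array of parts, where each part is either a plain string or a `{ highlight: string }` segment (this is the shape the package's own tests assert; the HTML rendering below is just a sketch). For example:

```ts
import { highlightSearchResult } from '@maigolabs/needle/searcher';

// `results` comes from searchInvertedIndex() above.
const html = highlightSearchResult(results[0]!)
  .map(part => typeof part === 'string' ? part : `<mark>${part.highlight}</mark>`)
  .join('');
console.log(html); // matched segments are wrapped in <mark> tags
```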
@@ -0,0 +1,18 @@
import type { Config } from 'jest';

const config: Config = {
  preset: 'ts-jest/presets/default-esm',
  testEnvironment: 'node',
  extensionsToTreatAsEsm: ['.ts'],
  moduleNameMapper: {
    '^(\\.{1,2}/.*)\\.js$': '$1',
  },
  transform: {
    '^.+\\.tsx?$': ['ts-jest', { useESM: true }],
  },
  testMatch: ['**/*.test.ts'],
  testTimeout: 30000,
};

export default config;
@@ -0,0 +1,84 @@
{
  "name": "@maigolabs/needle",
  "version": "1.0.1",
  "description": "Fuzzy search engine for small text pieces, with Chinese/Japanese pronunciation support.",
  "type": "module",
  "main": "./src/index.ts",
  "scripts": {
    "build": "tsdown",
    "typecheck": "tsc",
    "test": "cross-env NODE_OPTIONS=--experimental-vm-modules jest",
    "prepare": "pnpm run build"
  },
  "license": "AGPL-3.0",
  "homepage": "https://needle.maigo.dev",
  "repository": {
    "type": "git",
    "url": "git+https://github.com/MaigoLabs/needLe.git",
    "directory": "packages/needle"
  },
  "bugs": "https://github.com/MaigoLabs/needLe/issues",
  "keywords": [
    "needle",
    "search",
    "fuzzy",
    "cjk",
    "chinese",
    "japanese",
    "pinyin",
    "romaji"
  ],
  "author": "Menci <mencici@msn.com>",
  "sideEffects": false,
  "exports": {
    ".": "./src/index.ts",
    "./common": "./src/common/index.ts",
    "./indexer": "./src/indexer/index.ts",
    "./searcher": "./src/searcher/index.ts",
    "./package.json": "./package.json"
  },
  "packageManager": "pnpm@10.20.0",
  "dependencies": {
    "@patdx/kuromoji": "^1.0.4",
    "hepburn": "^1.2.2",
    "opencc-js": "^1.0.5",
    "pinyin-pro": "^3.27.0"
  },
  "devDependencies": {
    "@types/hepburn": "^1.2.2",
    "@types/jest": "^30.0.0",
    "@types/opencc-js": "^1.0.3",
    "jest": "^30.2.0",
    "ts-jest": "^29.4.6"
  },
  "files": [
    "README.md",
    "dist",
    "package.json"
  ],
  "publishConfig": {
    "access": "public",
    "main": "./dist/index.mjs",
    "module": "./dist/index.mjs",
    "types": "./dist/index.d.mts",
    "exports": {
      ".": {
        "types": "./dist/index.d.mts",
        "default": "./dist/index.mjs"
      },
      "./common": {
        "types": "./dist/common/index.d.mts",
        "default": "./dist/common/index.mjs"
      },
      "./indexer": {
        "types": "./dist/indexer/index.d.mts",
        "default": "./dist/indexer/index.mjs"
      },
      "./searcher": {
        "types": "./dist/searcher/index.d.mts",
        "default": "./dist/searcher/index.mjs"
      },
      "./package.json": "./package.json"
    }
  }
}
@@ -0,0 +1,4 @@
export * from './types';
export * from './utils';
export * from './normalize';
export * from './trie';
@@ -0,0 +1,60 @@
import { normalizeByCodePoint, toKatakana } from './normalize';

describe('toKatakana', () => {
  it('should convert hiragana to katakana', () => {
    expect(toKatakana('あいうえお')).toBe('アイウエオ');
    expect(toKatakana('かきくけこ')).toBe('カキクケコ');
    expect(toKatakana('さしすせそ')).toBe('サシスセソ');
  });

  it('should keep katakana unchanged', () => {
    expect(toKatakana('アイウエオ')).toBe('アイウエオ');
  });

  it('should keep non-kana characters unchanged', () => {
    expect(toKatakana('abc123')).toBe('abc123');
    expect(toKatakana('漢字')).toBe('漢字');
  });

  it('should handle mixed input', () => {
    expect(toKatakana('あアa漢')).toBe('アアa漢');
  });
});

describe('normalizeByCodePoint', () => {
  it('should convert fullwidth ASCII to halfwidth lowercase', () => {
    expect(normalizeByCodePoint('ABC')).toBe('abc');
    expect(normalizeByCodePoint('123')).toBe('123');
    expect(normalizeByCodePoint('!@#')).toBe('!@#');
  });

  it('should convert fullwidth space to halfwidth space', () => {
    expect(normalizeByCodePoint(' ')).toBe(' ');
  });

  it('should convert halfwidth kana to fullwidth kana', () => {
    expect(normalizeByCodePoint('アイウエオ')).toBe('アイウエオ');
    expect(normalizeByCodePoint('カキクケコ')).toBe('カキクケコ');
  });

  it('should normalize voiced/semi-voiced sound marks', () => {
    expect(normalizeByCodePoint('゙')).toBe('\u3099'); // halfwidth voiced -> combining
    expect(normalizeByCodePoint('゚')).toBe('\u309A'); // halfwidth semi-voiced -> combining
    expect(normalizeByCodePoint('゛')).toBe('\u3099'); // fullwidth voiced -> combining
    expect(normalizeByCodePoint('゜')).toBe('\u309A'); // fullwidth semi-voiced -> combining
  });

  it('should convert halfwidth punctuation to fullwidth', () => {
    expect(normalizeByCodePoint('。')).toBe('。');
    expect(normalizeByCodePoint('「')).toBe('「');
    expect(normalizeByCodePoint('」')).toBe('」');
    expect(normalizeByCodePoint('、')).toBe('、');
    expect(normalizeByCodePoint('・')).toBe('・');
  });

  it('should lowercase regular ASCII', () => {
    expect(normalizeByCodePoint('ABC')).toBe('abc');
  });

  // Should keep hiragana unchanged
});
@@ -0,0 +1,42 @@
export const normalizeByCodePoint = (string: string) => [...string].map(normalizeCodePoint).join('');

export const normalizeCodePoint = (char: string) => {
  const codePoint = char.codePointAt(0)!;
  // Fullwidth ASCII -> Halfwidth ASCII
  if (codePoint >= 0xFF01 && codePoint <= 0xFF5E) return String.fromCodePoint(codePoint - 0xFEE0).toLowerCase();
  // Fullwidth space -> Halfwidth space
  else if (codePoint === /* '　' */ 0x3000) return ' ';
  // Halfwidth kana (U+FF66 - U+FF9D) -> Fullwidth kana
  else if (codePoint >= 0xFF66 && codePoint <= 0xFF9D) return HALF_TO_FULL_KANA[char] ?? char;
  else if (codePoint === /* '。' */ 0xFF61) return '。';
  else if (codePoint === /* '「' */ 0xFF62) return '「';
  else if (codePoint === /* '」' */ 0xFF63) return '」';
  else if (codePoint === /* '、' */ 0xFF64) return '、';
  else if (codePoint === /* '・' */ 0xFF65) return '・';
  else if (codePoint === /* '゙' */ 0xFF9E || codePoint === /* '゛' */ 0x309B) return '\u3099'; // -> COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
  else if (codePoint === /* '゚' */ 0xFF9F || codePoint === /* '゜' */ 0x309C) return '\u309A'; // -> COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
  else return char.toLowerCase();
};

const HALF_TO_FULL_KANA: Record<string, string> = {
  'ヲ': 'ヲ', 'ァ': 'ァ', 'ィ': 'ィ', 'ゥ': 'ゥ', 'ェ': 'ェ', 'ォ': 'ォ',
  'ャ': 'ャ', 'ュ': 'ュ', 'ョ': 'ョ', 'ッ': 'ッ',
  'ー': 'ー',
  'ア': 'ア', 'イ': 'イ', 'ウ': 'ウ', 'エ': 'エ', 'オ': 'オ',
  'カ': 'カ', 'キ': 'キ', 'ク': 'ク', 'ケ': 'ケ', 'コ': 'コ',
  'サ': 'サ', 'シ': 'シ', 'ス': 'ス', 'セ': 'セ', 'ソ': 'ソ',
  'タ': 'タ', 'チ': 'チ', 'ツ': 'ツ', 'テ': 'テ', 'ト': 'ト',
  'ナ': 'ナ', 'ニ': 'ニ', 'ヌ': 'ヌ', 'ネ': 'ネ', 'ノ': 'ノ',
  'ハ': 'ハ', 'ヒ': 'ヒ', 'フ': 'フ', 'ヘ': 'ヘ', 'ホ': 'ホ',
  'マ': 'マ', 'ミ': 'ミ', 'ム': 'ム', 'メ': 'メ', 'モ': 'モ',
  'ヤ': 'ヤ', 'ユ': 'ユ', 'ヨ': 'ヨ',
  'ラ': 'ラ', 'リ': 'リ', 'ル': 'ル', 'レ': 'レ', 'ロ': 'ロ',
  'ワ': 'ワ', 'ン': 'ン',
};

const isHiraganaRange = (charCode: number) => (charCode >= 0x3041 && charCode <= 0x3096) || (charCode >= 0x309D && charCode <= 0x309E);

export const toKatakanaSingle = (char: string) => {
  const code = char.charCodeAt(0);
  return isHiraganaRange(code) ? String.fromCharCode(code + 0x60) : char;
};

export const toKatakana = (string: string) => [...string].map(toKatakanaSingle).join('');
@@ -0,0 +1,17 @@
export interface TrieNode {
  parent: TrieNode | undefined;
  children: Map<number, TrieNode>; // Unicode code point -> child node
  tokenIds: number[];
  subTreeTokenIds: number[]; // Empty on root. Will Uint16Array be faster?
}

export const traverseTrieStep = (node: TrieNode | undefined, codePoint: string, ignorableCodePoints?: RegExp) =>
  node?.children.get(codePoint.codePointAt(0)!) ?? (ignorableCodePoints?.test(codePoint) ? node : undefined);

export const traverseTrie = (node: TrieNode | undefined, text: string, ignorableCodePoints?: RegExp) => {
  if (!node) return;
  for (const codePoint of text) {
    node = traverseTrieStep(node, codePoint, ignorableCodePoints);
    if (!node) return;
  }
  return node;
};
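As a quick illustration of the traversal semantics above (a minimal sketch over a hand-built two-node trie; the space-skipping RegExp is just one example of `ignorableCodePoints`):

```ts
// A root whose only child hangs off the code point of 'a'.
const child: TrieNode = { parent: undefined, children: new Map(), tokenIds: [42], subTreeTokenIds: [42] };
const root: TrieNode = { parent: undefined, children: new Map([['a'.codePointAt(0)!, child]]), tokenIds: [], subTreeTokenIds: [] };
child.parent = root;

// Ignorable code points keep the traversal at the current node instead of failing it.
console.log(traverseTrie(root, ' a ', / /) === child); // true
console.log(traverseTrie(root, 'b', / /)); // undefined - no child under 'b'
```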
@@ -0,0 +1,31 @@
export enum TokenType {
  Raw,
  Kana,
  Romaji,
  Han,
  Pinyin,
}

export interface TokenDefinition {
  id: number;
  type: TokenType;
  text: string;
  codePointLength: number;
}

// [start, end)
export interface OffsetSpan {
  start: number;
  end: number;
}

export type CompressedInvertedIndex = {
  documents: string[];
  tokenTypes: TokenType[];
  tokenReferences: number[][][]; // tokenId -> [documentId, start1, end1, start2, end2, ...][]
  tries: {
    romaji: number[];
    kana: number[];
    other: number[];
  };
};
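The packed `tokenReferences` layout is the least obvious part of this type: each inner array starts with a document id, followed by flattened `[start, end)` pairs. A minimal decoding sketch (the helper name is illustrative, not part of the public API):

```ts
import type { OffsetSpan } from './types';

// Decode one entry of tokenReferences[tokenId] into a document id plus its spans.
const decodeTokenReference = (packed: number[]): { documentId: number; spans: OffsetSpan[] } => {
  const [documentId, ...flat] = packed;
  const spans: OffsetSpan[] = [];
  for (let i = 0; i + 1 < flat.length; i += 2) spans.push({ start: flat[i]!, end: flat[i + 1]! });
  return { documentId: documentId!, spans };
};

// decodeTokenReference([3, 0, 2, 5, 7])
// → { documentId: 3, spans: [{ start: 0, end: 2 }, { start: 5, end: 7 }] }
```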
@@ -0,0 +1,3 @@
import type { OffsetSpan } from './types';

export const getSpanLength = (offset: OffsetSpan) => offset.end - offset.start;
@@ -0,0 +1,73 @@
import path from 'node:path';
import url from 'node:url';

import { TokenizerBuilder } from '@patdx/kuromoji';
import NodeDictionaryLoader from '@patdx/kuromoji/node';

import { buildInvertedIndex, type KuromojiTokenizer } from '../indexer';
import { highlightSearchResult, loadInvertedIndex, searchInvertedIndex } from '../searcher';

let kuromoji: KuromojiTokenizer;

beforeAll(async () => {
  const kuromojiDictPath = path.resolve(url.fileURLToPath(import.meta.resolve('@patdx/kuromoji')), '..', '..', 'dict');
  kuromoji = await new TokenizerBuilder({ loader: new NodeDictionaryLoader({ dic_path: kuromojiDictPath }) }).build();
});

describe('search', () => {
  const testDocuments = [
    'ミーティア',
    'エンドマークに希望と涙を添えて',
    '宵の鳥',
    '僕の和風本当上手',
  ];

  it('should match with mixed search query', () => {
    const compressed = buildInvertedIndex(testDocuments, { kuromoji });
    const invertedIndex = loadInvertedIndex(compressed);

    const results = searchInvertedIndex(invertedIndex, 'bokunoh风じょう');

    // Should have at least one result
    expect(results.length).toBeGreaterThan(0);

    // The first result should be "僕の和風本当上手"
    expect(results[0]!.documentText).toBe('僕の和風本当上手');
  });

  it('should highlight search result correctly', () => {
    const compressed = buildInvertedIndex(testDocuments, { kuromoji });
    const invertedIndex = loadInvertedIndex(compressed);

    const results = searchInvertedIndex(invertedIndex, 'bokunoh风じょう');
    expect(results.length).toBeGreaterThan(0);

    const highlighted = highlightSearchResult(results[0]!);

    // Should be an array of parts
    expect(Array.isArray(highlighted)).toBe(true);
    expect(highlighted.length).toBeGreaterThan(0);

    // Collect highlighted text
    const highlightedTexts = highlighted
      .filter((part): part is { highlight: string } => typeof part !== 'string')
      .map(part => part.highlight);

    expect(highlightedTexts.some(text => text.includes('僕'))).toBe(true);
    expect(highlightedTexts.some(text => text.includes('の'))).toBe(true);
    expect(highlightedTexts.some(text => text.includes('和'))).toBe(true);
    expect(highlightedTexts.some(text => text.includes('風'))).toBe(true);
    expect(highlightedTexts.some(text => text.includes('上'))).toBe(true);
  });

  it('should match romaji input to kana documents', () => {
    const compressed = buildInvertedIndex(testDocuments, { kuromoji });
    const invertedIndex = loadInvertedIndex(compressed);

    // Search for "yoi" should match "宵の鳥"
    const results = searchInvertedIndex(invertedIndex, 'yoi');
    const matchedTexts = results.map(r => r.documentText);

    expect(matchedTexts).toContain('宵の鳥');
  });
});
@@ -0,0 +1,111 @@
import { traverseTrie } from '../common';
import { buildTrie, serializeTrie } from '../indexer/trie';
import { deserializeTrie } from '../searcher/trie';

describe('Trie building', () => {
  it('should build a Trie with multiple different tokens', () => {
    const trie = buildTrie([
      [0, 'hello'],
      [1, 'help'],
      [2, 'world'],
      [3, 'word'],
    ]);

    // Traverse to verify structure
    const helloNode = traverseTrie(trie, 'hello');
    const helpNode = traverseTrie(trie, 'help');
    const worldNode = traverseTrie(trie, 'world');
    const wordNode = traverseTrie(trie, 'word');

    expect(helloNode).toBeDefined();
    expect(helpNode).toBeDefined();
    expect(worldNode).toBeDefined();
    expect(wordNode).toBeDefined();

    // Check token IDs
    expect(helloNode!.tokenIds).toContain(0);
    expect(helpNode!.tokenIds).toContain(1);
    expect(worldNode!.tokenIds).toContain(2);
    expect(wordNode!.tokenIds).toContain(3);

    // Check that 'hel' prefix node has both tokens in subTree
    const helNode = traverseTrie(trie, 'hel');
    expect(helNode).toBeDefined();
    expect(helNode!.subTreeTokenIds).toContain(0);
    expect(helNode!.subTreeTokenIds).toContain(1);
  });

  it('should handle Japanese text tokens', () => {
    const trie = buildTrie([
      [0, 'さくら'],
      [1, 'サクラ'],
      [2, '桜'],
    ]);

    expect(traverseTrie(trie, 'さくら')?.tokenIds).toContain(0);
    expect(traverseTrie(trie, 'サクラ')?.tokenIds).toContain(1);
    expect(traverseTrie(trie, '桜')?.tokenIds).toContain(2);
  });
});

describe('Trie serialization and deserialization', () => {
  it('should serialize and deserialize a Trie correctly', () => {
    const originalTrie = buildTrie([
      [0, 'apple'],
      [1, 'app'],
      [2, 'banana'],
    ]);

    // Serialize
    const serialized = serializeTrie(originalTrie);
    expect(Array.isArray(serialized)).toBe(true);
    expect(serialized.length).toBeGreaterThan(0);

    // Deserialize
    const { root: deserializedTrie, tokenCodePoints } = deserializeTrie(serialized);

    // Verify structure is preserved
    const appleNode = traverseTrie(deserializedTrie, 'apple');
    const appNode = traverseTrie(deserializedTrie, 'app');
    const bananaNode = traverseTrie(deserializedTrie, 'banana');

    expect(appleNode).toBeDefined();
    expect(appNode).toBeDefined();
    expect(bananaNode).toBeDefined();

    expect(appleNode!.tokenIds).toContain(0);
    expect(appNode!.tokenIds).toContain(1);
    expect(bananaNode!.tokenIds).toContain(2);

    // Verify tokenCodePoints map
    expect(tokenCodePoints.get(0)?.join('')).toBe('apple');
    expect(tokenCodePoints.get(1)?.join('')).toBe('app');
    expect(tokenCodePoints.get(2)?.join('')).toBe('banana');

    // Verify subTreeTokenIds are reconstructed
    expect(appNode!.subTreeTokenIds).toContain(0);
    expect(appNode!.subTreeTokenIds).toContain(1);
  });

  it('should preserve parent references after deserialization', () => {
    const originalTrie = buildTrie([
      [0, 'test'],
    ]);

    const serialized = serializeTrie(originalTrie);
    const { root } = deserializeTrie(serialized);

    const testNode = traverseTrie(root, 'test');
    expect(testNode).toBeDefined();

    // Walk back to root via parent references
    let node = testNode;
    let depth = 0;
    while (node?.parent) {
      node = node.parent;
      depth++;
    }
    expect(depth).toBe(4); // 't' -> 'e' -> 's' -> 't' -> root
    expect(node).toBe(root);
  });
});
@@ -0,0 +1,3 @@
export * from './common';
export * from './indexer';
export * from './searcher';
@@ -0,0 +1,103 @@
import { getHanVariants, getPinyinCandidates, isHanCharacter, unionFindSet } from './han';

describe('unionFindSet', () => {
  it('should find self as root initially', () => {
    const ufs = unionFindSet<number>();
    expect(ufs.find(1)).toBe(1);
    expect(ufs.find(2)).toBe(2);
  });

  it('should union two elements', () => {
    const ufs = unionFindSet<number>();
    ufs.union(1, 2);
    expect(ufs.find(1)).toBe(ufs.find(2));
  });

  it('should union multiple elements transitively', () => {
    const ufs = unionFindSet<number>();
    ufs.union(1, 2);
    ufs.union(2, 3);
    ufs.union(4, 5);
    expect(ufs.find(1)).toBe(ufs.find(3));
    expect(ufs.find(1)).not.toBe(ufs.find(4));
    ufs.union(3, 4);
    expect(ufs.find(1)).toBe(ufs.find(5));
  });

  it('should iterate all keys', () => {
    const ufs = unionFindSet<string>();
    ufs.union('a', 'b');
    ufs.union('c', 'd');
    const keys = [...ufs.keys()];
    expect(keys).toContain('a');
    expect(keys).toContain('b');
    expect(keys).toContain('c');
    expect(keys).toContain('d');
  });
});

describe('isHanCharacter', () => {
  it('should return true for CJK characters', () => {
    expect(isHanCharacter('中')).toBe(true);
    expect(isHanCharacter('国')).toBe(true);
    expect(isHanCharacter('日')).toBe(true);
    expect(isHanCharacter('本')).toBe(true);
  });

  it('should return false for non-CJK characters', () => {
    expect(isHanCharacter('a')).toBe(false);
    expect(isHanCharacter('あ')).toBe(false);
    expect(isHanCharacter('ア')).toBe(false);
    expect(isHanCharacter('1')).toBe(false);
  });
});

describe('getHanVariants', () => {
  it('should return variants for simplified/traditional characters', () => {
    // 国 (simplified) and 國 (traditional) should be variants of each other
    const variants1 = getHanVariants('国');
    const variants2 = getHanVariants('國');
    expect(variants1).toContain('国');
    expect(variants1).toContain('國');
    expect(variants2).toContain('国');
    expect(variants2).toContain('國');
  });

  it('should return the character itself for characters without variants', () => {
    const variants = getHanVariants('一');
    expect(variants).toContain('一');
  });

  it('should return empty array for non-Han characters', () => {
    expect(getHanVariants('a')).toEqual([]);
    expect(getHanVariants('あ')).toEqual([]);
  });
});

describe('getPinyinCandidates', () => {
  it('should return pinyin for a Han character', () => {
    const candidates = getPinyinCandidates('中');
    expect(candidates).toContain('zhong');
    expect(candidates).toContain('zh'); // initial
    expect(candidates).toContain('z'); // first letter
  });

  it('should return multiple pinyin for polyphonic characters', () => {
    // 行 can be "xing" or "hang"
    const candidates = getPinyinCandidates('行');
    expect(candidates).toContain('xing');
    expect(candidates).toContain('hang');
  });

  it('should include fuzzy pinyin variants', () => {
    // 风 is "feng", should also have fuzzy variant "fen"
    const candidates = getPinyinCandidates('风');
    expect(candidates).toContain('feng');
    expect(candidates).toContain('fen'); // fuzzy: eng -> en
  });

  it('should return empty array for non-Han characters', () => {
    expect(getPinyinCandidates('a')).toEqual([]);
    expect(getPinyinCandidates('あ')).toEqual([]);
  });
});
@@ -0,0 +1,85 @@
// @ts-expect-error No declaration file
import hkVariants from 'opencc-js/dict/HKVariants';
// @ts-expect-error No declaration file
import hkVariantsRev from 'opencc-js/dict/HKVariantsRev';
// @ts-expect-error No declaration file
import jpVariants from 'opencc-js/dict/JPVariants';
// @ts-expect-error No declaration file
import jpVariantsRev from 'opencc-js/dict/JPVariantsRev';
// @ts-expect-error No declaration file
import stCharacters from 'opencc-js/dict/STCharacters';
// @ts-expect-error No declaration file
import tsCharacters from 'opencc-js/dict/TSCharacters';
// @ts-expect-error No declaration file
import twVariants from 'opencc-js/dict/TWVariants';
// @ts-expect-error No declaration file
import twVariantsRev from 'opencc-js/dict/TWVariantsRev';
import { polyphonic } from 'pinyin-pro';

export const unionFindSet = <T>() => {
  const parent = new Map<T, T>();
  const rank = new Map<T, number>();
  const find = (x: T): T => {
    const p = parent.get(x);
    if (p == null) {
      parent.set(x, x);
      return x;
    } else if (p === x) return x;
    else {
      const root = find(p);
      parent.set(x, root);
      return root;
    }
  };
  const union = (x: T, y: T) => {
    x = find(x);
    y = find(y);
    if (x === y) return;
    const rankX = rank.get(x) ?? 0, rankY = rank.get(y) ?? 0;
    if (rankX < rankY) parent.set(x, y);
    else if (rankX > rankY) parent.set(y, x);
    else {
      parent.set(y, x);
      rank.set(x, rankX + 1);
    }
  };
  const keys = () => parent.keys();
  return { find, union, keys };
};
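
// Usage sketch (illustrative, not part of this module): unionFindSet implements
// union-find with path compression (in find) and union by rank (in union), so
//   const ufs = unionFindSet<string>();
//   ufs.union('a', 'b');
//   ufs.union('b', 'c');
// leaves all three keys with one shared root: ufs.find('a') === ufs.find('c').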

const exchangeMap = (() => {
  const ufs = unionFindSet<string>();
  for (const dict of [hkVariants, hkVariantsRev, jpVariants, jpVariantsRev, stCharacters, tsCharacters, twVariants, twVariantsRev] as string[]) {
    for (const [from, to] of dict.split('|').map(pair => pair.split(' '))) {
      if (!from || !to || [...from].length !== 1 || [...to].length !== 1) continue;
      ufs.union(from, to);
    }
  }
  const map = new Map<string, string[]>();
  for (const key of ufs.keys()) {
    const root = ufs.find(key);
    let list = map.get(root);
    if (!list) map.set(root, list = []);
    if (key !== root) map.set(key, list);
    list.push(key);
  }
  for (const list of map.values()) list.sort();
  return map;
})();

export const isHanCharacter = (phrase: string) => /^[\p{Script=Han}]+$/u.test(phrase);

export const getHanVariants = (character: string) => exchangeMap.get(character) ?? (isHanCharacter(character) ? [character] : []);

const PINYIN_INITIALS: string[] = ['b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'j', 'q', 'x', 'zh', 'ch', 'sh', 'r', 'z', 'c', 's', 'y', 'w'];
const PINYIN_FINALS_FUZZY_MAP: Record<string, string> = { 'ang': 'an', 'eng': 'en', 'ing': 'in' };
export const getPinyinCandidates = (character: string) => {
  const pinyins = polyphonic(character, { type: 'array', toneType: 'none', removeNonZh: true })[0] ?? [];
  return Array.from(new Set(pinyins.filter(fullPinyin => fullPinyin).flatMap(fullPinyin => {
    const initial = PINYIN_INITIALS.find(initial => fullPinyin.startsWith(initial));
    const initialAlphabet = initial?.[0] ?? fullPinyin[0]!;
    const fuzzySuffix = fullPinyin.slice(-3);
    const fuzzyPinyin = fuzzySuffix in PINYIN_FINALS_FUZZY_MAP ? fullPinyin.slice(0, -3) + PINYIN_FINALS_FUZZY_MAP[fuzzySuffix] : undefined;
    return [fullPinyin, initial, initialAlphabet, fuzzyPinyin].filter((s): s is string => !!s);
  })));
};
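
// Worked example (the values match han.test.ts above): for '风' the full pinyin
// is 'feng', the initial/first letter is 'f', and the fuzzy rule eng -> en adds
// 'fen', so getPinyinCandidates('风') includes ['feng', 'f', 'fen']. For '中'
// ('zhong') it includes ['zhong', 'zh', 'z']; 'ong' is not in the fuzzy map, so
// no fuzzy variant is added.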
@@ -0,0 +1,5 @@
export * from './han';
export * from './japanese';
export * from './tokenizer';
export * from './trie';
export * from './inverted-index';
@@ -0,0 +1,46 @@
import { NORMALIZE_RULES_KANA_DAKUTEN, NORMALIZE_RULES_ROMAJI } from './japanese';
import { createTokenizer, type TokenizerOptions } from './tokenizer';
import { buildTrie, graftTriePaths, serializeTrie } from './trie';
import type { CompressedInvertedIndex, TokenDefinition } from '../common/types';
import { TokenType } from '../common/types';

const buildTypedTrie = (tokens: TokenDefinition[], typePredicate: (tokenType: TokenType) => boolean) =>
  buildTrie(tokens.filter(token => typePredicate(token.type)).map(token => [token.id, token.text]));

export const buildInvertedIndex = (documents: string[], tokenizerOptions: TokenizerOptions) => {
  const tokenizer = createTokenizer(tokenizerOptions);
  const documentTokens = documents.map(document => tokenizer.tokenize(document));

  const tokenDefinitions = [...tokenizer.tokens.values()];
  const romajiRoot = buildTypedTrie(tokenDefinitions, type => type === TokenType.Romaji);
  const kanaRoot = buildTypedTrie(tokenDefinitions, type => type === TokenType.Kana);
  const otherRoot = buildTypedTrie(tokenDefinitions, type => type !== TokenType.Romaji && type !== TokenType.Kana);
  graftTriePaths(romajiRoot, NORMALIZE_RULES_ROMAJI);
  graftTriePaths(kanaRoot, NORMALIZE_RULES_KANA_DAKUTEN);

  const invertedIndex: CompressedInvertedIndex = {
    documents,
    tokenTypes: tokenDefinitions.map(token => token.type),
    tokenReferences: Array.from({ length: tokenDefinitions.length }, () => []),
    tries: {
      romaji: serializeTrie(romajiRoot),
      kana: serializeTrie(kanaRoot),
      other: serializeTrie(otherRoot),
    },
  };
  for (const [documentId, tokens] of documentTokens.entries()) {
    const tokenOccurrences = new Map<number, number[]>();
    for (const token of tokens) {
      let occurrences = tokenOccurrences.get(token.id);
      if (!occurrences) {
        occurrences = [];
        tokenOccurrences.set(token.id, occurrences);
      }
      occurrences.push(token.start, token.end);
    }
    for (const [tokenId, occurrences] of tokenOccurrences) {
      invertedIndex.tokenReferences[tokenId]!.push([documentId, ...occurrences]);
    }
  }
  return invertedIndex;
};
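
// Layout sketch of the compressed index built above (illustrative): each entry
// of tokenReferences[tokenId] is one flat array
//   [documentId, start1, end1, start2, end2, ...]
// i.e. a document id followed by the code-point offset pairs of every occurrence
// of that token in that document; the searcher side re-expands the pairs into
// { start, end } spans when loading.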
@@ -0,0 +1,66 @@
import path from 'node:path';
import url from 'node:url';

import { TokenizerBuilder } from '@patdx/kuromoji';
import NodeDictionaryLoader from '@patdx/kuromoji/node';

import { getAllKanaReadings, toRomajiStrictly } from './japanese';
import type { KuromojiTokenizer } from './tokenizer';

let kuromoji: KuromojiTokenizer;

beforeAll(async () => {
  const kuromojiDictPath = path.resolve(url.fileURLToPath(import.meta.resolve('@patdx/kuromoji')), '..', '..', 'dict');
  kuromoji = await new TokenizerBuilder({ loader: new NodeDictionaryLoader({ dic_path: kuromojiDictPath }) }).build();
});

describe('toRomajiStrictly', () => {
  it('should convert basic kana to romaji', () => {
    expect(toRomajiStrictly('あ')).toBe('a');
    expect(toRomajiStrictly('か')).toBe('ka');
    expect(toRomajiStrictly('さくら')).toBe('sakura');
  });

  it('should convert katakana to romaji', () => {
    expect(toRomajiStrictly('ア')).toBe('a');
    expect(toRomajiStrictly('カ')).toBe('ka');
    expect(toRomajiStrictly('サクラ')).toBe('sakura');
  });

  it('should handle long vowels', () => {
    expect(toRomajiStrictly('おう')).toBe('ou');
    expect(toRomajiStrictly('おお')).toBe('oo');
  });

  it('should return empty string for invalid first character', () => {
    expect(toRomajiStrictly('ー')).toBe(''); // prolonged sound mark cannot be first
    expect(toRomajiStrictly('ゃ')).toBe(''); // small ya cannot be first
  });

  it('should return empty string for invalid last character', () => {
    expect(toRomajiStrictly('っ')).toBe(''); // small tsu cannot be last
  });

  it('should handle gemination (small tsu)', () => {
    expect(toRomajiStrictly('かった')).toBe('katta');
  });
});

describe('getAllKanaReadings', () => {
  it('should return katakana reading for pure kana input', () => {
    const readings = getAllKanaReadings(kuromoji, 'あ');
    expect(readings).toContain('ア');
  });

  it('should return readings for kanji', () => {
    const readings = getAllKanaReadings(kuromoji, '僕');
    expect(readings.length).toBeGreaterThan(0);
    // 僕 should have reading ボク
    expect(readings).toContain('ボク');
  });

  it('should return readings for compound words', () => {
    const readings = getAllKanaReadings(kuromoji, '和風');
    expect(readings.length).toBeGreaterThan(0);
  });
});
@@ -0,0 +1,158 @@
import { fromKana } from 'hepburn';

import type { KuromojiTokenizer } from './tokenizer';
import { toKatakana } from '../common';

// We have normalized all other sound marks to \u3099 and \u309A (combining katakana-hiragana voiced/semi-voiced sound marks)
export const isMaybeJapanese = (phrase: string) => /^[\p{Script=Han}\u3041-\u309F\u30A0-\u30FF\u3005\u3006\u30FC\u3099\u309A]+$/u.test(phrase);

// See also normalize.ts
export const isJapaneseSoundMark = (phrase: string) => /^[\u3099\u309A]+$/.test(phrase);
export const stripJapaneseSoundMarks = (phrase: string) => phrase.replaceAll('\u3099', '').replaceAll('\u309A', '');

export const isKanaSingle = (char: string) => {
  const code = char.charCodeAt(0);
  return (code >= 0x3041 && code <= 0x309F) || (code >= 0x30A0 && code <= 0x30FF);
};
export const isKana = (phrase: string) => [...phrase].every(isKanaSingle);

const KANAS_CANNOT_BE_FIRST = [
  'ァ', 'ィ', 'ゥ', 'ェ', 'ォ',
  'ぁ', 'ぃ', 'ぅ', 'ぇ', 'ぉ',
  'ャ', 'ュ', 'ョ',
  'ゃ', 'ゅ', 'ょ',
  'ヮ', 'ゎ',
  'ㇰ', 'ㇱ', 'ㇲ', 'ㇳ', 'ㇴ', 'ㇵ', 'ㇶ', 'ㇷ', 'ㇸ', 'ㇹ', 'ㇺ', 'ㇻ', 'ㇼ', 'ㇽ', 'ㇾ', 'ㇿ',
  'ー',
];
const KANAS_CANNOT_BE_LAST = [
  'ッ', 'っ',
];
export const toRomajiStrictly = (kana: string) => {
  if (KANAS_CANNOT_BE_FIRST.includes(kana[0]!)) return '';
  if (KANAS_CANNOT_BE_LAST.includes(kana[kana.length - 1]!)) return '';
  const romaji = fromKana(kana).toLowerCase()
    .replaceAll('ā', 'aa')
    .replaceAll('ī', 'ii')
    .replaceAll('ū', 'uu')
    .replaceAll('ē', 'ee')
    .replaceAll('ō', 'ou');
  if (!romaji.match(/^[a-z]+$/)) return '';
  return romaji;
};

export const createTranscriptionEnumerator = (
  isValidPhrase: (codePoints: string[], start: number, length: number) => boolean,
  getAllTranscriptions: (phrase: string) => string[],
) => (codePoints: string[]) => {
  const toKey = (start: number, length: number) => `${start}:${length}`;
  const resultMap = new Map<string, { start: number; length: number; transcriptions: string[] }>();
  for (let phraseLength = 1; phraseLength <= codePoints.length; phraseLength++) for (let start = 0; start + phraseLength <= codePoints.length; start++) {
    if (!isValidPhrase(codePoints, start, phraseLength)) continue;
    const phrase = codePoints.slice(start, start + phraseLength).join('');
    const atomicTranscriptions = [...new Set(getAllTranscriptions(phrase))].filter(candidateTranscription => {
      if (!candidateTranscription) return false;
      // Ensure the transcription is atomic (not a combination of multiple shorter transcriptions split at some midpoints)
      type State = { phrasePosition: number; transcriptionPosition: number };
      const toStateKey = (state: State) => `${state.phrasePosition}:${state.transcriptionPosition}`;
      const visitedStates = new Set<string>();
      const queue: State[] = [{ phrasePosition: 0, transcriptionPosition: 0 }];
      while (queue.length > 0) {
        const { phrasePosition, transcriptionPosition } = queue.shift()!;
        for (let prefixLength = 1; prefixLength <= phraseLength - phrasePosition; prefixLength++) {
          const prefixResult = resultMap.get(toKey(start + phrasePosition, prefixLength));
          if (!prefixResult) continue;
          for (const transcription of prefixResult.transcriptions) {
            if (candidateTranscription.slice(transcriptionPosition, transcriptionPosition + transcription.length) === transcription) {
              const nextState: State = { phrasePosition: phrasePosition + prefixLength, transcriptionPosition: transcriptionPosition + transcription.length };
              if (nextState.phrasePosition === phraseLength && nextState.transcriptionPosition === candidateTranscription.length) return false; // Found a valid combination
              if (visitedStates.has(toStateKey(nextState))) continue;
              visitedStates.add(toStateKey(nextState));
              queue.push(nextState);
            }
          }
        }
      }
      return true;
    });
    if (atomicTranscriptions.length > 0) resultMap.set(toKey(start, phraseLength), { start, length: phraseLength, transcriptions: atomicTranscriptions });
  }
  return [...resultMap.values()];
};
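
// Worked example of the atomicity check above (illustrative): when the phrase
// 'さくら' produces the candidate transcription 'sakura' while the shorter
// sub-phrases already contributed 'sa' (さ) and 'kura' (くら) to resultMap, the
// BFS over (phrasePosition, transcriptionPosition) states reaches the final
// state (3, 6) by composing 'sa' + 'kura', so 'sakura' is rejected for the full
// phrase: it is just a concatenation of shorter transcriptions, and keeping it
// would only duplicate matches.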

export const getAllKanaReadings = (kuromoji: KuromojiTokenizer, phrase: string) => Array.from(new Set(
  [
    ...isKana(phrase) ? [toKatakana(phrase)] : [],
    ...isKana(phrase) && [...phrase].length === 1 ? [] : ((kuromoji.token_info_dictionary.target_map[kuromoji.viterbi_builder.trie.lookup(phrase)] ?? [])
      .map(id => kuromoji.formatter.formatEntry(
        id, 0, 'KNOWN',
        kuromoji.token_info_dictionary.getFeatures(id as unknown as string)?.split(',') ?? [],
      ).reading)
      .filter((reading): reading is string => !!reading))
      .map(toKatakana),
  ],
));

const createNormalizer = (rules: Record<string, string>) => (text: string) => {
  while (true) {
    const beforeCurrentIteration = text;
    for (const [from, to] of Object.entries(rules)) text = text.replaceAll(from, to);
    if (text === beforeCurrentIteration) break;
  }
  return text;
};

export const NORMALIZE_RULES_ROMAJI: Record<string, string> = {
  // Remove all long vowels (sa-ba- -> saba)
  '-': '',
  // Collapse consecutive vowels
  'aa': 'a',
  'ii': 'i',
  'uu': 'u',
  'ee': 'e',
  'oo': 'o',
  'ou': 'o',
  // mb/mp/mm -> nb/np/nm (shimbun -> shinbun)
  'mb': 'nb',
  'mp': 'np',
  'mm': 'nm',
  // Others
  'sha': 'sya',
  'tsu': 'tu',
  'chi': 'ti',
  'shi': 'si',
  'ji': 'zi',
};
export const normalizeRomaji = createNormalizer(NORMALIZE_RULES_ROMAJI);
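
// Example (follows from the rules above): normalizeRomaji('toukyou') collapses
// both 'ou' sequences to 'o' and returns 'tokyo'. The rules are re-applied until
// a fixed point, so chained rewrites such as 'oou' -> 'ou' -> 'o' also settle.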

export const NORMALIZE_RULES_KANA_DAKUTEN: Record<string, string> = {
  'う\u3099': 'ゔ',
  'か\u3099': 'が', 'き\u3099': 'ぎ', 'く\u3099': 'ぐ', 'け\u3099': 'げ', 'こ\u3099': 'ご',
  'さ\u3099': 'ざ', 'し\u3099': 'じ', 'す\u3099': 'ず', 'せ\u3099': 'ぜ', 'そ\u3099': 'ぞ',
  'た\u3099': 'だ', 'ち\u3099': 'ぢ', 'つ\u3099': 'づ', 'て\u3099': 'で', 'と\u3099': 'ど',
  'は\u3099': 'ば', 'ひ\u3099': 'び', 'ふ\u3099': 'ぶ', 'へ\u3099': 'べ', 'ほ\u3099': 'ぼ',
  'は\u309A': 'ぱ', 'ひ\u309A': 'ぴ', 'ふ\u309A': 'ぷ', 'へ\u309A': 'ぺ', 'ほ\u309A': 'ぽ',
  'ゝ\u3099': 'ゞ',

  'ウ\u3099': 'ヴ',
  'カ\u3099': 'ガ', 'キ\u3099': 'ギ', 'ク\u3099': 'グ', 'ケ\u3099': 'ゲ', 'コ\u3099': 'ゴ',
  'サ\u3099': 'ザ', 'シ\u3099': 'ジ', 'ス\u3099': 'ズ', 'セ\u3099': 'ゼ', 'ソ\u3099': 'ゾ',
  'タ\u3099': 'ダ', 'チ\u3099': 'ヂ', 'ツ\u3099': 'ヅ', 'テ\u3099': 'デ', 'ト\u3099': 'ド',
  'ハ\u3099': 'バ', 'ヒ\u3099': 'ビ', 'フ\u3099': 'ブ', 'ヘ\u3099': 'ベ', 'ホ\u3099': 'ボ',
  'ハ\u309A': 'パ', 'ヒ\u309A': 'ピ', 'フ\u309A': 'プ', 'ヘ\u309A': 'ペ', 'ホ\u309A': 'ポ',
  'ワ\u3099': 'ヷ', 'ヰ\u3099': 'ヸ', 'ヱ\u3099': 'ヹ', 'ヲ\u3099': 'ヺ',
  'ヽ\u3099': 'ヾ',
};
export const normalizeKanaDakuten = createNormalizer(NORMALIZE_RULES_KANA_DAKUTEN);

const isValidJapanesePhrase = (codePoints: string[], start: number, length: number) =>
  // Skip splittings that cause sound marks to occur in the first position of a phrase
  !isJapaneseSoundMark(codePoints[start]!) && (start + length === codePoints.length || !isJapaneseSoundMark(codePoints[start + length]!));
export const createKanaTranscriptionEnumerator = (kuromoji: KuromojiTokenizer) => createTranscriptionEnumerator(
  isValidJapanesePhrase,
  phrase => getAllKanaReadings(kuromoji, stripJapaneseSoundMarks(normalizeKanaDakuten(phrase))),
);
export const createRomajiTranscriptionEnumerator = (kuromoji: KuromojiTokenizer) => createTranscriptionEnumerator(
  isValidJapanesePhrase,
  phrase => getAllKanaReadings(kuromoji, stripJapaneseSoundMarks(normalizeKanaDakuten(phrase))).map(kana => normalizeRomaji(toRomajiStrictly(kana))),
);
@@ -0,0 +1,166 @@
import path from 'node:path';
import url from 'node:url';

import { TokenizerBuilder } from '@patdx/kuromoji';
import NodeDictionaryLoader from '@patdx/kuromoji/node';

import { createTokenizer, type KuromojiTokenizer } from './tokenizer';
import { TokenType } from '../common/types';

let kuromoji: KuromojiTokenizer;

beforeAll(async () => {
  const kuromojiDictPath = path.resolve(url.fileURLToPath(import.meta.resolve('@patdx/kuromoji')), '..', '..', 'dict');
  kuromoji = await new TokenizerBuilder({ loader: new NodeDictionaryLoader({ dic_path: kuromojiDictPath }) }).build();
});

describe('tokenizer', () => {
  it('should tokenize mixed Japanese text', () => {
    const tokenizer = createTokenizer({ kuromoji });
    const tokens = tokenizer.tokenize('僕の和風本当上手');

    // Get all token definitions
    const tokenDefs = [...tokenizer.tokens.values()];

    // Should have tokens of various types
    const types = new Set(tokenDefs.map(t => t.type));
    expect(types.has(TokenType.Han)).toBe(true);
    expect(types.has(TokenType.Pinyin)).toBe(true);
    expect(types.has(TokenType.Kana)).toBe(true);
    expect(types.has(TokenType.Romaji)).toBe(true);

    const getTokenTextsAt = (pos: number, type: TokenType) => tokens
      .filter(t => t.start <= pos && t.end > pos && tokenDefs.find(d => d.id === t.id)?.type === type)
      .map(t => tokenDefs.find(d => d.id === t.id)!.text);

    // Position 0: 僕
    expect(getTokenTextsAt(0, TokenType.Han)).toContain('僕');
    expect(getTokenTextsAt(0, TokenType.Pinyin)).toContain('pu');
    expect(getTokenTextsAt(0, TokenType.Kana)).toContain('ボク');
    expect(getTokenTextsAt(0, TokenType.Romaji)).toContain('boku');

    // Position 1: の (hiragana, no Han/Pinyin)
    expect(getTokenTextsAt(1, TokenType.Han)).toEqual([]);
    expect(getTokenTextsAt(1, TokenType.Pinyin)).toEqual([]);
    expect(getTokenTextsAt(1, TokenType.Kana)).toContain('ノ');
    expect(getTokenTextsAt(1, TokenType.Romaji)).toContain('no');

    // Position 2: 和
    expect(getTokenTextsAt(2, TokenType.Han)).toContain('和');
    expect(getTokenTextsAt(2, TokenType.Pinyin)).toContain('he');
    expect(getTokenTextsAt(2, TokenType.Kana)).toContain('ワ');
    expect(getTokenTextsAt(2, TokenType.Romaji)).toContain('wa');

    // Position 3: 風
    expect(getTokenTextsAt(3, TokenType.Han)).toContain('風');
    expect(getTokenTextsAt(3, TokenType.Han)).toContain('风'); // simplified variant
    expect(getTokenTextsAt(3, TokenType.Pinyin)).toContain('feng');
    expect(getTokenTextsAt(3, TokenType.Kana)).toContain('フウ');
    expect(getTokenTextsAt(3, TokenType.Romaji)).toContain('fu');

    // Position 4: 本
    expect(getTokenTextsAt(4, TokenType.Han)).toContain('本');
    expect(getTokenTextsAt(4, TokenType.Pinyin)).toContain('ben');
    expect(getTokenTextsAt(4, TokenType.Kana)).toContain('ホン');
    expect(getTokenTextsAt(4, TokenType.Romaji)).toContain('hon');

    // Position 5: 当
    expect(getTokenTextsAt(5, TokenType.Han)).toContain('当');
    expect(getTokenTextsAt(5, TokenType.Han)).toContain('當'); // traditional variant
    expect(getTokenTextsAt(5, TokenType.Pinyin)).toContain('dang');
    expect(getTokenTextsAt(5, TokenType.Kana)).toContain('トウ');
    expect(getTokenTextsAt(5, TokenType.Romaji)).toContain('to'); // normalized: tou -> to

    // Position 6: 上
    expect(getTokenTextsAt(6, TokenType.Han)).toContain('上');
    expect(getTokenTextsAt(6, TokenType.Pinyin)).toContain('shang');
    expect(getTokenTextsAt(6, TokenType.Kana)).toContain('ジョウ');
    expect(getTokenTextsAt(6, TokenType.Romaji)).toContain('jo'); // normalized: jou -> jo

    // Position 7: 手
    expect(getTokenTextsAt(7, TokenType.Han)).toContain('手');
    expect(getTokenTextsAt(7, TokenType.Pinyin)).toContain('shou');
    expect(getTokenTextsAt(7, TokenType.Kana)).toContain('シュ');
    expect(getTokenTextsAt(7, TokenType.Romaji)).toContain('shu');

    // Check that tokens cover the entire input
    expect(tokens.length).toBeGreaterThan(0);

    // Check some specific token definitions exist
    const hanTokenTexts = tokenDefs.filter(t => t.type === TokenType.Han).map(t => t.text);
    expect(hanTokenTexts).toContain('僕');
    expect(hanTokenTexts).toContain('和');
    expect(hanTokenTexts).toContain('風');

    // Check kana readings exist for kanji
    const kanaTokenTexts = tokenDefs.filter(t => t.type === TokenType.Kana).map(t => t.text);
    expect(kanaTokenTexts).toContain('ボク'); // 僕 -> ボク

    // Check romaji readings exist
    const romajiTokenTexts = tokenDefs.filter(t => t.type === TokenType.Romaji).map(t => t.text);
    expect(romajiTokenTexts).toContain('boku'); // 僕 -> boku
  });

  it('should not create duplicate tokens when tokenizing multiple documents', () => {
    const tokenizer = createTokenizer({ kuromoji });

    // Tokenize multiple music names that share some characters
    tokenizer.tokenize('僕の和風本当上手');
    tokenizer.tokenize('僕');
    tokenizer.tokenize('和風');

    // Check that there are no duplicate tokens
    const tokenDefs = [...tokenizer.tokens.values()];
    const tokenKeys = tokenDefs.map(t => `${t.type}:${t.text}`);
    const uniqueKeys = new Set(tokenKeys);

    expect(tokenKeys.length).toBe(uniqueKeys.size);

    // Also check that IDs are unique
    const ids = tokenDefs.map(t => t.id);
    const uniqueIds = new Set(ids);
    expect(ids.length).toBe(uniqueIds.size);
  });

  it('should handle Raw tokens for non-CJK characters', () => {
    const tokenizer = createTokenizer({ kuromoji });
    tokenizer.tokenize('a-b');

    const tokenDefs = [...tokenizer.tokens.values()];
    const rawTokenTexts = tokenDefs.filter(t => t.type === TokenType.Raw).map(t => t.text);

    expect(rawTokenTexts).toContain('a'); // normalized to lowercase
    expect(rawTokenTexts).toContain('-');
    expect(rawTokenTexts).toContain('b');
  });

  it('should tokenize compound word "今日" with both individual and combined readings', () => {
    const tokenizer = createTokenizer({ kuromoji });
    const tokens = tokenizer.tokenize('今日');
    const tokenDefs = [...tokenizer.tokens.values()];

    const getTokensWithSpan = (type: TokenType, start: number, end: number) => tokens
      .filter(t => t.start === start && t.end === end && tokenDefs.find(d => d.id === t.id)?.type === type)
      .map(t => tokenDefs.find(d => d.id === t.id)!.text);

    // Individual character readings at position 0: 今
    expect(getTokensWithSpan(TokenType.Han, 0, 1)).toContain('今');
    expect(getTokensWithSpan(TokenType.Pinyin, 0, 1)).toContain('jin');
    expect(getTokensWithSpan(TokenType.Kana, 0, 1)).toContain('コン');
    expect(getTokensWithSpan(TokenType.Kana, 0, 1)).toContain('イマ');
    expect(getTokensWithSpan(TokenType.Romaji, 0, 1)).toContain('kon');
    expect(getTokensWithSpan(TokenType.Romaji, 0, 1)).toContain('ima');

    // Individual character readings at position 1: 日
    expect(getTokensWithSpan(TokenType.Han, 1, 2)).toContain('日');
    expect(getTokensWithSpan(TokenType.Pinyin, 1, 2)).toContain('ri');
    expect(getTokensWithSpan(TokenType.Kana, 1, 2)).toContain('ニチ');
    expect(getTokensWithSpan(TokenType.Kana, 1, 2)).toContain('ヒ');
    expect(getTokensWithSpan(TokenType.Romaji, 1, 2)).toContain('niti');
    expect(getTokensWithSpan(TokenType.Romaji, 1, 2)).toContain('hi');

    // Combined reading for "今日" [0, 2] - this is an indivisible compound word
    expect(getTokensWithSpan(TokenType.Kana, 0, 2)).toContain('キョウ');
    expect(getTokensWithSpan(TokenType.Romaji, 0, 2)).toContain('kyo'); // normalized: kyou -> kyo
  });
});
@@ -0,0 +1,93 @@
import type { TokenizerBuilder } from '@patdx/kuromoji';

import { getHanVariants, getPinyinCandidates } from './han';
import { createKanaTranscriptionEnumerator, createRomajiTranscriptionEnumerator, isMaybeJapanese } from './japanese';
import { normalizeByCodePoint } from '../common/normalize';
import { TokenType, type TokenDefinition } from '../common/types';

export interface Token {
  id: number;
  start: number;
  end: number;
}

export type KuromojiTokenizer = Awaited<ReturnType<TokenizerBuilder['build']>>;
export interface TokenizerOptions {
  kuromoji: KuromojiTokenizer;
}
export const createTokenizer = (options: TokenizerOptions) => {
  const tokens = new Map<string, TokenDefinition>();
  let nextId = 0;
  const ensureToken = (type: TokenType, text: string) => {
    const key = `${type}:${text}`;
    let tokenDefinition = tokens.get(key);
    if (tokenDefinition) return tokenDefinition;
    tokenDefinition = { id: nextId++, type, text, codePointLength: [...text].length };
    tokens.set(key, tokenDefinition);
    return tokenDefinition;
  };

  const enumerateAllKanaCombinations = createKanaTranscriptionEnumerator(options.kuromoji);
  const enumerateAllRomajiCombinations = createRomajiTranscriptionEnumerator(options.kuromoji);
  const tokenize = (text: string) => {
    const results: Token[] = [];
    const emitter = (start: number, end: number) => (type: TokenType, text: string) => results.push({ id: ensureToken(type, text).id, start, end });

    const emitMaybeJapanese = (codePoints: string[], offset: number) => {
      for (const { start, length, transcriptions } of enumerateAllKanaCombinations(codePoints)) {
        const emit = emitter(offset + start, offset + start + length);
        for (const transcription of transcriptions) emit(TokenType.Kana, transcription);
      }
      for (const { start, length, transcriptions } of enumerateAllRomajiCombinations(codePoints)) {
        const emit = emitter(offset + start, offset + start + length);
        for (const transcription of transcriptions) emit(TokenType.Romaji, transcription);
      }
      for (let i = 0; i < codePoints.length; i++) {
        // A single character may have not only kana readings but also Chinese pronunciations and Simplified/Traditional/Japanese variants.
        const character = codePoints[i]!;
        const hanAlternates = getHanVariants(character); // All possible variant characters (Simplified/Traditional/Japanese)
        const pinyinAlternates = Array.from(new Set(hanAlternates.flatMap(han => getPinyinCandidates(han)))); // All possible pinyin candidates
        const emit = emitter(offset + i, offset + i + 1);
        for (const han of hanAlternates) emit(TokenType.Han, han);
        for (const pinyin of pinyinAlternates) emit(TokenType.Pinyin, pinyin);
      }
    };
    const emitRaw = (codePoint: string, offset: number) => emitter(offset, offset + 1)(TokenType.Raw, codePoint);

    const codePoints = [...normalizeByCodePoint(text)];
    for (let start = 0; start < codePoints.length;) {
      const codePoint = codePoints[start]!;

      const consequentCharsets = [
        { is: isMaybeJapanese, emit: emitMaybeJapanese },
      ];
      let emitted = false;
      for (const { is, emit } of consequentCharsets) {
        let length = 0;
        while (start + length < codePoints.length && is(codePoints[start + length]!)) length++;
        if (length > 0) {
          emit(codePoints.slice(start, start + length), start);
          start += length;
          emitted = true;
          break;
        }
      }
      if (emitted) continue;

      // Skip whitespaces
      if (/\s/.test(codePoint)) {
        start++;
        continue;
      }

      emitRaw(codePoint, start);
      start++;
    }
    return results;
  };

  return {
    tokens,
    tokenize,
  };
};
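
// Output sketch (illustrative; the concrete values follow from tokenizer.test.ts
// above): tokenize() returns position-stamped token references, while the
// definitions themselves are interned once in `tokens`. For the input '僕' the
// results include entries of the shape { id, start: 0, end: 1 } whose definitions
// cover TokenType.Han '僕', TokenType.Pinyin 'pu', TokenType.Kana 'ボク' and
// TokenType.Romaji 'boku', all over the same code-point span.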
@@ -0,0 +1,51 @@
import { traverseTrie } from '../common';
import { buildTrie, graftTriePaths } from './trie';

describe('graftTriePaths', () => {
  it('should graft paths according to normalization rules', () => {
    // Build a trie with tokens containing normalized forms
    const trie = buildTrie([
      [0, 'sya'], // normalized form of "sha"
      [1, 'tu'], // normalized form of "tsu"
    ]);

    // Graft paths so that "sha" -> "sya" and "tsu" -> "tu"
    graftTriePaths(trie, {
      sha: 'sya',
      tsu: 'tu',
    });

    // Now we should be able to traverse using both the original and grafted paths
    const syaNode = traverseTrie(trie, 'sya');
    const shaNode = traverseTrie(trie, 'sha');
    expect(syaNode).toBeDefined();
    expect(shaNode).toBeDefined();
    expect(syaNode).toBe(shaNode); // Both paths should lead to the same node

    const tuNode = traverseTrie(trie, 'tu');
    const tsuNode = traverseTrie(trie, 'tsu');
    expect(tuNode).toBeDefined();
    expect(tsuNode).toBeDefined();
    expect(tuNode).toBe(tsuNode);
  });

  it('should handle chained graft rules', () => {
    const trie = buildTrie([
      [0, 'o'], // normalized vowel
    ]);

    // Chain: "ou" -> "o", "oo" -> "o"
    graftTriePaths(trie, {
      ou: 'o',
      oo: 'o',
    });

    const oNode = traverseTrie(trie, 'o');
    const ouNode = traverseTrie(trie, 'ou');
    const ooNode = traverseTrie(trie, 'oo');

    expect(oNode).toBeDefined();
    expect(ouNode).toBe(oNode);
    expect(ooNode).toBe(oNode);
  });
});
@@ -0,0 +1,115 @@
import { traverseTrie, type TrieNode } from '../common';

const newNode = (parent?: TrieNode): TrieNode => ({ parent, children: new Map(), tokenIds: [], subTreeTokenIds: [] });

// Assume tokens are unique.
export const buildTrie = (tokens: [id: number, text: string][]) => {
  const root = newNode(undefined);
  for (const [id, text] of tokens) {
    let node = root;
    for (const char of text) {
      const codePoint = char.codePointAt(0)!;
      let childNode = node.children.get(codePoint);
      if (!childNode) {
        childNode = newNode(node);
        node.children.set(codePoint, childNode);
      }
      node = childNode;
      node.subTreeTokenIds.push(id);
    }
    node.tokenIds.push(id);
  }
  return root;
};

export const graftTriePaths = (root: TrieNode, rules: Record<string, string>) => {
  for (const [inputPhrase, graftTo] of Object.entries(rules)) if ([...graftTo].length > [...inputPhrase].length) throw new Error(`Graft rule ${inputPhrase} -> ${graftTo} maps to longer string and may cause infinite loop`);
  const visitedNodes = new Set<TrieNode>();
  const graftFromNode = (node: TrieNode, recursiveChildren: boolean) => {
    if (visitedNodes.has(node)) return;
    visitedNodes.add(node);
    if (recursiveChildren) for (const [, childNode] of node.children) graftFromNode(childNode, true);
    while (true) {
      const nodesWithNewGraftedChildren = new Map<TrieNode, /* depth from initial node */ number>();
      for (const [inputPhrase, graftTo] of Object.entries(rules)) {
        const targetNode = traverseTrie(node, graftTo);
        if (!targetNode) continue;
        const codePoints = [...inputPhrase];
        const graftedPath = Array.from<TrieNode>({ length: codePoints.length - 1 });
        let isGrafted = false;
        let currentNode = node;
        for (let i = 0; i < codePoints.length; i++) {
          const codePoint = codePoints[i]!.codePointAt(0)!;
          let childNode = currentNode.children.get(codePoint);
          if (i === codePoints.length - 1) {
            if (childNode) {
              if (childNode !== targetNode) throw new Error(`Grafted path ${inputPhrase} conflicts with existing path`);
              // Already grafted
            } else {
              currentNode.children.set(codePoint, childNode = targetNode);
              isGrafted = true;
            }
          } else {
            if (!childNode) {
              childNode = newNode(currentNode);
              childNode.subTreeTokenIds = targetNode.subTreeTokenIds;
              currentNode.children.set(codePoint, childNode);
            } else {
              // Part of another grafted path?
              childNode.subTreeTokenIds = Array.from(new Set([...childNode.subTreeTokenIds, ...targetNode.subTreeTokenIds]));
            }
            graftedPath[i] = currentNode = childNode;
          }
        }
        if (isGrafted) for (const [i, nodeToAdd] of graftedPath.entries()) nodesWithNewGraftedChildren.set(nodeToAdd, i + 1);
      }

      if (nodesWithNewGraftedChildren.size > 0) {
        // Re-check graft rules on the newly grafted path
        // 1. No need to recurse into other children (not on this path) since their children are not affected
        // 2. No need to consider ancestors of this node since they're handled later (we run in DFS order)
        const sortedNodes = [...nodesWithNewGraftedChildren.entries()].sort((a, b) => b[1] - a[1]);
        for (const [changedNode] of sortedNodes) graftFromNode(changedNode, false);
      } else {
        // No new grafts applied
        break;
      }
    }
  };
  graftFromNode(root, true);
};

export const serializeTrie = (root: TrieNode) => {
  const nodeEntries = new Map<TrieNode, {
    id: number;
    visited: boolean;
    data?: number[];
  }>();
  let currentId = 0;
  const getNodeEntry = (node: TrieNode) => {
    let entry = nodeEntries.get(node);
    if (!entry) {
      entry = { id: ++currentId, visited: false };
      nodeEntries.set(node, entry);
    }
    return entry;
  };
  const serializeNode = (node: TrieNode) => {
    const entry = getNodeEntry(node);
    if (entry.visited) return entry.id;
    entry.visited = true;
    const children = [...node.children.entries()].map(([codePoint, childNode]) => [codePoint, serializeNode(childNode)] as const);
    entry.data = [
      node.parent ? getNodeEntry(node.parent).id : 0,
      ...children.map(child => child[0]), // code points
      ...children.map(child => child[1]), // child node ids
      // End of children list (values <= 0 are neither valid code points nor node IDs)
      ...node.tokenIds.length > 0
        ? node.tokenIds.map(tokenId => -(tokenId + 1)) // Use the negative value of (tokenId + 1)
        : [0], // End of children list, no token IDs (token IDs are encoded as negative values)
    ];
    return entry.id;
  };
  serializeNode(root);
  return [...nodeEntries.values()].sort((a, b) => a.id - b.id).flatMap(node => node.data ?? []);
};
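
// Encoding sketch (illustrative): each node serializes to the flat record
//   [parentId, ...childCodePoints, ...childNodeIds, ...tokenIds or 0]
// with 1-based ids (0 = no parent) and token ids stored as -(tokenId + 1), so a
// non-positive value doubles as the record terminator. For the single-token trie
// buildTrie([[0, 'a']]) the root (id 1) serializes to [0, 97, 2, 0] and the 'a'
// node (id 2) to [1, -1], giving [0, 97, 2, 0, 1, -1] overall.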
@@ -0,0 +1,26 @@
import { getSpanLength, TokenType } from '../common';
import type { SearchResult } from './search';

export type HighlightedTextPart = /* not highlighted */ string | /* highlighted */ { highlight: string };

export const highlightSearchResult = (resultDocument: SearchResult): HighlightedTextPart[] => {
  const highlightResult: HighlightedTextPart[] = [];
  let previousHighlightEnd = 0;
  for (const token of resultDocument.tokens) {
    const notHighlightedText = resultDocument.documentCodePoints.slice(previousHighlightEnd, token.documentOffset.start).join('');
    if (notHighlightedText.length > 0) highlightResult.push(notHighlightedText);
    const highlightEnd = token.isTokenPrefixMatching && (token.definition.type === TokenType.Kana)
      ? token.documentOffset.start + Math.max(
        1,
        Math.round(
          getSpanLength(token.documentOffset) *
            Math.min(1, getSpanLength(token.inputOffset) / token.definition.codePointLength),
        ),
      )
      : token.documentOffset.end;
    highlightResult.push({ highlight: resultDocument.documentCodePoints.slice(token.documentOffset.start, highlightEnd).join('') });
    previousHighlightEnd = highlightEnd;
  }
  if (previousHighlightEnd < resultDocument.documentCodePoints.length) highlightResult.push(resultDocument.documentCodePoints.slice(previousHighlightEnd).join(''));
  return highlightResult;
};
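
// Worked example of the partial-highlight rule above (illustrative): a kana
// token with codePointLength 4 that spans 4 code points in the document but was
// prefix-matched by only 2 input code points is highlighted for
// max(1, round(4 * min(1, 2 / 4))) = 2 code points from its start; the floor of
// 1 keeps even a single-code-point match visible.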
@@ -0,0 +1,4 @@
export * from './trie';
export * from './inverted-index';
export * from './search';
export * from './highlight';
@@ -0,0 +1,59 @@
import { deserializeTrie } from './trie';
import type { TrieNode } from '../common';
import type { CompressedInvertedIndex, OffsetSpan, TokenDefinition } from '../common/types';

export interface TokenDocumentReference {
  documentId: number;
  offsets: OffsetSpan[];
}

interface TokenDefinitionExtended extends TokenDefinition {
  references: TokenDocumentReference[];
}

const mergeMap = <K, V>(...maps: Map<K, V>[]) => {
  const result = new Map<K, V>();
  for (const map of maps) for (const [key, value] of map.entries()) result.set(key, value);
  return result;
};

export interface LoadedInvertedIndex {
  documents: string[];
  documentCodePoints: string[][];
  tokenDefinitions: TokenDefinitionExtended[];
  tries: {
    romaji: TrieNode;
    kana: TrieNode;
    other: TrieNode;
  };
}

export const loadInvertedIndex = (compressed: CompressedInvertedIndex): LoadedInvertedIndex => {
  const documents = compressed.documents;
  const documentCodePoints = documents.map(document => [...document]);

  const romajiTrie = deserializeTrie(compressed.tries.romaji);
  const kanaTrie = deserializeTrie(compressed.tries.kana);
  const otherTrie = deserializeTrie(compressed.tries.other);

  const tokenCodePoints = mergeMap(romajiTrie.tokenCodePoints, kanaTrie.tokenCodePoints, otherTrie.tokenCodePoints);
  const tokenDefinitions = compressed.tokenTypes.map<TokenDefinitionExtended>((type, index) => ({
    id: index, type, text: tokenCodePoints.get(index)!.join(''),
    codePointLength: tokenCodePoints.get(index)!.length,
    references: compressed.tokenReferences[index]!.map<TokenDocumentReference>(([documentId, ...offsets]) => ({
      documentId: documentId!,
      offsets: Array.from({ length: offsets.length / 2 }, (_, i) => ({ start: offsets[i * 2]!, end: offsets[i * 2 + 1]! })),
    })),
  }));

  return {
    documents,
    documentCodePoints,
    tokenDefinitions,
    tries: {
      romaji: romajiTrie.root,
      kana: kanaTrie.root,
      other: otherTrie.root,
    },
  };
};
@@ -0,0 +1,258 @@
import { highlightSearchResult } from './highlight';
import { getTrieNodeTokenIds } from './trie';
import type { TrieNode } from '../common';
import { traverseTrieStep } from '../common';
import type { LoadedInvertedIndex } from './inverted-index';
import { normalizeByCodePoint, toKatakana } from '../common/normalize';
import { type OffsetSpan, type TokenDefinition, TokenType } from '../common/types';
import { getSpanLength } from '../common/utils';

const IGNORABLE_CODE_POINTS = /[\s\u3099\u309A]/u;

enum TokenTypePrefixMatchingPolicy {
  AlwaysAllow,
  NeverAllow,
  AllowOnlyAtInputEnd,
}
const tokenTypePrefixMatchingPolicy: Record<TokenType, TokenTypePrefixMatchingPolicy> = {
  [TokenType.Romaji]: TokenTypePrefixMatchingPolicy.NeverAllow,
  [TokenType.Kana]: TokenTypePrefixMatchingPolicy.AlwaysAllow,
  // These token types are in an "other" Trie
  [TokenType.Han]: TokenTypePrefixMatchingPolicy.AllowOnlyAtInputEnd, // No effect because always 1 code point
  [TokenType.Pinyin]: TokenTypePrefixMatchingPolicy.AllowOnlyAtInputEnd,
  [TokenType.Raw]: TokenTypePrefixMatchingPolicy.AllowOnlyAtInputEnd, // No effect because always 1 code point
};
const shouldAllowPrefixMatching = (tokenType: TokenType, isAtInputEnd: boolean) =>
  tokenTypePrefixMatchingPolicy[tokenType] === TokenTypePrefixMatchingPolicy.AlwaysAllow ||
  (tokenTypePrefixMatchingPolicy[tokenType] !== TokenTypePrefixMatchingPolicy.NeverAllow && isAtInputEnd);

export interface SearchResultToken {
  definition: TokenDefinition;
  documentOffset: OffsetSpan;
  inputOffset: OffsetSpan;
  isTokenPrefixMatching: boolean;
}

interface ComparableStateTraits<T> {
  getRangeCount: (state: T) => number;
  getPrefixMatchCount: (state: T) => number;
  getFirstTokenDocumentOffset: (state: T) => OffsetSpan;
  getLastTokenDocumentOffset: (state: T) => OffsetSpan;
  getLastToken?: (state: T) => SearchResultToken; // Not on intermediate results
  getMatchRatioLevel?: (state: T) => number; // Not on intermediate/candidate results
  getMatchRatio: (state: T) => number;
  // Called when all other comparisons are equal
  nextComparer?: (a: T, b: T) => number; // Not on intermediate/candidate results
}

const getComparerForTraits = <T>(traits: ComparableStateTraits<T>) => (a: T, b: T) => {
  // Prefer matches that do not rely on end-of-input loose matching (full match over prefix match)
  if (traits.getLastToken) {
    const aLastToken = traits.getLastToken(a), bLastToken = traits.getLastToken(b);
    const aDidPrefixMatchByTokenType = aLastToken.isTokenPrefixMatching && tokenTypePrefixMatchingPolicy[aLastToken.definition.type] === TokenTypePrefixMatchingPolicy.AllowOnlyAtInputEnd;
    const bDidPrefixMatchByTokenType = bLastToken.isTokenPrefixMatching && tokenTypePrefixMatchingPolicy[bLastToken.definition.type] === TokenTypePrefixMatchingPolicy.AllowOnlyAtInputEnd;
    if (aDidPrefixMatchByTokenType !== bDidPrefixMatchByTokenType) return aDidPrefixMatchByTokenType ? 1 : -1;
  }

  // Prefer results that matched fewer discontinuous ranges
  const aRangeCount = traits.getRangeCount(a), bRangeCount = traits.getRangeCount(b);
  if (aRangeCount !== bRangeCount) return aRangeCount - bRangeCount;

  // Prefer results whose first token occurs earlier in the document
  const aFirstTokenDocumentOffset = traits.getFirstTokenDocumentOffset(a), bFirstTokenDocumentOffset = traits.getFirstTokenDocumentOffset(b);
  if (aFirstTokenDocumentOffset.start !== bFirstTokenDocumentOffset.start) return aFirstTokenDocumentOffset.start - bFirstTokenDocumentOffset.start;

  // Prefer results with a higher match ratio (similar ratios are not distinguished here; that is what `matchRatioLevel` is for)
  if (traits.getMatchRatioLevel) {
    const aMatchRatioLevel = traits.getMatchRatioLevel(a), bMatchRatioLevel = traits.getMatchRatioLevel(b);
    if (aMatchRatioLevel !== bMatchRatioLevel) return bMatchRatioLevel - aMatchRatioLevel;
  }

  // Prefer results whose last token occurs earlier (and, if tied, ends earlier) in the document
  const aLastTokenDocumentOffset = traits.getLastTokenDocumentOffset(a), bLastTokenDocumentOffset = traits.getLastTokenDocumentOffset(b);
  if (aLastTokenDocumentOffset.start !== bLastTokenDocumentOffset.start) return aLastTokenDocumentOffset.start - bLastTokenDocumentOffset.start;
  if (aLastTokenDocumentOffset.end !== bLastTokenDocumentOffset.end) return aLastTokenDocumentOffset.end - bLastTokenDocumentOffset.end;

  // Prefer results with a higher match ratio (compared precisely this time)
  const aMatchRatio = traits.getMatchRatio(a), bMatchRatio = traits.getMatchRatio(b);
  if (aMatchRatio !== bMatchRatio) return bMatchRatio - aMatchRatio;

  return traits.nextComparer?.(a, b) ?? 0;
};

interface IntermediateResult {
  previousState?: IntermediateResult;
  firstTokenDocumentOffset: OffsetSpan;
  rangeCount: number;
  tokenCount: number;
  prefixMatchCount: number;
  matchedTokenLength: number;
  tokenId: number;
  documentOffset: OffsetSpan;
  inputOffset: OffsetSpan;
  isTokenPrefixMatching: boolean;
}
const compareIntermediateResult = getComparerForTraits<IntermediateResult>({
  getRangeCount: state => state.rangeCount,
  getPrefixMatchCount: state => state.prefixMatchCount,
  getFirstTokenDocumentOffset: state => state.firstTokenDocumentOffset,
  getLastTokenDocumentOffset: state => state.documentOffset,
  getMatchRatio: state => state.matchedTokenLength, // No need to divide by document length since intermediate results are for the same document
});

interface CandidateResult {
  tokens: SearchResultToken[];
  prefixMatchCount: number;
  matchedTokenLength: number;
  rangeCount: number;
}
const compareCandidateResult = getComparerForTraits<CandidateResult>({
  getRangeCount: state => state.rangeCount,
  getPrefixMatchCount: state => state.prefixMatchCount,
  getFirstTokenDocumentOffset: state => state.tokens[0]!.documentOffset,
  getLastTokenDocumentOffset: state => state.tokens[state.tokens.length - 1]!.documentOffset,
  getLastToken: state => state.tokens[state.tokens.length - 1]!,
  getMatchRatio: state => state.matchedTokenLength, // No need to divide by document length since candidate results are for the same document
});

export interface SearchResult {
  documentId: number;
  documentText: string;
  documentCodePoints: string[];
  tokens: SearchResultToken[];
  prefixMatchCount: number;
  rangeCount: number;
  matchRatio: number;
  matchRatioLevel: number;
}
const compareFinalResult = getComparerForTraits<SearchResult>({
  getRangeCount: state => state.rangeCount,
  getPrefixMatchCount: state => state.prefixMatchCount,
  getFirstTokenDocumentOffset: state => state.tokens[0]!.documentOffset,
  getLastTokenDocumentOffset: state => state.tokens[state.tokens.length - 1]!.documentOffset,
  getLastToken: state => state.tokens[state.tokens.length - 1]!,
  getMatchRatio: state => state.matchRatio,
  getMatchRatioLevel: state => Math.round(state.matchRatio * 5),
  nextComparer: (a, b) => a.documentText === b.documentText ? 0 : a.documentText < b.documentText ? -1 : 1,
});
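
// Example of the level bucketing above (illustrative): matchRatio is rounded
// into six buckets (0..5), so ratios 0.78 and 0.81 both map to level 4 and
// compare as equal here; such ties fall through to the later, more precise
// matchRatio comparison.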
|
||||||
|
const hasNonEmptyCharacters = (documentCodePoints: string[], start: number, end: number) => start !== end && !documentCodePoints.slice(start, end).every(char => /\s/.test(char));
|
||||||
|
|
||||||
|
export const searchInvertedIndex = (invertedIndex: LoadedInvertedIndex, text: string): SearchResult[] => {
|
||||||
|
const { documents, documentCodePoints, tokenDefinitions, tries } = invertedIndex;
|
||||||
|
|
||||||
|
const codePoints = [...toKatakana(normalizeByCodePoint(text))];
|
||||||
|
// dp[i] = docId => end => IntermediateResult, starts from dp[-1] (l === 0), ends at dp[N - 1] (r === N - 1)
|
||||||
|
const dp = Array.from({ length: codePoints.length }, () => new Map<number, Record<number, IntermediateResult>>());
  for (let l = 0; l < codePoints.length; l++) {
    if (l !== 0 && dp[l - 1]!.size === 0) continue; // No document matches the input from the beginning up to this position
    let romajiNode: TrieNode | undefined = tries.romaji;
    let kanaNode: TrieNode | undefined = tries.kana;
    let otherNode: TrieNode | undefined = tries.other;
    for (let r = l; r < codePoints.length && (romajiNode || kanaNode || otherNode); r++) { // [l, r]
      const codePoint = codePoints[r]!;
      romajiNode = traverseTrieStep(romajiNode, codePoint, IGNORABLE_CODE_POINTS);
      kanaNode = traverseTrieStep(kanaNode, codePoint, IGNORABLE_CODE_POINTS);
      otherNode = traverseTrieStep(otherNode, codePoint, IGNORABLE_CODE_POINTS);
      const reachingInputEnd = r === codePoints.length - 1;
      const matchingTokenIds = new Set([
        // Allow prefix matching of tokens if we're at the end of the input
        ...getTrieNodeTokenIds(romajiNode, shouldAllowPrefixMatching(TokenType.Romaji, reachingInputEnd)),
        ...getTrieNodeTokenIds(kanaNode, shouldAllowPrefixMatching(TokenType.Kana, reachingInputEnd)),
        ...getTrieNodeTokenIds(otherNode, reachingInputEnd),
      ]);
      for (const tokenId of matchingTokenIds) for (const { documentId, offsets } of tokenDefinitions[tokenId]!.references) {
        const isTokenPrefixMatching = !romajiNode?.tokenIds.includes(tokenId) && !kanaNode?.tokenIds.includes(tokenId) && !otherNode?.tokenIds.includes(tokenId);
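        // The token counts as a prefix match when it was found only via subtree token
        // IDs, i.e. the consumed input [l, r] is a strict prefix of the token's text.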
        const previousMatchesOfDocument = dp[l - 1]?.get(documentId);
        if (l !== 0 && !previousMatchesOfDocument) continue;
        for (const documentOffset of offsets) {
          const { start: currentStart, end: currentEnd } = documentOffset;
          const contributeNextMatchingState = (previousState: IntermediateResult | undefined) => {
            const nextMatchingMap = dp[r]!;
            let nextMatchesOfDocument = nextMatchingMap.get(documentId);
            if (!nextMatchesOfDocument) {
              nextMatchesOfDocument = Object.create(null) as Record<number, IntermediateResult>;
              nextMatchingMap.set(documentId, nextMatchesOfDocument);
            }
            const oldResult = nextMatchesOfDocument[currentEnd];
            const inputOffset = { start: l, end: r + 1 };
            const newResult: IntermediateResult = {
              previousState,
              firstTokenDocumentOffset: previousState?.firstTokenDocumentOffset ?? documentOffset,
              rangeCount: !previousState ? 1
                : (previousState.rangeCount + (hasNonEmptyCharacters(documentCodePoints[documentId]!, previousState.documentOffset.end, currentStart) ? 1 : 0)),
              tokenCount: (previousState?.tokenCount ?? 0) + 1,
              prefixMatchCount: (previousState?.prefixMatchCount ?? 0) + (isTokenPrefixMatching ? 1 : 0),
              matchedTokenLength: (previousState?.matchedTokenLength ?? 0) + getSpanLength(documentOffset) *
                Math.min(isTokenPrefixMatching ? getSpanLength(inputOffset) / tokenDefinitions[tokenId]!.codePointLength : Infinity, 1),
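              // Example (hypothetical): a 3-code-point input prefix-matching a
              // 5-code-point token over a 5-code-point document span contributes
              // 5 * min(3 / 5, 1) = 3; an exact match contributes the full span length.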
              tokenId,
              documentOffset,
              inputOffset,
              isTokenPrefixMatching,
            };
            nextMatchesOfDocument[currentEnd] = !oldResult || compareIntermediateResult(newResult, oldResult) < 0 ? newResult : oldResult;
          };
          if (l === 0) contributeNextMatchingState(undefined);
          else for (const previousEnd in previousMatchesOfDocument) if (currentStart >= Number(previousEnd))
            contributeNextMatchingState(previousMatchesOfDocument[previousEnd as unknown as number]!);
          // Don't `break` here because the keys of `previousMatchesOfDocument` are not guaranteed to be ordered
        }
      }
    }
  }

  // Build search results and sort documents
  return [...dp[codePoints.length - 1]!.entries()].map<SearchResult>(([documentId, matches]) => {
    const sortedMatches = Object.values(matches).map<CandidateResult>(match => {
      const tokens: SearchResultToken[] = [];
      // Build the token list by backtracking through previous states
      let state: IntermediateResult | undefined = match;
      while (state) {
        tokens.unshift({
          definition: tokenDefinitions[state.tokenId]!,
          documentOffset: state.documentOffset, inputOffset: state.inputOffset,
          isTokenPrefixMatching: state.isTokenPrefixMatching,
        });
        state = state.previousState;
      }
      return { tokens, prefixMatchCount: match.prefixMatchCount, matchedTokenLength: match.matchedTokenLength, rangeCount: match.rangeCount };
    }).sort(compareCandidateResult);
    const bestMatchOfDocument = sortedMatches[0]!;
    const documentText = documents[documentId]!;
    const matchRatio = bestMatchOfDocument.matchedTokenLength / documentCodePoints[documentId]!.length;
    const matchRatioLevel = Math.round(matchRatio * 5);
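    // Quantizes the ratio into coarse levels 0..5 (e.g. matchRatio 0.43 rounds to
    // level 2), presumably so that near-equal ratios compare as ties on this trait.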
    return {
      documentId,
      documentText,
      documentCodePoints: documentCodePoints[documentId]!,
      tokens: bestMatchOfDocument.tokens,
      prefixMatchCount: bestMatchOfDocument.prefixMatchCount,
      rangeCount: bestMatchOfDocument.rangeCount,
      matchRatio,
      matchRatioLevel,
    };
  }).sort(compareFinalResult);
};
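
// Usage sketch (hypothetical; assumes an index already loaded elsewhere in this package):
//   const results = searchInvertedIndex(loadedIndex, 'toukyou');
//   for (const result of results) console.log(result.documentText, result.matchRatio);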

// For debugging
export const inspectSearchResult = (resultDocument: SearchResult, htmlHighlight: boolean) => {
  const { documentText, tokens, rangeCount, matchRatio, matchRatioLevel } = resultDocument;
  const escapeHtml = (s: string) => s.replaceAll('&', '&amp;').replaceAll('<', '&lt;').replaceAll('>', '&gt;');
  const escapedText = htmlHighlight ? highlightSearchResult(resultDocument).map(part =>
    typeof part === 'string' ? escapeHtml(part) : `<u><b>${escapeHtml(part.highlight)}</b></u>`).join('') : JSON.stringify(documentText);
  const description = ` (${rangeCount} ranges, ${Math.round(matchRatio * 10000) / 10000} => L${matchRatioLevel})`;
  return [
    escapedText + (htmlHighlight ? `<code>${description}</code>` : description),
    ...tokens.map(token => {
      let escapedTokenText = JSON.stringify(token.definition.text);
      let escapedDocumentText = JSON.stringify([...documentText].slice(token.documentOffset.start, token.documentOffset.end).join(''));
      if (htmlHighlight) {
        escapedTokenText = escapeHtml(escapedTokenText);
        escapedDocumentText = escapeHtml(escapedDocumentText);
      }
      const line = ` ${TokenType[token.definition.type]}: ${escapedTokenText} -> ${escapedDocumentText}${token.isTokenPrefixMatching ? ' (prefix match)' : ''}`;
      return htmlHighlight ? `<code>${line}</code>` : line;
    }),
    '',
  ].join('\n');
};

@@ -0,0 +1,58 @@
import type { TrieNode } from '../common';

export const deserializeTrie = (data: number[]) => {
  const nodes: TrieNode[] = [];
  const getNode = (id: number) => nodes[id - 1] ??= { parent: undefined, children: new Map(), tokenIds: [], subTreeTokenIds: [] };
  let currentId = 0;
  for (let i = 0; i < data.length;) {
    const node = getNode(++currentId);
    const parentId = data[i++]!;
    node.parent = parentId !== 0 ? getNode(parentId) : undefined;

    let endOfChildren = i;
    while (endOfChildren < data.length && data[endOfChildren]! > 0) endOfChildren++;
    const numberOfChildren = (endOfChildren - i) / 2;
    for (let j = i; j < i + numberOfChildren; j++) {
      const codePoint = data[j]!;
      const child = getNode(data[j + numberOfChildren]!);
      node.children.set(codePoint, child);
    }
    i = endOfChildren;

    if (data[i] === 0) i++; // No token IDs
    else while (i < data.length && data[i]! < 0) node.tokenIds.push(-data[i++]! - 1);
  }
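  // Serialized layout per node, as parsed above: parent ID (0 for the root), then the
  // child code points followed by the corresponding child node IDs, then token IDs
  // encoded as -(id + 1), or a single 0 when the node holds none. Example
  // (hypothetical): [0, 97, 2, 0, 1, -1] decodes to a root whose child under
  // code point 97 ('a') holds token 0.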
  const root = nodes[0]!;

  // DFS to construct code point paths for each token
  const tokenCodePoints = new Map<number, string[]>();
  const currentCodePoints: string[] = [];
  const dfsCodePoints = (node: TrieNode) => {
    for (const tokenId of node.tokenIds) tokenCodePoints.set(tokenId, [...currentCodePoints]);
    for (const [codePoint, child] of node.children.entries()) {
      if (child.parent !== node) continue; // Skip grafted paths, as these are not the canonical representation of the tokens
      currentCodePoints.push(String.fromCodePoint(codePoint));
      dfsCodePoints(child);
      currentCodePoints.pop();
    }
  };
  dfsCodePoints(root);

  // DFS to construct subTreeTokenIds for each node
  const visitedNodes = new Set<TrieNode>();
  const dfsSubTreeTokenIds = (node: TrieNode) => {
    if (visitedNodes.has(node)) return node.subTreeTokenIds;
    visitedNodes.add(node);
    node.subTreeTokenIds = [...node.tokenIds, ...new Set([...node.children.values()].flatMap(child => dfsSubTreeTokenIds(child)))];
    return node.subTreeTokenIds;
  };
  dfsSubTreeTokenIds(root);

  return {
    root,
    tokenCodePoints,
  };
};

export const getTrieNodeTokenIds = (node: TrieNode | undefined, includeSubTree: boolean) =>
  (includeSubTree ? node?.subTreeTokenIds : node?.tokenIds) ?? [];
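
// `tokenIds` holds exact matches at this node; `includeSubTree` additionally returns
// every token below it, which is what enables prefix matching during search. Example
// (hypothetical): at the node reached via "to", the subtree IDs would include tokens
// like "tokyo" and "tottori", while `tokenIds` would hold only "to" itself.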

@@ -0,0 +1,23 @@
{
  "compilerOptions": {
    "target": "ESNext",
    "jsx": "preserve",
    "lib": ["DOM", "DOM.Iterable", "ESNext", "WebWorker"],
    "module": "ESNext",
    "moduleResolution": "Bundler",
    "noUncheckedIndexedAccess": true,
    "resolveJsonModule": true,
    "allowJs": true,
    "strict": true,
    "strictNullChecks": true,
    "noEmit": true,
    "esModuleInterop": true,
    "forceConsistentCasingInFileNames": true,
    "isolatedModules": true,
    "skipLibCheck": true,
    "rootDir": ".",
    "outDir": "dist"
  },
  "include": ["src/**/*.ts"],
  "exclude": ["dist", "node_modules"]
}

@@ -0,0 +1,15 @@
import { defineConfig } from 'tsdown';

export default defineConfig({
  entry: [
    './src/index.ts',
    './src/searcher/index.ts',
    './src/indexer/index.ts',
    './src/common/index.ts',
  ],
  dts: true,
  unused: true,
  fixedExtension: true,
  unbundle: true,
  sourcemap: true,
});

Generated file (+6817 lines): diff suppressed because it is too large.

@@ -0,0 +1,9 @@
packages:
  - packages/*
  - apps/*

nodeLinker: hoisted

onlyBuiltDependencies:
  - '@swc/core'
  - unrs-resolver