Spaces:

nakamura196
/

genji_predict

Sleeping

App Files Files Community

nakamura196 committed 23 days ago

Commit

0217f42

1 Parent(s): 3f6ca1a

feat: add nbdev

Browse files

Files changed (20) hide show

.github/workflows/deploy.yaml +14 -0
.github/workflows/test.yaml +7 -0
.gitignore +151 -1
LICENSE +201 -0
MANIFEST.in +5 -0
README copy.md +12 -0
app.py +7 -80
genji_predict/__init__.py +1 -0
genji_predict/_modidx.py +16 -0
genji_predict/core.py +108 -0
nbs/00_core.ipynb +167 -0
nbs/_quarto.yml +22 -0
nbs/index.ipynb +180 -0
nbs/nbdev.yml +9 -0
nbs/styles.css +37 -0
pyproject.toml +3 -0
requirements.txt +3 -1
settings.ini +45 -0
setup.py +64 -0
src/02_demo.ipynb +0 -0

.github/workflows/deploy.yaml ADDED Viewed

	@@ -0,0 +1,14 @@

+name: Deploy to GitHub Pages
+permissions:
+  contents: write
+  pages: write
+on:
+  push:
+    branches: [ "main", "master" ]
+  workflow_dispatch:
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+    steps: [uses: fastai/workflows/quarto-ghp@master]

.github/workflows/test.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+name: CI
+on:  [workflow_dispatch, pull_request, push]
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps: [uses: fastai/workflows/nbdev-ci@master]

.gitignore CHANGED Viewed

@@ -1,2 +1,152 @@
 .venv
-__pycache__

+_docs/
+_proc/
+*.bak
+.gitattributes
+.last_checked
+.gitconfig
+*.bak
+*.log
+*~
+~*
+_tmp*
+tmp*
+tags
+*.pkg
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+env/
+build/
+conda/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# pyenv
+.python-version
+# celery beat schedule file
+celerybeat-schedule
+# SageMath parsed files
+*.sage.py
+# dotenv
+.env
+# virtualenv
 .venv
+venv/
+ENV/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.vscode
+*.swp
+# osx generated files
+.DS_Store
+.DS_Store?
+.Trashes
+ehthumbs.db
+Thumbs.db
+.idea
+# pytest
+.pytest_cache
+# tools/trust-doc-nbs
+docs_src/.last_checked
+# symlinks to fastai
+docs_src/fastai
+tools/fastai
+# link checker
+checklink/cookies.txt
+# .gitconfig is now autogenerated
+.gitconfig
+# Quarto installer
+.deb
+.pkg
+# Quarto
+.quarto

LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright 2022, fastai
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

MANIFEST.in ADDED Viewed

	@@ -0,0 +1,5 @@

+include settings.ini
+include LICENSE
+include CONTRIBUTING.md
+include README.md
+recursive-exclude * __pycache__

README copy.md ADDED Viewed

	@@ -0,0 +1,12 @@

+---
+title: Genji Predict
+emoji: 👁
+colorFrom: purple
+colorTo: indigo
+sdk: gradio
+sdk_version: 4.44.1
+app_file: app.py
+pinned: false
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -4,97 +4,24 @@
 類似テキスト検索を行うWebインターフェースを提供します。
 """
-import json
-import xml.etree.ElementTree as ET
 import gradio as gr
-from Levenshtein import ratio
-DATA_PATH = "./data.json"
-with open(DATA_PATH, "r", encoding="utf-8") as f:
-    documents_data = json.load(f)
-def predict(query, selected_vols, top_n=5):
-    """テキストの類似度を計算し、上位の結果を返す
-    Args:
-        query (str): 検索クエリテキスト
-        selected_vols (list): 検索対象の巻のリスト
-        top_n (int, optional): 返す結果の数. デフォルトは5
-    Returns:
-        list: スコア順にソートされた上位n件の検索結果
-    """
-    results = []
-    for doc in documents_data:
-        # 選択された巻のみを検索対象とする
-        if not selected_vols or str(doc["vol"]) in selected_vols:
-            score = ratio(query, doc["text"])
-            results.append({
-                "vol": doc["vol"],
-                "page": doc["page"],
-                "score": score,
-                "text": doc["text"]
-            })
-    results.sort(key=lambda x: x["score"], reverse=True)
-    top_results = results[:top_n]  # top_nで指定された件数だけを取得
-    return top_results
-def extract_text_from_lines(element):
-    """本文タイプの要素からテキストを抽出する"""
-    lines = element.findall(".//*[@type='本文']")
-    return ''.join(line.text for line in lines)
-def format_prediction_result(result):
-    """予測結果を 'vol-page' 形式にフォーマットする"""
-    first_result = result[0]
-    return f'{first_result["vol"]}-{first_result["page"]}'
-def search_similar_texts(query, selected_vols, top_n=5, xml_file=None):
-    """テキストの類似検索を実行する関数
-    Args:
-        query (str): 検索クエリテキスト
-        selected_vols (list): 検索対象の巻のリスト
-        top_n (int, optional): 返す結果の数. デフォルトは5
-        xml_file (gradio.File, optional): 比較対象のXMLファイル
-    Returns:
-        list: 検索結果のリスト。XMLファイル処理時は[predict_results]、
-             通常検索時は[top_results]を返す
-    """
     if xml_file is not None:
-        try:
-            # Gradioのファイルオブジェクトから名前を取得して直接ファイルを開く
-            xml_content = xml_file.name
-            tree = ET.parse(xml_content)
-            root = tree.getroot()
-            # ページ要素の取得
-            elements = root.findall(".//*[@type='page']")
-            # 予測実行
-            predict_results = {}
-            for i, element in enumerate(elements, 1):  # enumerate(elements, 1)で1から開始
-                text = extract_text_from_lines(element)
-                top_results = predict(text, selected_vols, 1)
-                predict_results[str(i)] = format_prediction_result(top_results)
-            return [predict_results]
-        except (ET.ParseError, FileNotFoundError, PermissionError) as e:
-            print(f"XMLファイルの処理中にエラーが発生しました: {str(e)}")
-            return [[], {}]
-    top_results = predict(query, selected_vols, top_n)
-    return [top_results] # , vol_percentages
 # Gradioインターフェースの作成
 demo = gr.Interface(

 類似テキスト検索を行うWebインターフェースを提供します。
 """
 import gradio as gr
+from genji_predict.core import ApiClient
+DATA_PATH = "./data.json"
+client = ApiClient(DATA_PATH)
def search_similar_texts(query, selected_vols, top_n, xml_file):
    """Gradio callback that forwards a search request to the shared ``client``.

    Args:
        query: Query text, passed through unchanged.
        selected_vols: Volume selection, passed through unchanged.
        top_n: Requested number of results, forwarded as ``top_n``.
        xml_file: Uploaded file object or ``None``. Only its ``name``
            attribute is read and forwarded as the XML file path.

    Returns:
        Whatever ``client.search_similar_texts`` returns.
    """
    # Reduce the upload object to a plain path; None means "no XML supplied".
    xml_file_path = xml_file.name if xml_file is not None else None
    return client.search_similar_texts(
        query,
        selected_vols,
        top_n=top_n,
        xml_file_path=xml_file_path,
    )
 # Gradioインターフェースの作成
 demo = gr.Interface(

genji_predict/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


+__version__ = "0.0.1"

genji_predict/_modidx.py ADDED Viewed

	@@ -0,0 +1,16 @@

+# Autogenerated by nbdev
+d = { 'settings': { 'branch': 'main',
+                'doc_baseurl': '/genji_predict',
+                'doc_host': 'https://nakamura196.github.io',
+                'git_url': 'https://github.com/nakamura196/genji_predict',
+                'lib_path': 'genji_predict'},
+  'syms': { 'genji_predict.core': { 'genji_predict.core.ApiClient': ('core.html#apiclient', 'genji_predict/core.py'),
+                                    'genji_predict.core.ApiClient.__init__': ('core.html#apiclient.__init__', 'genji_predict/core.py'),
+                                    'genji_predict.core.ApiClient.extract_text_from_lines': ( 'core.html#apiclient.extract_text_from_lines',
+                                                                                              'genji_predict/core.py'),
+                                    'genji_predict.core.ApiClient.format_prediction_result': ( 'core.html#apiclient.format_prediction_result',
+                                                                                               'genji_predict/core.py'),
+                                    'genji_predict.core.ApiClient.predict': ('core.html#apiclient.predict', 'genji_predict/core.py'),
+                                    'genji_predict.core.ApiClient.search_similar_texts': ( 'core.html#apiclient.search_similar_texts',
+                                                                                           'genji_predict/core.py')}}}

genji_predict/core.py ADDED Viewed

	@@ -0,0 +1,108 @@

+"""Fill in a module description here"""
+# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/00_core.ipynb.
+# %% auto 0
+__all__ = ['ApiClient']
+# %% ../nbs/00_core.ipynb 3
+import gradio as gr
+from Levenshtein import ratio
+import json
+import xml.etree.ElementTree as ET
+from tqdm import tqdm
+# %% ../nbs/00_core.ipynb 4
class ApiClient:
    """Levenshtein-ratio similarity search over a JSON corpus of page texts.

    The corpus is loaded once at construction time. Every record is accessed
    through its ``vol``, ``page`` and ``text`` keys.
    """

    def __init__(self, data_path: str):
        """Load the corpus.

        Args:
            data_path (str): Path to a UTF-8 encoded JSON file. Its parsed
                content is stored as ``self.documents_data`` and iterated by
                :meth:`predict`.
        """
        with open(data_path, "r", encoding="utf-8") as f:
            self.documents_data = json.load(f)

    def extract_text_from_lines(self, element):
        """Concatenate the text of every descendant whose ``type`` is ``本文``.

        Args:
            element: An ``xml.etree.ElementTree.Element`` to search under.

        Returns:
            str: The joined text. Matched elements without text (``.text`` is
            ``None``, e.g. an empty tag) contribute an empty string instead of
            raising ``TypeError``.
        """
        lines = element.findall(".//*[@type='本文']")
        return ''.join(line.text or '' for line in lines)

    def format_prediction_result(self, result):
        """Format the best hit of a prediction as ``'vol-page'``.

        Args:
            result (list): Hits as returned by :meth:`predict`, best first.

        Returns:
            str: ``'<vol>-<page>'`` of the first hit, or ``''`` when there is
            no hit (for example when the volume filter excluded every record).
        """
        if not result:
            return ''
        first_result = result[0]
        return f'{first_result["vol"]}-{first_result["page"]}'

    def search_similar_texts(self, query, selected_vols, top_n=5, xml_file_path=None):
        """Search by a query string, or by every page of an XML file.

        Args:
            query (str): Query text. Not used when ``xml_file_path`` is given.
            selected_vols (list): Volumes to search; an empty value searches
                all volumes (see :meth:`predict`).
            top_n (int, optional): Number of hits for the plain-query search.
                Defaults to 5.
            xml_file_path (str, optional): Path of an XML file. When given,
                each element with ``type='page'`` is matched against the
                corpus and only its single best hit is kept.

        Returns:
            list: ``[predict_results]`` for an XML file, where
            ``predict_results`` maps the 1-based page position (as ``str``) to
            a ``'vol-page'`` string; ``[top_results]`` for a plain query;
            ``[[], {}]`` when the XML file cannot be opened or parsed.
        """
        if xml_file_path is None:
            return [self.predict(query, selected_vols, top_n)]

        try:
            # Parse straight from the file rather than from pre-decoded text,
            # so the encoding named in the XML declaration is honoured.
            root = ET.parse(xml_file_path).getroot()
        except (ET.ParseError, FileNotFoundError, PermissionError) as e:
            print(f"XMLファイルの処理中にエラーが発生しました: {str(e)}")
            # NOTE(review): this shape differs from the success path
            # (two items instead of one); kept unchanged for existing callers.
            return [[], {}]

        # Collect the page elements.
        elements = root.findall(".//*[@type='page']")

        # Predict the best-matching corpus page for each XML page.
        predict_results = {}
        # enumerate() has no length, so tell tqdm the total explicitly.
        for i, element in tqdm(enumerate(elements, 1), total=len(elements)):
            text = self.extract_text_from_lines(element)
            top_results = self.predict(text, selected_vols, 1)
            predict_results[str(i)] = self.format_prediction_result(top_results)
        return [predict_results]

    def predict(self, query, selected_vols, top_n=5):
        """Score ``query`` against the corpus and return the best hits.

        Args:
            query (str): Query text.
            selected_vols (list): Volumes to search, compared with
                ``str(doc["vol"])``. An empty value searches every record.
            top_n (int, optional): Maximum number of hits. Integral floats
                (as UI widgets may deliver) are accepted; ``None`` returns
                every hit. Defaults to 5.

        Returns:
            list: Dicts with ``vol``, ``page``, ``score`` and ``text`` keys,
            sorted by descending ``score``.
        """
        results = []
        for doc in self.documents_data:
            # Only search the selected volumes.
            if not selected_vols or str(doc["vol"]) in selected_vols:
                score = ratio(query, doc["text"])
                results.append({
                    "vol": doc["vol"],
                    "page": doc["page"],
                    "score": score,
                    "text": doc["text"]
                })
        results.sort(key=lambda x: x["score"], reverse=True)
        # Slice bounds must be integers; a float top_n would raise TypeError.
        limit = None if top_n is None else int(top_n)
        return results[:limit]

nbs/00_core.ipynb ADDED Viewed

	@@ -0,0 +1,167 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# core\n",
+    "\n",
+    "> Fill in a module description here"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| default_exp core"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| hide\n",
+    "from nbdev.showdoc import *"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| export\n",
+    "import gradio as gr\n",
+    "from Levenshtein import ratio\n",
+    "import json\n",
+    "import xml.etree.ElementTree as ET\n",
+    "from tqdm import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| export\n",
+    "class ApiClient:\n",
+    "\n",
+    "\n",
+    "    def __init__(self, data_path: str):\n",
+    "        DATA_PATH = data_path\n",
+    "\n",
+    "        with open(DATA_PATH, \"r\", encoding=\"utf-8\") as f:\n",
+    "            documents_data = json.load(f)\n",
+    "\n",
+    "        self.documents_data = documents_data\n",
+    "\n",
+    "    def extract_text_from_lines(self, element):\n",
+    "        \"\"\"本文タイプの要素からテキストを抽出する\"\"\"\n",
+    "        lines = element.findall(\".//*[@type='本文']\")\n",
+    "        return ''.join(line.text for line in lines)\n",
+    "\n",
+    "    def format_prediction_result(self, result):\n",
+    "        \"\"\"予測結果を 'vol-page' 形式にフォーマットする\"\"\"\n",
+    "        first_result = result[0]\n",
+    "        return f'{first_result[\"vol\"]}-{first_result[\"page\"]}'\n",
+    "\n",
+    "\n",
+    "    def search_similar_texts(self, query, selected_vols, top_n=5, xml_file_path=None):\n",
+    "        \"\"\"テキストの類似検索を実行する関数\n",
+    "\n",
+    "        Args:\n",
+    "            query (str): 検索クエリテキスト\n",
+    "            selected_vols (list): 検索対象の巻のリスト\n",
+    "            top_n (int, optional): 返す結果の数. デフォルトは5\n",
+    "            xml_file (gradio.File, optional): 比較対象のXMLファイル\n",
+    "\n",
+    "        Returns:\n",
+    "            list: 検索結果のリスト。XMLファイル処理時は[predict_results]、\n",
+    "                通常検索時は[top_results]を返す\n",
+    "        \"\"\"\n",
+    "        if xml_file_path is not None:\n",
+    "            \n",
+    "            try:\n",
+    "                with open(xml_file_path, \"r\", encoding=\"utf-8\") as f:\n",
+    "                    xml_str = f.read()\n",
+    "                    \n",
+    "                root = ET.fromstring(xml_str)\n",
+    "                \n",
+    "                # ページ要素の取得\n",
+    "                elements = root.findall(\".//*[@type='page']\")\n",
+    "\n",
+    "                # 予測実行\n",
+    "                predict_results = {}\n",
+    "                for i, element in tqdm(enumerate(elements, 1)):\n",
+    "                    text = self.extract_text_from_lines(element)\n",
+    "                    top_results = self.predict(text, selected_vols, 1)\n",
+    "                    predict_results[str(i)] = self.format_prediction_result(top_results)\n",
+    "\n",
+    "                return [predict_results]\n",
+    "        \n",
+    "            except (ET.ParseError, FileNotFoundError, PermissionError) as e:\n",
+    "                print(f\"XMLファイルの処理中にエラーが発生しました: {str(e)}\")\n",
+    "                return [[], {}]\n",
+    "        \n",
+    "\n",
+    "        top_results = self.predict(query, selected_vols, top_n)\n",
+    "        \n",
+    "        return [top_results] # , vol_percentages\n",
+    "        \n",
+    "\n",
+    "    def predict(self, query, selected_vols, top_n=5):\n",
+    "        \"\"\"テキストの類似度を計算し、上位の結果を返す\n",
+    "\n",
+    "        Args:\n",
+    "            query (str): 検索クエリテキスト\n",
+    "            selected_vols (list): 検索対象の巻のリスト\n",
+    "            top_n (int, optional): 返す結果の数. デフォルトは5\n",
+    "\n",
+    "        Returns:\n",
+    "            list: スコア順にソートされた上位n件の検索結果\n",
+    "        \"\"\"\n",
+    "        results = []\n",
+    "        \n",
+    "        for doc in self.documents_data:\n",
+    "            # 選択された巻のみを検索対象とする\n",
+    "            if not selected_vols or str(doc[\"vol\"]) in selected_vols:\n",
+    "                score = ratio(query, doc[\"text\"])\n",
+    "                results.append({\n",
+    "                    \"vol\": doc[\"vol\"],\n",
+    "                    \"page\": doc[\"page\"],\n",
+    "                    \"score\": score,\n",
+    "                    \"text\": doc[\"text\"]\n",
+    "                })\n",
+    "\n",
+    "        results.sort(key=lambda x: x[\"score\"], reverse=True)\n",
+    "        top_results = results[:top_n]  # top_nで指定された件数だけを取得\n",
+    "\n",
+    "        return top_results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| hide\n",
+    "import nbdev; nbdev.nbdev_export()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "python3",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

nbs/_quarto.yml ADDED Viewed

	@@ -0,0 +1,22 @@

+project:
+  type: website
+format:
+  html:
+    theme: cosmo
+    css: styles.css
+    toc: true
+    keep-md: true
+  commonmark: default
+website:
+  twitter-card: true
+  open-graph: true
+  repo-actions: [issue]
+  navbar:
+    background: primary
+    search: true
+  sidebar:
+    style: floating
+metadata-files: [nbdev.yml, sidebar.yml]

nbs/index.ipynb ADDED Viewed

	@@ -0,0 +1,180 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| hide\n",
+    "from genji_predict.core import *"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# genji_predict\n",
+    "\n",
+    "> "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This file will become your README and also the index of your documentation."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Developer Guide"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If you are new to using `nbdev` here are some useful pointers to get you started."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Install genji_predict in Development mode"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "```sh\n",
+    "# make sure genji_predict package is installed in development mode\n",
+    "$ pip install -e .\n",
+    "\n",
+    "# make changes under nbs/ directory\n",
+    "# ...\n",
+    "\n",
+    "# compile to have changes apply to genji_predict\n",
+    "$ nbdev_prepare\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Usage"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Installation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Install latest from the GitHub [repository][repo]:\n",
+    "\n",
+    "```sh\n",
+    "$ pip install git+https://github.com/nakamura196/genji_predict.git\n",
+    "```\n",
+    "\n",
+    "or from [conda][conda]\n",
+    "\n",
+    "```sh\n",
+    "$ conda install -c nakamura196 genji_predict\n",
+    "```\n",
+    "\n",
+    "or from [pypi][pypi]\n",
+    "\n",
+    "\n",
+    "```sh\n",
+    "$ pip install genji_predict\n",
+    "```\n",
+    "\n",
+    "\n",
+    "[repo]: https://github.com/nakamura196/genji_predict\n",
+    "[docs]: https://nakamura196.github.io/genji_predict/\n",
+    "[pypi]: https://pypi.org/project/genji_predict/\n",
+    "[conda]: https://anaconda.org/nakamura196/genji_predict"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Documentation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Documentation can be found hosted on this GitHub [repository][repo]'s [pages][docs]. Additionally you can find package manager specific guidelines on [conda][conda] and [pypi][pypi] respectively.\n",
+    "\n",
+    "[repo]: https://github.com/nakamura196/genji_predict\n",
+    "[docs]: https://nakamura196.github.io/genji_predict/\n",
+    "[pypi]: https://pypi.org/project/genji_predict/\n",
+    "[conda]: https://anaconda.org/nakamura196/genji_predict"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## How to use"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Fill me in please! Don't forget code examples:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "1+1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "python3",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

nbs/nbdev.yml ADDED Viewed

	@@ -0,0 +1,9 @@

+project:
+  output-dir: _docs
+website:
+  title: "genji_predict"
+  site-url: "https://nakamura196.github.io/genji_predict"
+  description: ""
+  repo-branch: main
+  repo-url: "https://github.com/nakamura196/genji_predict"

nbs/styles.css ADDED Viewed

	@@ -0,0 +1,37 @@

+.cell {
+  margin-bottom: 1rem;
+}
+.cell > .sourceCode {
+  margin-bottom: 0;
+}
+.cell-output > pre {
+  margin-bottom: 0;
+}
+.cell-output > pre, .cell-output > .sourceCode > pre, .cell-output-stdout > pre {
+  margin-left: 0.8rem;
+  margin-top: 0;
+  background: none;
+  border-left: 2px solid lightsalmon;
+  border-top-left-radius: 0;
+  border-top-right-radius: 0;
+}
+.cell-output > .sourceCode {
+  border: none;
+}
+.cell-output > .sourceCode {
+  background: none;
+  margin-top: 0;
+}
+div.description {
+  padding-left: 2px;
+  padding-top: 5px;
+  font-style: italic;
+  font-size: 135%;
+  opacity: 70%;
+}

pyproject.toml ADDED Viewed

	@@ -0,0 +1,3 @@

+[build-system]
+requires = ["setuptools>=64.0"]
+build-backend = "setuptools.build_meta"

requirements.txt CHANGED Viewed

@@ -1,2 +1,4 @@
 levenshtein
-gradio==4.44.1

 levenshtein
+gradio==4.44.1
+nbdev
+tqdm

settings.ini ADDED Viewed

	@@ -0,0 +1,45 @@

+[DEFAULT]
+# All sections below are required unless otherwise specified.
+# See https://github.com/AnswerDotAI/nbdev/blob/main/settings.ini for examples.
+### Python library ###
+repo = genji_predict
+lib_name = %(repo)s
+version = 0.0.1
+min_python = 3.7
+license = apache2
+black_formatting = False
+### nbdev ###
+doc_path = _docs
+lib_path = genji_predict
+nbs_path = nbs
+recursive = True
+tst_flags = notest
+put_version_in_init = True
+### Docs ###
+branch = main
+custom_sidebar = False
+doc_host = https://%(user)s.github.io
+doc_baseurl = /%(repo)s
+git_url = https://github.com/%(user)s/%(repo)s
+title = %(lib_name)s
+### PyPI ###
+audience = Developers
+author = Satoru Nakamura
+author_email = na.kamura.1263@gmail.com
+copyright = 2025 onwards, %(author)s
+description =
+keywords = nbdev jupyter notebook python
+language = English
+status = 3
+user = nakamura196
+### Optional ###
+# requirements = fastcore pandas
+# dev_requirements =
+# console_scripts =
+# conda_user =
+# package_data =

setup.py ADDED Viewed

	@@ -0,0 +1,64 @@

+from pkg_resources import parse_version
+from configparser import ConfigParser
+import setuptools, shlex
+assert parse_version(setuptools.__version__)>=parse_version('36.2')
+# note: all settings are in settings.ini; edit there, not here
+config = ConfigParser(delimiters=['='])
+config.read('settings.ini', encoding='utf-8')
+cfg = config['DEFAULT']
+cfg_keys = 'version description keywords author author_email'.split()
+expected = cfg_keys + "lib_name user branch license status min_python audience language".split()
+for o in expected: assert o in cfg, "missing expected setting: {}".format(o)
+setup_cfg = {o:cfg[o] for o in cfg_keys}
+licenses = {
+    'apache2': ('Apache Software License 2.0','OSI Approved :: Apache Software License'),
+    'mit': ('MIT License', 'OSI Approved :: MIT License'),
+    'gpl2': ('GNU General Public License v2', 'OSI Approved :: GNU General Public License v2 (GPLv2)'),
+    'gpl3': ('GNU General Public License v3', 'OSI Approved :: GNU General Public License v3 (GPLv3)'),
+    'bsd3': ('BSD License', 'OSI Approved :: BSD License'),
+}
+statuses = [ '1 - Planning', '2 - Pre-Alpha', '3 - Alpha',
+    '4 - Beta', '5 - Production/Stable', '6 - Mature', '7 - Inactive' ]
+py_versions = '3.6 3.7 3.8 3.9 3.10 3.11 3.12'.split()
+requirements = shlex.split(cfg.get('requirements', ''))
+if cfg.get('pip_requirements'): requirements += shlex.split(cfg.get('pip_requirements', ''))
+min_python = cfg['min_python']
+lic = licenses.get(cfg['license'].lower(), (cfg['license'], None))
+dev_requirements = (cfg.get('dev_requirements') or '').split()
+package_data = dict()
+pkg_data = cfg.get('package_data', None)
+if pkg_data:
+    package_data[cfg['lib_name']] =  pkg_data.split() # split as multiple files might be listed
+# Add package data to setup_cfg for setuptools.setup(..., **setup_cfg)
+setup_cfg['package_data'] = package_data
+setuptools.setup(
+    name = cfg['lib_name'],
+    license = lic[0],
+    classifiers = [
+        'Development Status :: ' + statuses[int(cfg['status'])],
+        'Intended Audience :: ' + cfg['audience'].title(),
+        'Natural Language :: ' + cfg['language'].title(),
+    ] + ['Programming Language :: Python :: '+o for o in py_versions[py_versions.index(min_python):]] + (['License :: ' + lic[1] ] if lic[1] else []),
+    url = cfg['git_url'],
+    packages = setuptools.find_packages(),
+    include_package_data = True,
+    install_requires = requirements,
+    extras_require={ 'dev': dev_requirements },
+    dependency_links = cfg.get('dep_links','').split(),
+    python_requires  = '>=' + cfg['min_python'],
+    long_description = open('README.md', encoding='utf-8').read(),
+    long_description_content_type = 'text/markdown',
+    zip_safe = False,
+    entry_points = {
+        'console_scripts': cfg.get('console_scripts','').split(),
+        'nbdev': [f'{cfg.get("lib_path")}={cfg.get("lib_path")}._modidx:d']
+    },
+    **setup_cfg)

src/02_demo.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff