Spaces:

Shakshi3104
/

Cobalt

Sleeping

App Files Files Community

Shakshi3104 commited on Nov 9

Commit

133e909

•

1 Parent(s): e319108

[add] Implement notion_db to load contents from Notion database

Browse files

Files changed (1) hide show

model/data/notion_db.py +216 -0

model/data/notion_db.py ADDED Viewed

	@@ -0,0 +1,216 @@

+import os
+from pathlib import Path
+import abc
+import pandas as pd
+from dotenv import load_dotenv
+import notion_client as nt
+from notion2md.exporter.block import StringExporter
+from loguru import logger
+class BaseNotionDatabase:
+    """
+    Notion DBからページのコンテンツを取り出すベースのクラス
+    """
+    def __init__(self):
+        load_dotenv()
+        self.notion_database_id = os.getenv("NOTION_DATABASE_ID")
+        self.integration_token = os.getenv("INTEGRATION_TOKEN")
+        # notion2mdの環境変数
+        os.environ["NOTION_TOKEN"] = os.getenv("INTEGRATION_TOKEN")
+        self.notion_client = nt.Client(auth=self.integration_token)
+    def load_database(self) -> list[dict]:
+        """
+        Notion DBのページ一覧を取得
+        Returns:
+        """
+        results = []
+        has_more = True
+        start_cursor = None
+        while has_more:
+            db = self.notion_client.databases.query(
+                **{
+                    "database_id": self.notion_database_id,
+                    "start_cursor": start_cursor
+                }
+            )
+            # 100件までしか1回に取得できない
+            # 100件以上ある場合 has_more = True
+            has_more = db["has_more"]
+            # 次のカーソル
+            start_cursor = db["next_cursor"]
+            # 取得結果
+            results += db["results"]
+        return results
+    @abc.abstractmethod
+    def load_content(self) -> list[dict]:
+        """
+        Notion DBのページの中身をdictで返す
+        Returns:
+        """
+        raise NotImplementedError
+class SakurapDB(BaseNotionDatabase):
+    def load_database(self) -> list[dict]:
+        """
+        Notion DBのページ一覧を取得
+        Returns:
+            results:
+                list[dict]
+        """
+        results = []
+        has_more = True
+        start_cursor = None
+        while has_more:
+            # "Rap詞 : 櫻井翔"がTrueのもののみ取得
+            db = self.notion_client.databases.query(
+                **{
+                    "database_id": self.notion_database_id,
+                    "filter": {
+                        "property": "Rap詞 : 櫻井翔",
+                        "checkbox": {
+                            "equals": True
+                        }
+                    },
+                    "start_cursor": start_cursor
+                }
+            )
+            # 100件までしか1回に取得できない
+            # 100件以上ある場合 has_more = True
+            has_more = db["has_more"]
+            # 次のカーソル
+            start_cursor = db["next_cursor"]
+            # 取得結果
+            results += db["results"]
+        return results
+    def __load_blocks(self, block_id: str) -> str:
+        """
+        Notionのページをプレーンテキストで取得する (Notion Official API)
+        Parameters
+        ----------
+        block_id:
+            str, Block ID
+        Returns
+        -------
+        texts:
+            str
+        """
+        block = self.notion_client.blocks.children.list(
+            **{
+                "block_id": block_id
+            }
+        )
+        # プレーンテキストを繋げる
+        def join_plain_texts():
+            text = [blck["paragraph"]["rich_text"][0]["plain_text"] if len(blck["paragraph"]["rich_text"])
+                    else "\n" for blck in block["results"]]
+            texts = "\n".join(text)
+            return texts
+        return join_plain_texts()
+    def load_content(self) -> list[dict]:
+        """
+        Notion DBのページの中身をdictで返す
+        Returns:
+            lyrics:
+                list[dict]
+        """
+        # DBのページ一覧を取得
+        db_results = self.load_database()
+        logger.info("🚦 [Notion] load database...")
+        # コンテンツ一覧
+        lyrics = []
+        logger.info("🚦 [Notion] start to load each page content ...")
+        # 各ページの処理
+        for result in db_results:
+            block_id = result["id"]
+            # rap_lyric = self.__load_blocks(block_id)
+            # Markdown形式でページを取得
+            rap_lyric = StringExporter(block_id=block_id).export()
+            # Markdownの修飾子を削除
+            rap_lyric = rap_lyric.replace("\n\n", "\n").replace("<br/>", "\n").replace("*", "")
+            lyrics.append(
+                {
+                    "title": result["properties"]["名前"]["title"][0]["plain_text"],
+                    "content": rap_lyric
+                }
+            )
+        logger.info("🚦 [Notion] Finish to load.")
+        return lyrics
+def fetch_sakurap_corpus(filepath: str, refetch=False) -> pd.DataFrame:
+    """
+    サクラップのコーパスを取得する
+    CSVファイルが存在しないときにNotionから取得する
+    Parameters
+    ----------
+    filepath:
+        str
+    refetch:
+        bool
+    Returns
+    -------
+    """
+    filepath = Path(filepath)
+    if not filepath.exists() or refetch:
+        # CSVファイルを保存するディレクトリが存在しなかったら作成する
+        if not filepath.parent.exists():
+            logger.info(f"🚦 [Notion] mkdir {str(filepath.parent)} ...")
+            filepath.parent.mkdir(parents=True, exist_ok=True)
+        logger.info("🚦 [Notion] fetch from Notion DB ...")
+        # dictを取得
+        rap_db = SakurapDB()
+        lyrics = rap_db.load_content()
+        lyrics_df = pd.DataFrame(lyrics)
+        lyrics_df.to_csv(filepath, index=False)
+    else:
+        logger.info("🚦 [Notion] load CSV file.")
+        lyrics_df = pd.read_csv(filepath)
+    return lyrics_df
+if __name__ == "__main__":
+    sakurap_db = SakurapDB()
+    lyrics = sakurap_db.load_content()