青空文庫 - 外字コードの見直しと全自動化

(2025-04-25)

新しい作家の作品をライブラリに追加する場合、その度に面倒なことをしなければなりません。

とても面倒なプロセスなので全自動化することにしました。

Python でも可能だと思いますが、やはり PHP の方が簡単なので、PHP のコードを ChatGPT に作ってもらいました。

外字コードの修正

青空文庫の html ファイルの中にある外字コードを本物の漢字に変換するために外字コード表を作ったのですが、一部の外字が変換されずに残っていることに気づきました。

漢字コードの 2-01-04 は「丒」という字ですが、私のデータベースではそのコードは「2-1-4」となっていました。なのでこのコードを変換します。

-- Zero-pad the second and third components of every code to two digits,
-- e.g. '2-1-4' becomes '2-01-04'. The first component is left as it is.
-- Only rows shaped like digits-digits-digits are touched.
UPDATE `JIS X 0213`
SET code = CONCAT_WS(
    '-',
    SUBSTRING_INDEX(code, '-', 1),
    LPAD(SUBSTRING_INDEX(SUBSTRING_INDEX(code, '-', 2), '-', -1), 2, '0'),
    LPAD(SUBSTRING_INDEX(code, '-', -1), 2, '0')
)
WHERE code REGEXP '^[0-9]+-[0-9]+-[0-9]+$';

ダウンロードした html ファイルの処理プロセス

ダウンロードした html ファイルは、すでに utf-8 で html5 に変換されていますが、外字コードはそのまま html ファイル内に書いてあります。

これを本当の漢字に置換する作業です。

html ファイルのインポート
外字のタグを抽出
タグからコードを抽出
JISコードから漢字を更新
タグを漢字に置換
HTMLを出力

すべて外部ファイル化します。

AozoraImporter.php

<?php
/**
 * Imports HTML files laid out as "<baseDir>/<author>/<title>.html" into the
 * `works` table (columns: author, title, content).
 *
 * The file name without its extension is used as the title, and only the
 * text between the opening <body ...> tag and the first </body> is stored.
 */
class AozoraImporter {
    private PDO $pdo;
    private string $baseDir;

    /** Upsert statement; prepared on first use and reused for every file. */
    private ?PDOStatement $upsertStmt = null;

    /**
     * @param PDO    $pdo     Connection used for the upserts.
     * @param string $baseDir Directory containing one sub-directory per author.
     */
    public function __construct(PDO $pdo, string $baseDir) {
        $this->pdo = $pdo;
        // Normalise to exactly one trailing slash so paths can be concatenated.
        $this->baseDir = rtrim($baseDir, '/') . '/';
    }

    /**
     * Walk every author directory and upsert each .html file found in it.
     *
     * Files that cannot be read, or that contain no <body>...</body> section,
     * are skipped (and reported) instead of storing an empty content string,
     * which would overwrite previously imported content on a re-run.
     */
    public function import(): void {
        foreach ($this->getAuthorDirs() as $author) {
            foreach ($this->getHtmlFiles($author) as $file) {
                $title = pathinfo($file, PATHINFO_FILENAME);
                $body = $this->extractBody($this->baseDir . "$author/$file");

                if ($body === null) {
                    echo "スキップ（本文を取得できません）: $author - $title<br>";
                    continue;
                }

                $this->upsertWork($author, $title, $body);
                echo "登録完了: $author - $title<br>";
            }
        }
    }

    /**
     * @return string[] Names of the sub-directories of baseDir ('.' and '..' excluded).
     */
    private function getAuthorDirs(): array {
        // scandir() returns false when the directory is missing or unreadable.
        $entries = scandir($this->baseDir) ?: [];

        return array_filter($entries, function ($dir) {
            return $dir !== '.' && $dir !== '..' && is_dir($this->baseDir . $dir);
        });
    }

    /**
     * @return string[] Names of the regular files ending in ".html" in the author's directory.
     */
    private function getHtmlFiles(string $author): array {
        $dir = $this->baseDir . $author . '/';
        $entries = scandir($dir) ?: [];

        return array_filter($entries, function ($file) use ($dir) {
            return is_file($dir . $file) && pathinfo($file, PATHINFO_EXTENSION) === 'html';
        });
    }

    /**
     * Return the text between the opening <body ...> tag and the first </body>.
     *
     * The closing tag is located with strpos() rather than a lazy ".*?" regex,
     * so very large files cannot fail on PCRE's backtrack limit.
     *
     * @return string|null The body text, or null if the file is unreadable or has no body.
     */
    private function extractBody(string $filePath): ?string {
        $html = file_get_contents($filePath);
        if ($html === false) {
            return null;
        }

        if (!preg_match('/<body[^>]*>/', $html, $open, PREG_OFFSET_CAPTURE)) {
            return null;
        }

        $start = $open[0][1] + strlen($open[0][0]);
        $end = strpos($html, '</body>', $start);
        if ($end === false) {
            return null;
        }

        return substr($html, $start, $end - $start);
    }

    /**
     * Insert the work, or replace its content when the row already exists.
     * Which columns form the duplicate key is defined by the table schema.
     */
    private function upsertWork(string $author, string $title, string $content): void {
        if ($this->upsertStmt === null) {
            $this->upsertStmt = $this->pdo->prepare("
                INSERT INTO works (author, title, content)
                VALUES (?, ?, ?)
                ON DUPLICATE KEY UPDATE content = VALUES(content)
            ");
        }
        $this->upsertStmt->execute([$author, $title, $content]);
    }
}

// ===== Entry point =====
// Run the import only when this file is the script being executed. Without
// this guard, any script that loads this file with require/require_once just
// to use the class would open a second connection and run a full import as a
// side effect of the include.
if (isset($_SERVER['SCRIPT_FILENAME']) && realpath($_SERVER['SCRIPT_FILENAME']) === __FILE__) {
    $pdo = new PDO('mysql:host=localhost;dbname=aozora;charset=utf8mb4', 'root', 'pass');
    $importer = new AozoraImporter($pdo, '../aozora/shelf');
    $importer->import();
}

GaijiExtractor.php

<?php
/**
 * Collects every distinct <img ... class="gaiji" ...> tag found in
 * works.content and stores it in the `gaiji` table with an empty kanji.
 */
class GaijiExtractor {
    private PDO $pdo;

    /** Tags already sent to the database during this run (tag => true). */
    private array $cache = [];

    /** INSERT statement; prepared on first use and reused for every tag. */
    private ?PDOStatement $insertStmt = null;

    public function __construct(PDO $pdo) {
        $this->pdo = $pdo;
    }

    /**
     * Scan the content of every row in `works` and register its gaiji tags.
     *
     * A failure while processing one work is reported and does not stop the
     * remaining works from being processed.
     */
    public function extractFromWorks(): void {
        // All contents are loaded at once, so raise the memory limit first.
        ini_set('memory_limit', '1024M');

        $stmt = $this->pdo->query("SELECT id, content FROM works");
        $works = $stmt->fetchAll(PDO::FETCH_ASSOC);

        foreach ($works as $work) {
            try {
                // A NULL content column is treated as empty text.
                $this->processContent((string) $work['content']);
            } catch (Throwable $e) {
                echo "エラー: " . $e->getMessage() . "\n";
            }
        }
        echo "works.content から gaiji を抽出して保存しました。\n";
    }

    /**
     * Insert each gaiji <img> tag of $content that has not been seen yet.
     * INSERT IGNORE leaves rows that already exist in the table untouched.
     */
    private function processContent(string $content): void {
        preg_match_all('/<img[^>]*class="gaiji"[^>]*>/i', $content, $matches);

        foreach ($matches[0] as $tag) {
            if (isset($this->cache[$tag])) {
                continue;
            }

            if ($this->insertStmt === null) {
                $this->insertStmt = $this->pdo->prepare(
                    "INSERT IGNORE INTO gaiji (tag, kanji) VALUES (?, '')"
                );
            }
            $this->insertStmt->execute([$tag]);
            $this->cache[$tag] = true;
        }
    }

}

GaijiCodeExtractor.php

<?php
/**
 * Extracts the three-part numeric code (e.g. 1-13-21) from each stored gaiji
 * tag and saves it in gaiji.code.
 */
class GaijiCodeExtractor {
    private PDO $pdo;

    public function __construct(PDO $pdo) {
        $this->pdo = $pdo;
    }

    /**
     * Extract the code from every gaiji.tag and write it to gaiji.code.
     *
     * The code is stored with its second and third parts zero-padded to two
     * digits ("2-1-4" is saved as "2-01-04"), so that it compares equal to the
     * zero-padded codes of the lookup table whichever spelling the tag uses.
     * Tags that contain no code are left untouched.
     */
    public function extractAndSaveCode(): void {
        try {
            $stmt = $this->pdo->query("SELECT id, tag FROM gaiji WHERE tag IS NOT NULL");
            $rows = $stmt->fetchAll(PDO::FETCH_ASSOC);

            $updateStmt = $this->pdo->prepare("UPDATE gaiji SET code = :code WHERE id = :id");

            foreach ($rows as $row) {
                $code = $this->extractCode($row['tag']);
                if ($code === null) {
                    continue;
                }

                $updateStmt->execute([
                    ':code' => $code,
                    ':id' => $row['id']
                ]);
            }

            echo "✅ code カラムの更新が完了しました！\n";

        } catch (PDOException $e) {
            echo "❌ エラー: " . $e->getMessage() . "\n";
        }
    }

    /**
     * Return the first d-d-d code in $tag in zero-padded form, or null if none.
     *
     * The digit look-arounds stop the pattern from matching in the middle of
     * a longer run of digits.
     */
    private function extractCode(string $tag): ?string {
        if (!preg_match('/(?<!\d)(\d{1,2})-(\d{1,2})-(\d{1,2})(?!\d)/', $tag, $m)) {
            return null;
        }

        return $m[1] . '-'
            . str_pad($m[2], 2, '0', STR_PAD_LEFT) . '-'
            . str_pad($m[3], 2, '0', STR_PAD_LEFT);
    }
}

GaijiKanjiUpdater.php

<?php
/**
 * Fills gaiji.kanji by looking up gaiji.code in the `JIS X 0213` table.
 */
class GaijiKanjiUpdater {
    private PDO $pdo;

    /** Lookup statement; prepared on first use and reused for every code. */
    private ?PDOStatement $lookupStmt = null;

    public function __construct(PDO $pdo) {
        $this->pdo = $pdo;
    }

    /**
     * For every gaiji row that has a code, look the code up and store the kanji.
     * Rows whose code has no match in the lookup table are left unchanged.
     */
    public function updateKanjiFromCode(): void {
        try {
            $stmt = $this->pdo->query("SELECT id, code FROM gaiji WHERE code IS NOT NULL");
            $rows = $stmt->fetchAll(PDO::FETCH_ASSOC);

            $updateStmt = $this->pdo->prepare("UPDATE gaiji SET kanji = :kanji WHERE id = :id");

            foreach ($rows as $row) {
                $kanji = $this->getKanjiByCode($row['code']);

                // Skip misses explicitly instead of relying on string truthiness.
                if ($kanji === null || $kanji === '') {
                    continue;
                }

                $updateStmt->execute([
                    ':kanji' => $kanji,
                    ':id' => $row['id']
                ]);
            }

            echo "✅ kanji カラムの更新が完了しました！\n";

        } catch (PDOException $e) {
            echo "❌ エラー: " . $e->getMessage() . "\n";
        }
    }

    /**
     * Return the kanji stored for $code in `JIS X 0213`, or null when there is none.
     */
    private function getKanjiByCode(string $code): ?string {
        // Preparing once keeps the per-row cost down to a single execute().
        if ($this->lookupStmt === null) {
            $this->lookupStmt = $this->pdo->prepare(
                "SELECT kanji FROM `JIS X 0213` WHERE code = :code LIMIT 1"
            );
        }

        $this->lookupStmt->execute([':code' => $code]);
        $result = $this->lookupStmt->fetch(PDO::FETCH_ASSOC);
        $this->lookupStmt->closeCursor();

        if ($result === false) {
            return null;
        }

        return $result['kanji'] ?? null;
    }
}

GaijiReplacer.php

<?php
/**
 * Replaces gaiji <img> tags inside works.content with the kanji stored for
 * them in the `gaiji` table.
 */
class GaijiReplacer {
    private PDO $pdo;

    public function __construct(PDO $pdo) {
        $this->pdo = $pdo;
    }

    /**
     * For every gaiji row that has a resolved kanji, rewrite each work whose
     * content contains the tag.
     *
     * Rows whose kanji is the empty string are skipped: an empty kanji means
     * the character has not been resolved, and replacing the tag with ''
     * would silently delete the character from the text.
     */
    public function replaceTagsWithKanji(): void {
        $gaijiList = $this->pdo->query("
            SELECT tag, kanji FROM gaiji
            WHERE tag IS NOT NULL AND tag <> ''
              AND kanji IS NOT NULL AND kanji <> ''
        ")->fetchAll(PDO::FETCH_ASSOC);

        $selectStmt = $this->pdo->prepare("SELECT id, content FROM works WHERE content LIKE :tag");
        $updateStmt = $this->pdo->prepare("UPDATE works SET content = :newContent WHERE id = :id");

        foreach ($gaijiList as $gaiji) {
            $tag = $gaiji['tag'];
            $kanji = $gaiji['kanji'];

            // Escape the LIKE wildcards and the backslash, which is the
            // default LIKE escape character in MySQL.
            $likeTag = '%' . addcslashes($tag, '\\%_') . '%';

            $selectStmt->execute([':tag' => $likeTag]);
            $works = $selectStmt->fetchAll(PDO::FETCH_ASSOC);

            foreach ($works as $work) {
                $updatedContent = str_replace($tag, $kanji, $work['content']);

                // LIKE is only a pre-filter; write back only on a real change.
                if ($updatedContent === $work['content']) {
                    continue;
                }

                $updateStmt->execute([
                    ':newContent' => $updatedContent,
                    ':id' => $work['id']
                ]);

                echo "✅ 置換: works.id = {$work['id']} に {$kanji} を挿入<br>";
            }
        }

        echo "✨ 全置換完了！<br>";
    }
}

HtmlExporter.php

<?php
/**
 * Writes every row of `works` to "<outputDir>/<author>/<title>.html",
 * replacing gaiji <img> tags with their kanji on the way out.
 */
class HtmlExporter {
    private PDO $pdo;
    private string $outputDir;

    /**
     * @param PDO    $pdo       Connection to read `works` and `gaiji` from.
     * @param string $outputDir Root directory for the generated files.
     */
    public function __construct(PDO $pdo, string $outputDir = 'output') {
        $this->pdo = $pdo;
        $this->outputDir = rtrim($outputDir, '/');
    }

    /** Export every work as a standalone HTML file. */
    public function exportAll(): void {
        $gaijiMap = $this->loadGaijiMap();
        $works = $this->loadWorks();

        foreach ($works as $work) {
            $html = $this->generateHtml($work, $gaijiMap);
            $this->saveHtml($work, $html);
        }

        echo "✅ HTML出力が完了しました！\n";
    }

    /**
     * Load the tag => kanji map.
     *
     * Rows with an empty kanji are excluded: replacing a tag with '' would
     * delete the character from the exported text instead of leaving the
     * original <img> in place.
     *
     * @return array<string,string>
     */
    private function loadGaijiMap(): array {
        $stmt = $this->pdo->query("
            SELECT tag, kanji FROM gaiji
            WHERE tag IS NOT NULL AND tag <> ''
              AND kanji IS NOT NULL AND kanji <> ''
        ");

        return $stmt->fetchAll(PDO::FETCH_KEY_PAIR);
    }

    /** @return array[] Rows with the keys id, author, title and content. */
    private function loadWorks(): array {
        $stmt = $this->pdo->query("SELECT id, author, title, content FROM works");
        return $stmt->fetchAll(PDO::FETCH_ASSOC);
    }

    /**
     * Build the complete HTML document for one work.
     *
     * Author and title are HTML-escaped because they are placed in <title>
     * and in attribute values; the content is stored HTML and is emitted as is.
     */
    private function generateHtml(array $work, array $gaijiMap): string {
        $content = str_replace(
            array_keys($gaijiMap),
            array_values($gaijiMap),
            (string) $work['content']
        );

        $flags = ENT_QUOTES | ENT_SUBSTITUTE;
        $author = htmlspecialchars((string) $work['author'], $flags, 'UTF-8');
        $title = htmlspecialchars((string) $work['title'], $flags, 'UTF-8');

        return <<<HTML
<!DOCTYPE html>
<html lang="ja">
<head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <link rel="stylesheet" href="../../viewer/css/style.css" />
    <title>{$author} {$title}</title>
    <script src="../../jquery-1.4.2.min.js"></script>
    <meta name="DC.Title" content="{$title}" />
    <meta name="DC.Creator" content="{$author}" />
    <meta name="DC.Publisher" content="青空文庫" />
</head>
<body>
{$content}
</body>
</html>
HTML;
    }

    /**
     * Make a string usable as a file name: full-width alphanumerics and
     * spaces are converted to half-width, then characters that are not
     * allowed in file names are replaced with '_'.
     */
    private function sanitizeFileName(string $string): string {
        $string = mb_convert_kana($string, 'as');
        // Every character in the class is ASCII, so byte-wise matching is safe
        // on UTF-8 and cannot fail on invalid input the way the /u modifier can.
        return preg_replace('/[\/:*?"<>|]/', '_', trim($string));
    }

    /**
     * Write $html to "<outputDir>/<author>/<title>.html", creating the author
     * directory if needed. Failures are reported and the work is skipped.
     */
    private function saveHtml(array $work, string $html): void {
        $authorDir = "{$this->outputDir}/" . $this->sanitizeFileName($work['author']);
        // The second is_dir() covers the directory having been created concurrently.
        if (!is_dir($authorDir) && !mkdir($authorDir, 0777, true) && !is_dir($authorDir)) {
            echo "❌ ディレクトリを作成できません: {$authorDir}\n";
            return;
        }

        $filePath = "{$authorDir}/" . $this->sanitizeFileName($work['title']) . ".html";
        if (file_put_contents($filePath, $html) === false) {
            echo "❌ 書き込みに失敗しました: {$filePath}\n";
            return;
        }
        echo "📄 書き出し: {$filePath}\n";
    }
}

メインプログラム

<?php
// Pipeline driver: import the HTML files, resolve gaiji tags to kanji,
// rewrite the stored content, then export every work as an HTML file.
require_once 'AozoraImporter.php';
require_once 'GaijiExtractor.php';
require_once 'GaijiCodeExtractor.php';
require_once 'GaijiKanjiUpdater.php';
require_once 'GaijiReplacer.php';
require_once 'HtmlExporter.php';

$pdo = new PDO('mysql:host=localhost;dbname=aozora;charset=utf8mb4', 'root', 'pass');
// Make every database error throw instead of failing silently.
$pdo->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);

// (1) Import the HTML files into `works`
$importer = new AozoraImporter($pdo, '../aozora/shelf');
$importer->import();

// (2) Extract the gaiji <img> tags
$extractor = new GaijiExtractor($pdo);
$extractor->extractFromWorks();

// (3) Extract the code from each tag
$gaijiExtractor = new GaijiCodeExtractor($pdo);
$gaijiExtractor->extractAndSaveCode();

// (4) Look up the kanji for each code
$kanjiUpdater = new GaijiKanjiUpdater($pdo);
$kanjiUpdater->updateKanjiFromCode();

// (5) Replace the tags with kanji in works.content.
// This step previously ran GaijiKanjiUpdater a second time, so the
// replacement class was loaded but never executed.
$gaijiReplacer = new GaijiReplacer($pdo);
$gaijiReplacer->replaceTagsWithKanji();

// (6) Write the HTML files
$exporter = new HtmlExporter($pdo);
$exporter->exportAll();

echo "✅ 全処理完了\n";

これを実行すると、同じディレクトリに「output」フォルダが作成され、その中に処理した html ファイルが保存されます。

データベースの works テーブルを空にして実行

新しくダウンロードした html ファイルを /var/www/html/aozora/shelf に置いて、上記プログラムを実行すると数十秒で新しい html ファイルが作成されます。

ただし、実行する前に aozora データベースの works を空にしておく必要があります。そうしないととても時間がかかります。

作家インデックスの作成

作家名でソートするために作家インデックスを作ります。これは php では無理なので python で実行します。

import MeCab
import json
import os

# Reading extraction with MeCab
def get_yomi(text):
    """Return the concatenated feature field 7 of the noun nodes in *text*.

    A new MeCab tagger is created on every call. Nodes whose first feature
    field is not "名詞", or that have fewer than eight feature fields,
    contribute nothing to the result.
    """
    tagger = MeCab.Tagger("-Ochasen -r /etc/mecabrc")
    readings = []

    node = tagger.parseToNode(text)
    while node:
        fields = node.feature.split(",")
        is_noun = fields[0] == "名詞"
        if is_noun and len(fields) > 7:
            # Field 7 is presumably the reading — confirm for the dictionary in use.
            readings.append(fields[7])
        node = node.next

    return ''.join(readings)

# Build a title/reading entry for every entry of a directory
def generate_yomi_for_directory(directory):
    """Return a list of {'title', 'yomi'} dicts, one per entry in *directory*.

    Every entry is processed, files and sub-directories alike: the original
    filter was ``filename.endswith("")``, which is true for any name. The
    title is the entry name with its last extension (if any) removed.
    """
    entries = []

    for name in os.listdir(directory):
        title, _extension = os.path.splitext(name)
        entries.append({'title': title, 'yomi': get_yomi(title)})

    return entries

# Persist the result as JSON
def save_to_json(data, filename='/home/mituo/aozora/authors/names.json'):
    """Write *data* to *filename* as UTF-8 JSON with a 4-space indent.

    Non-ASCII characters are written as they are, not as \\uXXXX escapes.
    An existing file is overwritten.
    """
    serialized = json.dumps(data, ensure_ascii=False, indent=4)
    with open(filename, 'w', encoding='utf-8') as out:
        out.write(serialized)

# Example usage. The guard keeps the index from being rebuilt as a side
# effect when this file is imported instead of being run as a script.
if __name__ == "__main__":
    directory = '/home/mituo/aozora/shelf'  # bookshelf directory
    works_with_yomi = generate_yomi_for_directory(directory)
    save_to_json(works_with_yomi)

作家ごとの作品のインデックス

閲覧したい作品に素早くアクセスするために、作家の作品ごとのインデックスを作成します。

import MeCab
import json
import os

import MeCab

def get_yomi(text):
    """Return a reading string for *text* built from MeCab's analysis.

    For every node except the BOS/EOS markers, feature field 7 is used when
    the node has at least eight feature fields; otherwise the node's surface
    form is used unchanged. The pieces are concatenated in order.
    """
    import MeCab
    import os

    # Point MeCab at the system-wide configuration file before creating the tagger.
    os.environ["MECABRC"] = "/etc/mecabrc"
    tagger = MeCab.Tagger("-d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd")

    boundary_stats = (MeCab.MECAB_BOS_NODE, MeCab.MECAB_EOS_NODE)
    pieces = []

    node = tagger.parseToNode(text)
    while node:
        if node.stat not in boundary_stats:
            fields = node.feature.split(",")
            if len(fields) > 7:
                pieces.append(fields[7])
            else:
                pieces.append(node.surface)
        node = node.next

    return ''.join(pieces)



# Process the HTML files of one author directory
def generate_yomi_for_directory(directory):
    """Return a list of {'title', 'yomi'} dicts for the .html files in *directory*.

    The title is the file name without its extension. A file whose reading
    cannot be generated is reported on stdout and left out of the result.
    """
    entries = []

    html_names = (name for name in os.listdir(directory) if name.endswith(".html"))
    for name in html_names:
        title, _extension = os.path.splitext(name)
        try:
            reading = get_yomi(title)
        except Exception as e:
            print(f"Error processing {title}: {e}")
            continue
        entries.append({'title': title, 'yomi': reading})

    return entries

# Process every author directory
def process_all_authors(base_dir, output_base):
    """Write ``<output_base>/<author>/names.json`` for each sub-directory of *base_dir*.

    Entries of *base_dir* that are not directories are ignored. The output
    directory is created when missing, and an existing names.json is
    overwritten.
    """
    author_dirs = (entry for entry in os.scandir(base_dir) if entry.is_dir())

    for author in author_dirs:
        print(f"Processing {author.name}...")

        works = generate_yomi_for_directory(author.path)

        target_dir = os.path.join(output_base, author.name)
        os.makedirs(target_dir, exist_ok=True)

        target_file = os.path.join(target_dir, 'names.json')
        with open(target_file, 'w', encoding='utf-8') as out:
            json.dump(works, out, ensure_ascii=False, indent=4)

# Example usage. The guard keeps the indexes from being rebuilt as a side
# effect when this file is imported instead of being run as a script.
if __name__ == "__main__":
    shelf_dir = '/home/mituo/aozora/shelf'  # directory holding one sub-directory per author
    output_dir = '/home/mituo/aozora/authors'  # output directory
    process_all_authors(shelf_dir, output_dir)

作品名から古い著者名を削除

場合によっては作品名に古い名前の著者名が付くことがあるので、それをシェルスクリプトで削除します。

#!/bin/bash
# Remove the old author name from the names of the entries directly under
# TARGET_DIR.

TARGET_DIR="泉鏡花"
OLD_NAME="泉鏡太郎"

# With nullglob an empty or missing directory gives zero iterations instead
# of one iteration over the literal pattern "泉鏡花/*".
shopt -s nullglob

for file in "$TARGET_DIR"/*; do
  # Edit only the base name, so the directory part of the path can never be altered.
  base=${file##*/}
  new_base=${base//"$OLD_NAME"/}

  # Nothing to remove, or the whole name would disappear.
  if [[ "$base" == "$new_base" || -z "$new_base" ]]; then
    continue
  fi

  new_name="$TARGET_DIR/$new_base"

  # Never overwrite an entry that already has the target name.
  if [[ -e "$new_name" ]]; then
    echo "スキップしました（同名のファイルが存在します）: $file → $new_name"
    continue
  fi

  # "--" keeps names that begin with "-" from being parsed as options.
  if mv -- "$file" "$new_name"; then
    echo "ファイル名を変更しました: $file → $new_name"
  fi
done

scp でアップロード

後は scp でアップロードできます。

面倒な部分はほとんど自動化して、かなり操作は簡単になったと思っています。

しかし現在のところ 5,000 近い作品がアップされているので、これ以上は要らない感じがしています。