#!/usr/bin/env python

"""
Batch PGN data into files, since the training data pipeline can't resume processing within a single file.
"""

# /// script
# requires-python = ">=3.12"
# dependencies = [
#   "chess",
# ]
# ///

from typing import Iterator
import chess.pgn
import argparse
import itertools

from pathlib import Path

# Command-line interface. The parsed namespace is kept in the module-level
# `args`, which the functions in this script read directly.
parser = argparse.ArgumentParser()
parser.add_argument("files", nargs="+", type=Path)
parser.add_argument(
    "--batch-size",
    type=int,
    default=8,
    help=(
        "Number of games to save in each output file. Set this to two to four "
        "times the amount of concurrent workers used in the processing step."
    ),
)
parser.add_argument(
    "--output-folder",
    type=Path,
    default=Path("batches"),
    help="Folder to save batched games in.",
)
args = parser.parse_args()

# itertools.batched() rejects sizes below one with a ValueError; report the
# problem up front as a regular usage error instead of a traceback.
if args.batch_size < 1:
    parser.error("--batch-size must be at least 1")

def generate_games_in_file(path: Path) -> Iterator[chess.pgn.Game]:
    """Yield each game parsed from the PGN file at ``path``.

    Every yielded game gets a ``PGNPath`` header set to ``str(path)``, so the
    file it came from stays known after batching. The file is opened in text
    mode with the platform's default encoding and is read lazily, one game
    per iteration.
    """
    with open(path) as pgn_file:
        while True:
            parsed = chess.pgn.read_game(pgn_file)
            if not parsed:
                # A falsy result ends the stream (presumably read_game()
                # returning None at end of input).
                return
            parsed.headers["PGNPath"] = str(path)
            yield parsed

def generate_games() -> Iterator[chess.pgn.Game]:
    """Yield the games of every input file, one file after another.

    The files are taken from ``args.files`` in the order they are listed
    there; each one is read through ``generate_games_in_file``.
    """
    pgn_paths = args.files
    for pgn_path in pgn_paths:
        yield from generate_games_in_file(pgn_path)

def batch_games() -> None:
    """Write all input games to numbered batch files.

    Games from ``generate_games()`` are grouped into chunks of
    ``args.batch_size`` and each chunk is written to
    ``<output folder>/batch<idx>.pgn``, with ``idx`` counting up from 0.
    The final file may hold fewer games than the others. Files are opened
    in ``"w"`` mode, so an existing file with the same name is overwritten.
    """
    output_folder: Path = args.output_folder
    # parents=True lets a nested folder such as "out/batches" be created in
    # one go instead of failing when "out" does not exist yet.
    output_folder.mkdir(parents=True, exist_ok=True)
    batches = itertools.batched(generate_games(), args.batch_size)
    for idx, batch in enumerate(batches):
        with (output_folder / f"batch{idx}.pgn").open("w") as f:
            # Each game is followed by two newlines, as before.
            f.writelines(f"{game}\n\n" for game in batch)

# Script entry point: the batching runs as soon as this module is executed.
batch_games()