chess_inator/nnue/s1_batch_pgn_data.py

#!/usr/bin/env python
"""
Split PGN data into small batch files, since the training data pipeline can't resume processing partway through a single file.
"""
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "chess",
# ]
# ///
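# Example invocation (file and folder names below are illustrative, not fixed by
# the pipeline):
#
#   ./s1_batch_pgn_data.py games/*.pgn --batch-size 16 --output-folder batches
#
# This would split the input PGNs into batches/batch0.pgn, batches/batch1.pgn, ...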
import argparse
import itertools
from pathlib import Path
from typing import Iterator

import chess.pgn

parser = argparse.ArgumentParser()
parser.add_argument("files", nargs="+", type=Path)
parser.add_argument("--batch-size", type=int, help="Number of games to save in each output file. Set this to two to four times the number of concurrent workers used in the processing step.", default=8)
parser.add_argument("--output-folder", type=Path, help="Folder to save batched games in.", default=Path("batches"))
args = parser.parse_args()


def generate_games_in_file(path: Path) -> Iterator[chess.pgn.Game]:
    """Read games from a single PGN file."""
    with open(path) as f:
        # chess.pgn.read_game returns None at end of file, which ends the loop.
        while game := chess.pgn.read_game(f):
            # Record the source file in a header so later stages can trace each game.
            game.headers["PGNPath"] = str(path)
            yield game


def generate_games() -> Iterator[chess.pgn.Game]:
    """Read games from all files."""
    for path in args.files:
        yield from generate_games_in_file(path)


def batch_games():
    """Write games out in batches of `--batch-size` games each."""
    output_folder: Path = args.output_folder
    output_folder.mkdir(exist_ok=True)
    # itertools.batched (Python 3.12+) yields tuples of up to batch_size games.
    for idx, batch in enumerate(itertools.batched(generate_games(), args.batch_size)):
        with (output_folder / f"batch{idx}.pgn").open("w") as f:
            for game in batch:
                # Games within a PGN file are separated by a blank line.
                f.write(str(game) + "\n\n")


batch_games()