249 lines
6.7 KiB
Python
Executable File
249 lines
6.7 KiB
Python
Executable File
#!/usr/bin/env python
|
|
|
|
"""
|
|
Processes PGN game data into a tsv format suitable for training.
|
|
|
|
Output columns:
|
|
- FEN (for reference)
|
|
- ALL 768-bit binary string representing the position
|
|
- Evaluation (centipawns) from white perspective
|
|
- Result of the game (-1, 0, 1)
|
|
|
|
This script depends on the `chess` package.
|
|
Install it, or run this script using `pipx run process_pgn_data.py`.
|
|
The script also depends on the chess_inator engine for analysis and filtering.
|
|
"""
|
|
|
|
# /// script
|
|
# requires-python = ">=3.11"
|
|
# dependencies = [
|
|
# "chess",
|
|
# ]
|
|
# ///
|
|
|
|
import argparse
|
|
from asyncio import Queue, TaskGroup, create_task, run, sleep
|
|
import logging
|
|
import datetime
|
|
import multiprocessing
|
|
import gzip
|
|
import csv
|
|
|
|
import chess
|
|
import chess.engine
|
|
from typing import AsyncIterator, Literal
|
|
from chess import pgn
|
|
from pathlib import Path
|
|
|
|
parser = argparse.ArgumentParser(
|
|
description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
|
|
)
|
|
|
|
parser.add_argument(
|
|
"--log",
|
|
choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
|
default="INFO",
|
|
help="Sets log level.",
|
|
)
|
|
|
|
parser.add_argument(
|
|
"--engine",
|
|
help="Set the file path of the chess_inator engine used to analyze the positions.",
|
|
type=Path,
|
|
)
|
|
parser.add_argument(
|
|
"--max-workers",
|
|
help="Max concurrent workers to analyse games with (limit this to your hardware thread count).",
|
|
default=min(4, multiprocessing.cpu_count()),
|
|
type=int,
|
|
)
|
|
parser.add_argument(
|
|
"--preserve-partial",
|
|
action="store_true",
|
|
help="Keep output files that have not been fully written. These files may confuse this script when resuming operations.",
|
|
)
|
|
parser.add_argument("files", nargs="+", type=Path)
|
|
args = parser.parse_args()
|
|
|
|
|
|
logging.basicConfig(level=getattr(logging, str.upper(args.log)))
|
|
|
|
|
|
"""Skip these many plies from the start (avoid training on opening)."""
|
|
SKIP_PLIES: int = 20
|
|
|
|
"""Time limit in seconds for each position to be analyzed."""
|
|
TIME_LIMIT: float = 3
|
|
|
|
|
|
output_queue: Queue[tuple[str, str, int, Literal[-1, 0, 1]]] = Queue()
|
|
|
|
|
|
# stats for progress
|
|
completed = 0
|
|
discarded = 0
|
|
current_outp: Path | None = None
|
|
start_time = datetime.datetime.now()
|
|
|
|
|
|
async def load_games(file: Path):
|
|
"""Load a PGN file and divide up the games for the workers to process."""
|
|
with open(file) as f:
|
|
while game := pgn.read_game(f):
|
|
yield game
|
|
|
|
|
|
async def worker(game_generator: AsyncIterator[pgn.Game]) -> None:
|
|
"""
|
|
Single worker that analyzes whole games.
|
|
|
|
Code pattern taken from https://stackoverflow.com/a/54975674.
|
|
|
|
Puts rows of output into a global queue.
|
|
"""
|
|
transport, engine = await chess.engine.popen_uci(args.engine)
|
|
await engine.configure(dict(NNUETrainInfo="true"))
|
|
|
|
async for game in game_generator:
|
|
wdl: int | None = None
|
|
|
|
match game.headers["Result"]:
|
|
case "1-0":
|
|
wdl = 1
|
|
case "0-1":
|
|
wdl = -1
|
|
case "1/2-1/2":
|
|
wdl = 0
|
|
case other_result:
|
|
logging.error("invalid 'Result' header: '%s'", other_result)
|
|
continue
|
|
|
|
board = game.board()
|
|
|
|
skipped = 0
|
|
|
|
logging.info(
|
|
"Processing game %s, %s (%s) between %s as White and %s as Black.",
|
|
game.headers["Event"],
|
|
game.headers["Site"],
|
|
game.headers["Date"],
|
|
game.headers["White"],
|
|
game.headers["Black"],
|
|
)
|
|
|
|
for move in game.mainline_moves():
|
|
board.push(move)
|
|
if skipped < SKIP_PLIES:
|
|
skipped += 1
|
|
continue
|
|
result = await engine.play(
|
|
board,
|
|
chess.engine.Limit(time=TIME_LIMIT),
|
|
info=chess.engine.INFO_ALL,
|
|
game=game,
|
|
)
|
|
|
|
info_str = result.info.get("string")
|
|
if not info_str:
|
|
raise RuntimeError("Could not analyze position with engine.")
|
|
(name, quiet, eval_abs, tensor) = info_str.split()
|
|
if not name == "NNUETrainInfo":
|
|
raise RuntimeError(f"Unexpected output from engine: {info_str}")
|
|
|
|
if quiet == "non-quiet":
|
|
global discarded
|
|
discarded += 1
|
|
logging.debug("discarded as non-quiet: '%s'", board.fen())
|
|
continue
|
|
elif quiet != "quiet":
|
|
raise RuntimeError(f"Unexpected output from engine: {info_str}")
|
|
|
|
await output_queue.put((board.fen(), tensor, int(eval_abs), wdl))
|
|
|
|
await engine.quit()
|
|
|
|
|
|
async def analyse_games(file: Path):
|
|
"""Task that manages reading PGNs and analyzing them."""
|
|
games_generator = load_games(file)
|
|
|
|
async with TaskGroup() as tg:
|
|
worker_count: int = min(args.max_workers, multiprocessing.cpu_count())
|
|
logging.info("Using %d concurrent worker tasks.", worker_count)
|
|
for i in range(worker_count):
|
|
tg.create_task(worker(games_generator))
|
|
|
|
|
|
async def output_rows(outp_file: Path):
|
|
"""TSV writer task."""
|
|
|
|
with gzip.open(outp_file, "wt") as f:
|
|
writer = csv.writer(f, delimiter="\t")
|
|
while True:
|
|
row = await output_queue.get()
|
|
writer.writerow(row)
|
|
output_queue.task_done()
|
|
global completed
|
|
completed += 1
|
|
|
|
|
|
async def status_logger():
|
|
"""Periodically print status."""
|
|
while True:
|
|
await sleep(5)
|
|
logging.info(
|
|
"Completed %d rows in %f seconds. Discarded %d non-quiet positions.",
|
|
completed,
|
|
(datetime.datetime.now() - start_time).total_seconds(),
|
|
discarded,
|
|
)
|
|
|
|
|
|
async def main():
|
|
status_task = create_task(status_logger())
|
|
|
|
outp_dir = Path("train_data")
|
|
outp_dir.mkdir(exist_ok=True)
|
|
|
|
any_file = False
|
|
skipped = False
|
|
|
|
for file in args.files:
|
|
file: Path
|
|
|
|
outp_file = outp_dir / file.with_suffix(".tsv.gz").name
|
|
|
|
if outp_file.exists():
|
|
skipped = True
|
|
continue
|
|
|
|
any_file = True
|
|
|
|
if skipped:
|
|
logging.info("Resuming at file '%s'.", file)
|
|
skipped = False
|
|
else:
|
|
logging.info("Reading file '%s'.", file)
|
|
|
|
global current_outp
|
|
current_outp = outp_file
|
|
|
|
output_task = create_task(output_rows(outp_file))
|
|
analyse_task = create_task(analyse_games(file))
|
|
await analyse_task
|
|
output_task.cancel()
|
|
|
|
if not any_file:
|
|
logging.warning("Nothing to do. All input files have outputs already.")
|
|
|
|
status_task.cancel()
|
|
|
|
|
|
try:
|
|
run(main())
|
|
except KeyboardInterrupt:
|
|
logging.critical("shutting down.")
|
|
if current_outp and not args.preserve_partial:
|
|
logging.critical("discarding partial output file %s", current_outp)
|
|
current_outp.unlink()
|