shigureni.dev

読みを統計に変える — MCTS

ミニマックスは盤面を評価関数で点数化する。囲碁のように何が良い局面か言葉にしづらいゲームでは、その評価関数が書けない。評価関数を捨てて、ランダムに最後まで打ち切る試行を何度も回し、勝率の高い手を統計で選ぶのがモンテカルロ木探索、MCTS。

MCTS は四つの段を繰り返す。選択（根から、UCT が高い子をたどって葉まで降りる）、展開（葉に新しい子を生やす）、シミュレーション（そこからランダムに決着まで打って勝敗を得る）、逆伝播（結果を根まで持ち上げて通った各ノードの訪問数と勝数を加える）。子の選び方が UCT（Upper Confidence bound for Trees）で、勝率 w/n（活用）に未訪問ほど大きくなる項 C·√(ln N / n)（探索）を足した和が最大の子を選ぶ。C が活用と探索の釣り合いを決める定数。試行を打ち切ったその瞬間でも、訪問数が最多の子を最善手として返せる。Rémi Coulom が 2006 年に提唱し、評価関数を書かずに囲碁 AI を一段強くした。

子の有望さは UCT 一本で測る。一度も訪れていない子は 1e9 を返して必ず先に選ばせ、訪問済みは勝率と探索項の和を返す。

const uct = (childWins, childVisits, parentVisits, explore) => {
  if (childVisits === 0) return 1e9 // 未訪問の子を最優先で開く
  const exploit = childWins / childVisits // 勝率（活用）
  const search = explore * Math.sqrt(Math.log(parentVisits + 1) / childVisits) // 訪問が薄いほど大（探索）
  return exploit + search
}

四つの段を一回の iterate に詰める。根から uct が最大の子をたどって葉へ降り（選択）、葉に子を生やし（展開）、ランダムな報酬を作り（シミュレーションの代わり）、その値を親へ遡って各ノードの訪問数と勝数に足す（逆伝播）。深さごとにノードを横へ並べ、訪問数を半径に写すと木が育つ。報酬を一様乱数にしているので、どの枝も勝率はならされ、訪問の多い枝だけが太る。

const uct = (childWins, childVisits, parentVisits, explore) => {
  if (childVisits === 0) return 1e9
  const exploit = childWins / childVisits
  const search = explore * Math.sqrt(Math.log(parentVisits + 1) / childVisits)
  return exploit + search
}

export default (c) => {
  const w = c.canvas.width
  const h = c.canvas.height
  const explore = 1.4
  const maxNodes = 200
  const top = h * 0.06
  const bottom = h * 0.92

let nodes = [{ parent: -1, kids: [], visits: 0, wins: 0, depth: 0 }]
  const px = []
  const py = []

const iterate = () => {
    let i = 0
    let guard = 0
    while (nodes[i].kids.length > 0 && guard++ < 64) {
      const cur = nodes[i]
      let best = cur.kids[0]
      let bestVal = -1
      for (const ci of cur.kids) {
        const val = uct(nodes[ci].wins, nodes[ci].visits, cur.visits, explore)
        if (val > bestVal) {
          bestVal = val
          best = ci
        }
      }
      i = best
    }
    const leaf = nodes[i]
    if (leaf.visits > 0 && leaf.depth < 11 && nodes.length < maxNodes - 4) {
      const kidCount = 1 + Math.floor(Math.random() * 3)
      for (let k = 0; k < kidCount && nodes.length < maxNodes; k++) {
        leaf.kids.push(nodes.length)
        nodes.push({ parent: i, kids: [], visits: 0, wins: 0, depth: leaf.depth + 1 })
      }
      if (leaf.kids.length > 0) i = leaf.kids[Math.floor(Math.random() * leaf.kids.length)]
    }
    const reward = Math.random()
    let j = i
    let back = 0
    while (j >= 0 && back++ < 32) {
      nodes[j].visits += 1
      nodes[j].wins += reward
      j = nodes[j].parent
    }
  }

c.fillStyle = colors.paper
  c.fillRect(0, 0, w, h)

return () => {
    for (let it = 0; it < 6; it++) iterate()
    if (nodes.length >= maxNodes - 4) {
      nodes = [{ parent: -1, kids: [], visits: 0, wins: 0, depth: 0 }]
    }
    layout()

c.fillStyle = colors.paper
    c.fillRect(0, 0, w, h)
    c.strokeStyle = colors.tintAlpha(0, 0.28)
    c.lineWidth = 1
    for (let n = 1; n < nodes.length; n++) {
      const p = nodes[n].parent
      c.beginPath()
      c.moveTo(px[p], py[p])
      c.lineTo(px[n], py[n])
      c.stroke()
    }

const total = nodes[0].visits + 1
    for (let n = 0; n < nodes.length; n++) {
      const rad = Math.min(2 + Math.sqrt(nodes[n].visits / total) * 30, 20)
      c.fillStyle = nodes[n].depth === 0 ? colors.ink : colors.tint(28)
      c.beginPath()
      c.arc(px[n], py[n], rad, 0, Math.PI * 2)
      c.fill()
    }
  }
}

根が太く、枝先へ向かって細る木が育っては枝分かれを繰り返す。maxNodes に達すると根一個にリセットして育て直す。explore を 0 に近づけると勝率の高い枝へ偏って一本道に伸び、上げると枝が横に広がる。

報酬が枝によって偏ると、UCT の活用項が効いて訪問が良い側へ寄る。各枝に勝ちやすさの偏り bias を持たせ、報酬を Math.random() < 0.5 + bias の勝ち負け（1 か 0）にする。勝率を明るさに写すと、勝つ枝が明るく、かつ太く育つ。

export default (c) => {
  const w = c.canvas.width
  const h = c.canvas.height
  const explore = 1.4
  const maxNodes = 220
  const top = h * 0.06
  const bottom = h * 0.92

const make = (parent, depth, bias) => ({ parent, kids: [], visits: 0, wins: 0, depth, bias })
  let nodes = [make(-1, 0, 0)]
  const px = []
  const py = []

const iterate = () => {
    let i = 0
    let guard = 0
    while (nodes[i].kids.length > 0 && guard++ < 64) {
      const cur = nodes[i]
      let best = cur.kids[0]
      let bestVal = -1
      for (const ci of cur.kids) {
        const val = uct(nodes[ci].wins, nodes[ci].visits, cur.visits, explore)
        if (val > bestVal) {
          bestVal = val
          best = ci
        }
      }
      i = best
    }
    const leaf = nodes[i]
    if (leaf.visits > 0 && leaf.depth < 11 && nodes.length < maxNodes - 4) {
      const kidCount = 1 + Math.floor(Math.random() * 3)
      for (let k = 0; k < kidCount && nodes.length < maxNodes; k++) {
        const lean = (k / Math.max(1, kidCount - 1) - 0.5) * 0.6
        leaf.kids.push(nodes.length)
        nodes.push(make(i, leaf.depth + 1, leaf.bias + lean))
      }
      if (leaf.kids.length > 0) i = leaf.kids[Math.floor(Math.random() * leaf.kids.length)]
    }
    const reward = Math.random() < 0.5 + nodes[i].bias ? 1 : 0
    let j = i
    let back = 0
    while (j >= 0 && back++ < 32) {
      nodes[j].visits += 1
      nodes[j].wins += reward
      j = nodes[j].parent
    }
  }

c.fillStyle = colors.paper
  c.fillRect(0, 0, w, h)

return () => {
    for (let it = 0; it < 6; it++) iterate()
    if (nodes.length >= maxNodes - 4) nodes = [make(-1, 0, 0)]
    layout()

const total = nodes[0].visits + 1
    for (let n = 0; n < nodes.length; n++) {
      const rad = Math.min(2 + Math.sqrt(nodes[n].visits / total) * 30, 20)
      const rate = nodes[n].visits > 0 ? nodes[n].wins / nodes[n].visits : 0.5
      c.fillStyle = nodes[n].depth === 0 ? colors.ink : colors.tint(20 + rate * 170)
      c.beginPath()
      c.arc(px[n], py[n], rad, 0, Math.PI * 2)
      c.fill()
    }
  }
}

勝ちやすい枝ほど明るく塗られ、UCT の活用項が訪問をその側へ寄せて太く育つ。負けの濃い枝は探索項のおかげで細く残り、たまに開かれる。一度も来ない枝が消えず細く残るのは、未訪問の子の 1e9 がいずれ順番を回してくるため。ミニマックスが全部読んで一番を確定するのに対し、MCTS はランダムに薄く読んで統計で寄せる。評価関数を持たず、試行回数を増やすほど勝率の推定が締まって手が強くなる。