finish approx q learning

use dynamic dispatch for actor selection
2020-04-05 16:34:33 -04:00 · 2020-04-05 13:31:05 -04:00
5 changed files with 193 additions and 19 deletions
--- a/src/actors/mod.rs
+++ b/src/actors/mod.rs
@ -10,6 +10,7 @@ pub struct State {
    matrix: Matrix,
    active_piece: Option<Tetromino>,
    held_piece: Option<TetrominoType>,
+    line_clears: u32,
 }

 impl From<Game> for State {
@ -20,7 +21,9 @@ impl From<Game> for State {

 impl From<&Game> for State {
    fn from(game: &Game) -> Self {
-        game.playfield().clone().into()
+        let mut state: State = game.playfield().clone().into();
+        state.line_clears = game.line_clears;
+        state
    }
 }

@ -30,6 +33,7 @@ impl From<PlayField> for State {
            matrix: playfield.field().clone(),
            active_piece: playfield.active_piece,
            held_piece: playfield.hold_piece().map(|t| t.clone()),
+            line_clears: 0,
        }
    }
 }
@ -37,7 +41,14 @@ impl From<PlayField> for State {
 pub trait Actor {
    fn get_action(&self, rng: &mut SmallRng, state: &State, legal_actions: &[Action]) -> Action;

-    fn update(&mut self, state: State, action: Action, next_state: State, reward: f64);
+    fn update(
+        &mut self,
+        state: State,
+        action: Action,
+        next_state: State,
+        next_legal_actions: &[Action],
+        reward: f64,
+    );

    fn set_learning_rate(&mut self, learning_rate: f64);
    fn set_exploration_prob(&mut self, exploration_prob: f64);
--- a/src/actors/qlearning.rs
+++ b/src/actors/qlearning.rs
@ -1,5 +1,8 @@
 use crate::actors::{Actor, State};
-use crate::game::Action;
+use crate::{
+    game::Action,
+    playfield::{PLAYFIELD_HEIGHT, PLAYFIELD_WIDTH},
+};
 use log::debug;
 use rand::rngs::SmallRng;
 use rand::seq::SliceRandom;
@ -15,10 +18,10 @@ pub struct QLearningAgent {

 impl Default for QLearningAgent {
    fn default() -> Self {
-        QLearningAgent {
-            learning_rate: 0.1,
-            exploration_prob: 0.5,
-            discount_rate: 1.0,
+        Self {
+            learning_rate: 0.0,
+            exploration_prob: 0.0,
+            discount_rate: 0.0,
            q_values: HashMap::default(),
        }
    }
@ -66,7 +69,14 @@ impl Actor for QLearningAgent {
        }
    }

-    fn update(&mut self, state: State, action: Action, next_state: State, reward: f64) {
+    fn update(
+        &mut self,
+        state: State,
+        action: Action,
+        next_state: State,
+        _next_legal_actions: &[Action],
+        reward: f64,
+    ) {
        let cur_q_val = self.get_q_value(&state, action);
        let new_q_val = cur_q_val
            + self.learning_rate
@ -95,3 +105,148 @@ impl Actor for QLearningAgent {
        self.discount_rate = discount_rate;
    }
 }
+
+/// Q-learning agent that approximates Q-values as a weighted sum of
+/// hand-crafted board features instead of storing one value per
+/// (state, action) pair.
+pub struct ApproximateQLearning {
+    /// Step size applied to every weight adjustment in `update`.
+    pub learning_rate: f64,
+    /// Probability with which `get_action` picks a random legal action
+    /// instead of the greedy one.
+    pub exploration_prob: f64,
+    /// Factor applied to the estimated value of the next state in `update`.
+    pub discount_rate: f64,
+    /// Learned weight per feature, keyed by feature name; a missing key is
+    /// read as a weight of 0.0.
+    weights: HashMap<String, f64>,
+}
+
+impl Default for ApproximateQLearning {
+    /// Creates an agent with no learned weights and all hyper-parameters set
+    /// to zero; callers are expected to configure them through the setters.
+    fn default() -> Self {
+        let weights = HashMap::new();
+
+        Self {
+            weights,
+            discount_rate: 0.0,
+            exploration_prob: 0.0,
+            learning_rate: 0.0,
+        }
+    }
+}
+
+impl ApproximateQLearning {
+    /// Computes the feature vector used to approximate a Q-value.
+    ///
+    /// The board features ("Total Height", "Bumpiness", "Holes") are read
+    /// from `state`; "Lines cleared" is the difference in cleared lines
+    /// between `new_state` and `state`. The action itself is currently not
+    /// used by any feature.
+    fn get_features(
+        &self,
+        state: &State,
+        _action: &Action,
+        new_state: &State,
+    ) -> HashMap<String, f64> {
+        // Height of each column: distance from the first occupied row (in
+        // increasing row order) to the end of the matrix, 0 when the column
+        // is empty.
+        let mut heights = [0usize; PLAYFIELD_WIDTH];
+        for (c, height) in heights.iter_mut().enumerate() {
+            *height = (0..PLAYFIELD_HEIGHT)
+                .find(|&r| state.matrix[r][c].is_some())
+                .map_or(0, |r| PLAYFIELD_HEIGHT - r);
+        }
+
+        let total_height: usize = heights.iter().sum();
+
+        // Sum of height differences between *adjacent* columns only. The
+        // first column has no left neighbour, so it must not be compared
+        // against an imaginary column of height 0.
+        let bumpiness: isize = heights
+            .windows(2)
+            .map(|pair| (pair[0] as isize - pair[1] as isize).abs())
+            .sum();
+
+        // Saturate rather than underflow if `new_state` reports fewer
+        // cleared lines than `state`.
+        let lines_cleared = new_state.line_clears.saturating_sub(state.line_clears);
+
+        // A hole is an empty cell whose neighbour in the previous row is
+        // occupied.
+        let holes = (1..PLAYFIELD_HEIGHT)
+            .flat_map(|r| (0..PLAYFIELD_WIDTH).map(move |c| (r, c)))
+            .filter(|&(r, c)| state.matrix[r][c].is_none() && state.matrix[r - 1][c].is_some())
+            .count();
+
+        let mut features = HashMap::with_capacity(4);
+        features.insert(
+            "Total Height".into(),
+            total_height as f64 / (PLAYFIELD_HEIGHT * PLAYFIELD_WIDTH) as f64,
+        );
+        // NOTE(review): 40 is the normalisation constant chosen by the
+        // original author; presumably an upper bound on column height.
+        features.insert(
+            "Bumpiness".into(),
+            bumpiness as f64 / (PLAYFIELD_WIDTH * 40) as f64,
+        );
+        features.insert("Lines cleared".into(), f64::from(lines_cleared) / 4.0);
+        // NOTE(review): unlike the other features this one is not
+        // normalised; kept as-is so learned weights stay comparable.
+        features.insert("Holes".into(), holes as f64);
+
+        features
+    }
+
+    /// Returns the approximate Q-value: the dot product of the feature
+    /// vector with the learned weights. Features without a weight
+    /// contribute 0.
+    fn get_q_value(&self, state: &State, action: &Action, next_state: &State) -> f64 {
+        self.get_features(state, action, next_state)
+            .iter()
+            .map(|(key, val)| val * self.weights.get(key).copied().unwrap_or(0.0))
+            .sum()
+    }
+
+    /// Picks the legal action with the highest approximate Q-value.
+    ///
+    /// `state` is passed as its own successor, so the "Lines cleared"
+    /// feature is always 0 during action selection.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `legal_actions` is empty.
+    fn get_action_from_q_values(&self, state: &State, legal_actions: &[Action]) -> Action {
+        legal_actions
+            .iter()
+            .map(|action| (action, self.get_q_value(state, action, state)))
+            // Compare the floats directly instead of truncating them to
+            // integers; incomparable values (NaN) are treated as equal.
+            .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
+            .map(|(action, _)| *action)
+            .expect("Failed to select an action")
+    }
+
+    /// Returns the highest approximate Q-value over `legal_actions`, or 0.0
+    /// when there are no legal actions.
+    fn get_value(&self, state: &State, legal_actions: &[Action]) -> f64 {
+        legal_actions
+            .iter()
+            .map(|action| self.get_q_value(state, action, state))
+            .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
+            .unwrap_or(0.0)
+    }
+}
+
+impl Actor for ApproximateQLearning {
+    /// Chooses an action epsilon-greedily: with probability
+    /// `exploration_prob` a uniformly random legal action, otherwise the
+    /// action with the highest approximate Q-value.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `legal_actions` is empty.
+    fn get_action(&self, rng: &mut SmallRng, state: &State, legal_actions: &[Action]) -> Action {
+        if rng.gen::<f64>() < self.exploration_prob {
+            *legal_actions
+                .choose(rng)
+                .expect("Failed to select an action")
+        } else {
+            self.get_action_from_q_values(state, legal_actions)
+        }
+    }
+
+    /// Adjusts the feature weights from one observed transition.
+    ///
+    /// Each weight moves by `learning_rate * difference * feature`, where
+    /// `difference` is `reward + discount_rate * value(next_state)` minus
+    /// the current Q-value estimate for (`state`, `action`).
+    fn update(
+        &mut self,
+        state: State,
+        action: Action,
+        next_state: State,
+        next_legal_actions: &[Action],
+        reward: f64,
+    ) {
+        // Build the feature map once and reuse it for both the current
+        // Q-value estimate and the weight adjustment.
+        let features = self.get_features(&state, &action, &next_state);
+        let current_q: f64 = features
+            .iter()
+            .map(|(key, val)| val * self.weights.get(key).copied().unwrap_or(0.0))
+            .sum();
+
+        let difference = reward
+            + self.discount_rate * self.get_value(&next_state, next_legal_actions)
+            - current_q;
+        let step = self.learning_rate * difference;
+
+        for (feat_key, feat_val) in features {
+            // Weights that have never been seen start at 0.0.
+            *self.weights.entry(feat_key).or_insert(0.0) += step * feat_val;
+        }
+    }
+
+    /// Sets the step size used by `update`.
+    fn set_learning_rate(&mut self, learning_rate: f64) {
+        self.learning_rate = learning_rate;
+    }
+
+    /// Sets the probability of taking a random action in `get_action`.
+    fn set_exploration_prob(&mut self, exploration_prob: f64) {
+        self.exploration_prob = exploration_prob;
+    }
+
+    /// Sets the discount applied to the next state's value in `update`.
+    fn set_discount_rate(&mut self, discount_rate: f64) {
+        self.discount_rate = discount_rate;
+    }
+
+    /// Prints the learned feature weights to stderr.
+    fn dbg(&self) {
+        dbg!(&self.weights);
+    }
+}
--- a/src/cli.rs
+++ b/src/cli.rs
@ -65,12 +65,15 @@ pub struct Train {
    /// Number of episodes to train the agent
    #[clap(short = "n", long = "num", default_value = "10")]
    pub episodes: usize,
+    // #[clap(long = "use-epsilon-decreasing")]
+    // pub epsilon_decreasing: bool,
 }

 arg_enum! {
    #[derive(Debug)]
    pub enum Agent {
-        QLearning
+        QLearning,
+        ApproximateQLearning,
    }
 }

@ -87,6 +90,9 @@ pub fn init_verbosity(opts: &Opts) -> Result<(), Box<dyn std::error::Error>> {
    Ok(())
 }

-pub fn get_actor() -> impl Actor {
-    qlearning::QLearningAgent::default()
+/// Builds a default-configured actor for the agent kind selected on the
+/// command line, boxed so callers can treat every agent uniformly.
+pub fn get_actor(agent: Agent) -> Box<dyn Actor> {
+    let actor: Box<dyn Actor> = match agent {
+        Agent::QLearning => Box::<qlearning::QLearningAgent>::default(),
+        Agent::ApproximateQLearning => Box::<qlearning::ApproximateQLearning>::default(),
+    };
+    actor
+}
--- a/src/game.rs
+++ b/src/game.rs
@ -32,7 +32,7 @@ pub struct Game {
    /// The last clear action performed, used for determining if a back-to-back
    /// bonus is needed.
    last_clear_action: ClearAction,
-    line_clears: u32,
+    pub line_clears: u32,
 }

 impl fmt::Debug for Game {
@ -178,7 +178,7 @@ impl Game {
            if cleared_lines > 0 {
                trace!("Lines were cleared.");
                self.line_clears += cleared_lines as u32;
-                self.score += (cleared_lines * self.level as usize) as u32;
+                self.score += (cleared_lines * 100 * self.level as usize) as u32;
                self.level = (self.line_clears / 10) as u8;
                self.playfield.active_piece = None;
                self.next_spawn_tick = self.tick + LINE_CLEAR_DELAY;
--- a/src/main.rs
+++ b/src/main.rs
@ -33,7 +33,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
    match opts.subcmd {
        SubCommand::Play(sub_opts) => {}
        SubCommand::Train(sub_opts) => {
-            let mut to_train = get_actor();
+            let mut to_train = get_actor(sub_opts.agent);
            to_train.set_learning_rate(sub_opts.learning_rate);
            to_train.set_discount_rate(sub_opts.discount_rate);
            to_train.set_exploration_prob(sub_opts.exploration_prob);
@ -61,7 +61,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
    Ok(())
 }

-fn train_actor(episodes: usize, mut actor: impl Actor) -> impl Actor {
+fn train_actor(episodes: usize, mut actor: Box<dyn Actor>) -> Box<dyn Actor> {
    let mut rng = rand::rngs::SmallRng::from_entropy();
    let mut avg = 0.0;

@ -91,14 +91,16 @@ fn train_actor(episodes: usize, mut actor: impl Actor) -> impl Actor {
            let new_state = (&game).into();
            let mut reward = game.score() as f64 - cur_score as f64;
            if action != Action::Nothing {
-                reward -= 10.0;
+                reward -= 0.0;
            }

            if game.is_game_over().is_some() {
-                reward = -100.0;
+                reward = -1.0;
            }

-            actor.update(cur_state, action, new_state, reward);
+            let new_legal_actions = game.get_legal_actions();
+
+            actor.update(cur_state, action, new_state, &new_legal_actions, reward);

            game.tick();
        }
@ -109,7 +111,7 @@ fn train_actor(episodes: usize, mut actor: impl Actor) -> impl Actor {
    actor
 }

-async fn play_game(mut actor: Option<impl Actor>) -> Result<(), Box<dyn std::error::Error>> {
+async fn play_game(mut actor: Option<Box<dyn Actor>>) -> Result<(), Box<dyn std::error::Error>> {
    let mut rng = rand::rngs::SmallRng::from_entropy();
    let sdl_context = sdl2::init()?;
    let video_subsystem = sdl_context.video()?;
Author	SHA1	Message	Date
Edward Shen	f3b48fbc85	finish approx q learning	2020-04-05 16:34:33 -04:00
Edward Shen	ce65afa277	use dynamic dispatch for actor selection	2020-04-05 13:31:05 -04:00