From 584d6a12469c1e1455d66f1f389620fafbb3aab5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20L=C3=B6wenstrom?=
Date: Tue, 10 Dec 2019 15:37:20 +0100
Subject: [PATCH] add JavaFX Gradle plugin, switch to Java 11 and add System.out calls for error detection

- The current implementation will not converge to the correct behaviour.
  See the comment in the MonteCarlo class for more details.
---
 .idea/misc.xml                                |  9 ++++-
 .idea/modules.xml                             |  8 -----
 .idea/refo.iml                                | 18 +---------
 build.gradle                                  |  7 +++-
 .../algo/MC/MonteCarloOnPolicyEGreedy.java    | 33 +++++++++++++++----
 src/main/java/core/policy/GreedyPolicy.java   |  5 +--
 .../java/evironment/antGame/AntWorld.java     |  9 +++--
 src/main/java/evironment/antGame/Reward.java  |  2 +-
 8 files changed, 53 insertions(+), 38 deletions(-)
 delete mode 100644 .idea/modules.xml

diff --git a/.idea/misc.xml b/.idea/misc.xml
index bc8d0a3..a59d74b 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,7 +1,14 @@
-
+
+
+
+
 \ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
deleted file mode 100644
index 4acc2aa..0000000
--- a/.idea/modules.xml
+++ /dev/null
@@ -1,8 +0,0 @@
-
-
-
-
-
-
-
-
-\ No newline at end of file
diff --git a/.idea/refo.iml b/.idea/refo.iml
index 9ec6aa6..e422b6d 100644
--- a/.idea/refo.iml
+++ b/.idea/refo.iml
@@ -1,18 +1,2 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
 \ No newline at end of file
+
 \ No newline at end of file
diff --git a/build.gradle b/build.gradle
index d223ea5..a28b89e 100644
--- a/build.gradle
+++ b/build.gradle
@@ -1,11 +1,12 @@
 plugins {
     id 'java'
+    id 'org.openjfx.javafxplugin' version '0.0.8'
 }
 group 'net.lwenstrom.jan'
 version '1.0-SNAPSHOT'
-sourceCompatibility = 1.8
+sourceCompatibility = 11
 repositories {
     mavenCentral()
@@ -16,3 +17,7 @@ dependencies {
     compileOnly 'org.projectlombok:lombok:1.18.10'
     annotationProcessor 'org.projectlombok:lombok:1.18.10'
 }
+
+javafx {
+    modules = [ 'javafx.controls', 'javafx.fxml' ]
+}
diff --git a/src/main/java/core/algo/MC/MonteCarloOnPolicyEGreedy.java b/src/main/java/core/algo/MC/MonteCarloOnPolicyEGreedy.java
index 55c70ed..54450e8 100644
--- a/src/main/java/core/algo/MC/MonteCarloOnPolicyEGreedy.java
+++ b/src/main/java/core/algo/MC/MonteCarloOnPolicyEGreedy.java
@@ -1,12 +1,29 @@
-package core.algo.MC;
+package core.algo.mc;
 
 import core.*;
 import core.algo.Learning;
 import core.policy.EpsilonGreedyPolicy;
 import javafx.util.Pair;
-
 import java.util.*;
 
+/**
+ * TODO: Major problem:
+ * StateActionPairs are only unique when their position in the episode is taken into account.
+ * For example:
+ *
+ * startingState -> MOVE_LEFT : the very first state-action pair of the episode, i = 1.
+ * Imagine the agent never collects the food and never brings it back to the start, so it
+ * receives -1 for every time step; hence (startingState -> MOVE_LEFT) will get a value of -10.
+ *
+ * BUT imagine that moving left from the starting position has no impact on the state because
+ * the agent ran into a wall. The known world stays the same.
+ * The action taken after that therefore starts from the exact same state, only with a
+ * different action, and that stateActionPair gets a value of -9 simply because it took place
+ * at the second time step and the sum of all remaining rewards from there is -9...
+ *
+ * How to counter this problem?
+ * @param
+ */
 public class MonteCarloOnPolicyEGreedy extends Learning {
 
     public MonteCarloOnPolicyEGreedy(Environment environment, DiscreteActionSpace actionSpace) {
@@ -22,15 +39,17 @@ public class MonteCarloOnPolicyEGreedy extends Learning {
         Map<Pair<State, A>, Double> returnSum = new HashMap<>();
         Map<Pair<State, A>, Integer> returnCount = new HashMap<>();
+        State startingState = environment.reset();
 
         for(int i = 0; i < nrOfEpisodes; ++i) {
-
             List<StepResult<A>> episode = new ArrayList<>();
             State state = environment.reset();
-            for(int j=0; j < 100; ++j){
+            double rewardSum = 0;
+            for(int j=0; j < 10; ++j){
                 Map actionValues = stateActionTable.getActionValues(state);
                 A chosenAction = policy.chooseAction(actionValues);
                 StepResultEnvironment envResult = environment.step(chosenAction);
                 State nextState = envResult.getState();
+                rewardSum += envResult.getReward();
                 episode.add(new StepResult<>(state, chosenAction, envResult.getReward()));
 
                 if(envResult.isDone()) break;
@@ -38,23 +57,25 @@
                 state = nextState;
 
                 try {
-                    Thread.sleep(10);
+                    Thread.sleep(1);
                 } catch (InterruptedException e) {
                     e.printStackTrace();
                 }
             }
+            System.out.printf("Episode %d \t Reward: %f \n", i, rewardSum);
 
             Set<Pair<State, A>> stateActionPairs = new HashSet<>();
             for(StepResult sr: episode){
                 stateActionPairs.add(new Pair<>(sr.getState(), sr.getAction()));
             }
-
+            System.out.println("stateActionPairs " + stateActionPairs.size());
             for(Pair stateActionPair: stateActionPairs){
                 int firstOccurenceIndex = 0;
                 // find first occurance of state action pair
                 for(StepResult sr: episode){
                     if(stateActionPair.getKey().equals(sr.getState()) && stateActionPair.getValue().equals(sr.getAction())){
+;
                         break;
                     }
                     firstOccurenceIndex++;
diff --git a/src/main/java/core/policy/GreedyPolicy.java b/src/main/java/core/policy/GreedyPolicy.java
index 15b06f7..a727db3 100644
--- a/src/main/java/core/policy/GreedyPolicy.java
+++ b/src/main/java/core/policy/GreedyPolicy.java
@@ -5,6 +5,7 @@ import core.RNG;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
+import java.util.Random;
 
 public class GreedyPolicy implements Policy {
 
@@ -17,7 +18,7 @@ public class GreedyPolicy implements Policy {
         List equalHigh = new ArrayList<>();
 
         for(Map.Entry actionValue : actionValues.entrySet()){
-            System.out.println(actionValue.getKey()+ " " + actionValue.getValue() );
+            // System.out.println(actionValue.getKey() + " " + actionValue.getValue());
             if(highestValueAction == null || highestValueAction < actionValue.getValue()){
                 highestValueAction = actionValue.getValue();
                 equalHigh.clear();
@@ -27,6 +28,6 @@
             }
         }
 
-        return equalHigh.get(RNG.getRandom().nextInt(equalHigh.size()));
+        return equalHigh.get(new Random().nextInt(equalHigh.size()));
     }
 }
diff --git a/src/main/java/evironment/antGame/AntWorld.java b/src/main/java/evironment/antGame/AntWorld.java
index ff48eb4..1d656e6 100644
--- a/src/main/java/evironment/antGame/AntWorld.java
+++ b/src/main/java/evironment/antGame/AntWorld.java
@@ -2,7 +2,7 @@ package evironment.antGame;
 
 import core.*;
 import core.algo.Learning;
-import core.algo.MC.MonteCarloOnPolicyEGreedy;
+import core.algo.mc.MonteCarloOnPolicyEGreedy;
 import evironment.antGame.gui.MainFrame;
 
@@ -113,6 +113,7 @@ public class AntWorld implements Environment{
                     // than the starting point
                     if(currentCell.getType() != CellType.START){
                         reward = Reward.FOOD_DROP_DOWN_FAIL_NOT_START;
+                        done = true;
                     }else{
                         reward = Reward.FOOD_DROP_DOWN_SUCCESS;
                         myAnt.setPoints(myAnt.getPoints() + 1);
@@ -156,10 +157,14 @@ public class AntWorld implements Environment{
             done = grid.isAllFoodCollected();
         }
 
+        if(!done){
+            reward = -1;
+        }
         if(++tick == maxEpisodeTicks){
             done = true;
         }
 
+
         StepResultEnvironment result = new StepResultEnvironment(newState, reward, done, info);
         getGui().update(action, result);
         return result;
@@ -211,6 +216,6 @@ public class AntWorld implements Environment{
                 new AntWorld(3, 3, 0.1),
                 new ListDiscreteActionSpace<>(AntAction.values())
         );
-        monteCarlo.learn(100,5);
+        monteCarlo.learn(20000,5);
     }
 }
diff --git a/src/main/java/evironment/antGame/Reward.java b/src/main/java/evironment/antGame/Reward.java
index 855c6ff..9a6926f 100644
--- a/src/main/java/evironment/antGame/Reward.java
+++ b/src/main/java/evironment/antGame/Reward.java
@@ -7,7 +7,7 @@ public class Reward {
     public static final double FOOD_DROP_DOWN_FAIL_NO_FOOD = 0;
     public static final double FOOD_DROP_DOWN_FAIL_NOT_START = 0;
-    public static final double FOOD_DROP_DOWN_SUCCESS = 1000;
+    public static final double FOOD_DROP_DOWN_SUCCESS = 1;
     public static final double UNKNOWN_FIELD_EXPLORED = 0;
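
To make the problem described in the MonteCarloOnPolicyEGreedy TODO comment above concrete, here is a small, self-contained sketch. It is not part of the patch: the Step class and every name in it are made up for illustration, and it only mimics the return computation the comment describes, i.e. summing all rewards from the first occurrence of a state-action pair to the end of the episode.

import java.util.ArrayList;
import java.util.List;

/**
 * Illustration of the aliasing described in the patch comment: with a constant -1 reward
 * per step, two state-action pairs that share the same state receive different first-visit
 * returns purely because of where they first occur in the episode.
 */
public class FirstVisitAliasingDemo {

    /** Hypothetical stand-in for the project's StepResult; not a class from the repository. */
    static class Step {
        final String state;
        final String action;
        final double reward;

        Step(String state, String action, double reward) {
            this.state = state;
            this.action = action;
            this.reward = reward;
        }
    }

    /** Undiscounted return: sum of rewards from the first occurrence of (state, action) onwards. */
    static double firstVisitReturn(List<Step> episode, String state, String action) {
        double g = 0;
        boolean seen = false;
        for (Step step : episode) {
            if (!seen && step.state.equals(state) && step.action.equals(action)) {
                seen = true;
            }
            if (seen) {
                g += step.reward;
            }
        }
        return g;
    }

    public static void main(String[] args) {
        // The agent bumps into a wall on its first move, so the observed state does not change.
        List<Step> episode = new ArrayList<>();
        episode.add(new Step("start", "MOVE_LEFT", -1));   // t = 0: state stays "start"
        episode.add(new Step("start", "MOVE_RIGHT", -1));  // t = 1: same state, different action
        for (int t = 2; t < 10; t++) {
            episode.add(new Step("cell-" + t, "MOVE_RIGHT", -1));
        }

        // Prints -10.0 and -9.0: the gap reflects only the position of the first occurrence
        // in the episode, not the relative quality of the two actions.
        System.out.println(firstVisitReturn(episode, "start", "MOVE_LEFT"));
        System.out.println(firstVisitReturn(episode, "start", "MOVE_RIGHT"));
    }
}

The commit message states that the current implementation does not converge; this sketch only illustrates the return computation the comment is questioning, not a fix for it.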