diff --git a/.idea/misc.xml b/.idea/misc.xml
index bc8d0a3..a59d74b 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,7 +1,14 @@
diff --git a/.idea/modules.xml b/.idea/modules.xml
deleted file mode 100644
index 4acc2aa..0000000
--- a/.idea/modules.xml
+++ /dev/null
@@ -1,8 +0,0 @@
diff --git a/.idea/refo.iml b/.idea/refo.iml
index 9ec6aa6..e422b6d 100644
--- a/.idea/refo.iml
+++ b/.idea/refo.iml
@@ -1,18 +1,2 @@
diff --git a/build.gradle b/build.gradle
index d223ea5..a28b89e 100644
--- a/build.gradle
+++ b/build.gradle
@@ -1,11 +1,12 @@
plugins {
id 'java'
+ id 'org.openjfx.javafxplugin' version '0.0.8'
}
group 'net.lwenstrom.jan'
version '1.0-SNAPSHOT'
-sourceCompatibility = 1.8
+sourceCompatibility = 11
repositories {
mavenCentral()
@@ -16,3 +17,7 @@ dependencies {
compileOnly 'org.projectlombok:lombok:1.18.10'
annotationProcessor 'org.projectlombok:lombok:1.18.10'
}
+
+javafx {
+ modules = [ 'javafx.controls', 'javafx.fxml' ]
+}
diff --git a/src/main/java/core/algo/MC/MonteCarloOnPolicyEGreedy.java b/src/main/java/core/algo/MC/MonteCarloOnPolicyEGreedy.java
index 55c70ed..54450e8 100644
--- a/src/main/java/core/algo/MC/MonteCarloOnPolicyEGreedy.java
+++ b/src/main/java/core/algo/MC/MonteCarloOnPolicyEGreedy.java
@@ -1,12 +1,29 @@
-package core.algo.MC;
+package core.algo.mc;
import core.*;
import core.algo.Learning;
import core.policy.EpsilonGreedyPolicy;
import javafx.util.Pair;
-
import java.util.*;
+/**
+ * TODO: Major problem:
+ * StateActionPairs are only unique with respect to their position in the episode.
+ * For example:
+ *
+ * startingState -> MOVE_LEFT : the very first state-action pair of the episode (i = 1).
+ * Imagine the agent never collects the food and ends up back at the start: it receives
+ * -1 for every timestep, so (startingState -> MOVE_LEFT) gets a value of -10.
+ *
+ * BUT imagine that moving left from the starting position has no impact on the state because
+ * the agent ran into a wall. The known world stays the same.
+ * The next action is then taken from the exact same state, just with a different action,
+ * and that stateActionPair gets a value of -9 merely because it occurred on the second
+ * timestep and the sum of all remaining rewards from there is -9...
+ *
+ * How to counter this problem?
+ * @param <A> the action type of the environment
+ */
public class MonteCarloOnPolicyEGreedy<A> extends Learning<A> {
public MonteCarloOnPolicyEGreedy(Environment<A> environment, DiscreteActionSpace<A> actionSpace) {
@@ -22,15 +39,17 @@ public class MonteCarloOnPolicyEGreedy<A> extends Learning<A> {
Map<Pair<State, A>, Double> returnSum = new HashMap<>();
Map<Pair<State, A>, Integer> returnCount = new HashMap<>();
+ State startingState = environment.reset();
for(int i = 0; i < nrOfEpisodes; ++i) {
-
List<StepResult<A>> episode = new ArrayList<>();
State state = environment.reset();
- for(int j=0; j < 100; ++j){
+ double rewardSum = 0;
+ for(int j=0; j < 10; ++j){
Map<A, Double> actionValues = stateActionTable.getActionValues(state);
A chosenAction = policy.chooseAction(actionValues);
StepResultEnvironment envResult = environment.step(chosenAction);
State nextState = envResult.getState();
+ rewardSum += envResult.getReward();
episode.add(new StepResult<>(state, chosenAction, envResult.getReward()));
if(envResult.isDone()) break;
@@ -38,23 +57,25 @@ public class MonteCarloOnPolicyEGreedy<A> extends Learning<A> {
state = nextState;
try {
- Thread.sleep(10);
+ Thread.sleep(1);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
+ System.out.printf("Episode %d \t Reward: %f \n", i, rewardSum);
Set<Pair<State, A>> stateActionPairs = new HashSet<>();
for(StepResult<A> sr: episode){
stateActionPairs.add(new Pair<>(sr.getState(), sr.getAction()));
}
-
+ System.out.println("stateActionPairs " + stateActionPairs.size());
for(Pair<State, A> stateActionPair: stateActionPairs){
int firstOccurenceIndex = 0;
// find the first occurrence of the state-action pair
for(StepResult<A> sr: episode){
if(stateActionPair.getKey().equals(sr.getState()) && stateActionPair.getValue().equals(sr.getAction())){
+;
break;
}
firstOccurenceIndex++;
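
The hunk above cuts off right after the first-occurrence search. For
orientation, this index typically feeds the standard first-visit Monte Carlo
update; the sketch below assumes undiscounted returns (matching the
-1-per-step arithmetic in the TODO) and that StepResult exposes getReward()
alongside the getState()/getAction() accessors used above:

    import core.*;           // State, StepResult, as imported in the file above
    import javafx.util.Pair;
    import java.util.List;
    import java.util.Map;

    class FirstVisitSketch {
        // First-visit MC update for one state-action pair of one episode.
        static <A> double firstVisitUpdate(Pair<State, A> pair, int firstIndex,
                                           List<StepResult<A>> episode,
                                           Map<Pair<State, A>, Double> returnSum,
                                           Map<Pair<State, A>, Integer> returnCount) {
            // g: return summed from the pair's first occurrence to the episode end.
            double g = 0;
            for (int k = firstIndex; k < episode.size(); ++k) {
                g += episode.get(k).getReward(); // assumes a getReward() accessor
            }
            returnSum.merge(pair, g, Double::sum);
            returnCount.merge(pair, 1, Integer::sum);
            // The running average of observed returns becomes the new Q(s, a).
            return returnSum.get(pair) / returnCount.get(pair);
        }
    }

The caller would write the returned average back into stateActionTable for the
pair's state and action.
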
diff --git a/src/main/java/core/policy/GreedyPolicy.java b/src/main/java/core/policy/GreedyPolicy.java
index 15b06f7..a727db3 100644
--- a/src/main/java/core/policy/GreedyPolicy.java
+++ b/src/main/java/core/policy/GreedyPolicy.java
@@ -5,6 +5,7 @@ import core.RNG;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
+import java.util.Random;
public class GreedyPolicy<A> implements Policy<A> {
@@ -17,7 +18,7 @@ public class GreedyPolicy<A> implements Policy<A> {
List<A> equalHigh = new ArrayList<>();
for(Map.Entry<A, Double> actionValue : actionValues.entrySet()){
- System.out.println(actionValue.getKey()+ " " + actionValue.getValue() );
+ // System.out.println(actionValue.getKey() + " " + actionValue.getValue());
if(highestValueAction == null || highestValueAction < actionValue.getValue()){
highestValueAction = actionValue.getValue();
equalHigh.clear();
@@ -27,6 +28,6 @@ public class GreedyPolicy implements Policy {
}
}
- return equalHigh.get(RNG.getRandom().nextInt(equalHigh.size()));
+ return equalHigh.get(new Random().nextInt(equalHigh.size()));
}
}
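
A side effect of this change: new Random() allocates a fresh, time-seeded
generator on every call, so tie-breaking between equally valued actions is no
longer reproducible across runs, which appears to be what the core.RNG wrapper
provided. A minimal sketch of reproducible tie-breaking with a single shared
instance (the seed value is an arbitrary assumption):

    import java.util.List;
    import java.util.Random;

    class TieBreakSketch {
        // One generator per policy, seeded once, instead of one per decision.
        private final Random random = new Random(42);

        <A> A breakTie(List<A> equalHigh) {
            return equalHigh.get(random.nextInt(equalHigh.size()));
        }
    }
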
diff --git a/src/main/java/evironment/antGame/AntWorld.java b/src/main/java/evironment/antGame/AntWorld.java
index ff48eb4..1d656e6 100644
--- a/src/main/java/evironment/antGame/AntWorld.java
+++ b/src/main/java/evironment/antGame/AntWorld.java
@@ -2,7 +2,7 @@ package evironment.antGame;
import core.*;
import core.algo.Learning;
-import core.algo.MC.MonteCarloOnPolicyEGreedy;
+import core.algo.mc.MonteCarloOnPolicyEGreedy;
import evironment.antGame.gui.MainFrame;
@@ -113,6 +113,7 @@ public class AntWorld implements Environment<AntAction>{
// than the starting point
if(currentCell.getType() != CellType.START){
reward = Reward.FOOD_DROP_DOWN_FAIL_NOT_START;
+ done = true;
}else{
reward = Reward.FOOD_DROP_DOWN_SUCCESS;
myAnt.setPoints(myAnt.getPoints() + 1);
@@ -156,10 +157,14 @@ public class AntWorld implements Environment<AntAction>{
done = grid.isAllFoodCollected();
}
+ if(!done){
+ reward = -1;
+ }
if(++tick == maxEpisodeTicks){
done = true;
}
+
StepResultEnvironment result = new StepResultEnvironment(newState, reward, done, info);
getGui().update(action, result);
return result;
@@ -211,6 +216,6 @@ public class AntWorld implements Environment<AntAction>{
new AntWorld(3, 3, 0.1),
new ListDiscreteActionSpace<>(AntAction.values())
);
- monteCarlo.learn(100,5);
+ monteCarlo.learn(20000,5);
}
}
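
Note the ordering in step(): the unconditional reward = -1 runs after the
action-specific branches, so on every non-terminal step it overwrites whatever
Reward constant was assigned earlier. Only a step that ends the episode through
the environment logic (all food collected, failed drop-off) can deliver an
action-specific reward; a step that merely hits maxEpisodeTicks keeps the -1,
because the timeout check runs after the overwrite. A minimal sketch of the
effective per-step reward this produces (class and method names are
illustrative only):

    class EffectiveRewardSketch {
        // Whatever step() assigned is flattened to -1 unless the episode ends.
        static double effectiveReward(double assignedByAction, boolean done) {
            return done ? assignedByAction : -1.0;
        }

        public static void main(String[] args) {
            System.out.println(effectiveReward(0.0, false)); // exploring: -1.0
            System.out.println(effectiveReward(1.0, true));  // successful drop-off: 1.0
        }
    }
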
diff --git a/src/main/java/evironment/antGame/Reward.java b/src/main/java/evironment/antGame/Reward.java
index 855c6ff..9a6926f 100644
--- a/src/main/java/evironment/antGame/Reward.java
+++ b/src/main/java/evironment/antGame/Reward.java
@@ -7,7 +7,7 @@ public class Reward {
public static final double FOOD_DROP_DOWN_FAIL_NO_FOOD = 0;
public static final double FOOD_DROP_DOWN_FAIL_NOT_START = 0;
- public static final double FOOD_DROP_DOWN_SUCCESS = 1000;
+ public static final double FOOD_DROP_DOWN_SUCCESS = 1;
public static final double UNKNOWN_FIELD_EXPLORED = 0;
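
Scaling the success reward from 1000 down to 1 works together with the new
-1-per-step penalty above: against a +1000 terminal bonus, episode length
barely moves the return, so the averaged action values give the agent little
pressure to finish quickly; at +1, speed dominates the return. A quick check
of the undiscounted return bonus - (steps - 1) for a run solved in 5 versus
50 steps (helper names are illustrative only):

    class RewardScaleSketch {
        // Return of an episode that succeeds after n steps, given a terminal bonus.
        static double episodeReturn(int n, double bonus) {
            return bonus - (n - 1);
        }

        public static void main(String[] args) {
            // Old scale: 996.0 vs 951.0, under a 5% difference.
            System.out.println(episodeReturn(5, 1000));
            System.out.println(episodeReturn(50, 1000));
            // New scale: -3.0 vs -48.0, path length dominates.
            System.out.println(episodeReturn(5, 1));
            System.out.println(episodeReturn(50, 1));
        }
    }
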