ppo_trader.py
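
# Trains a Tensorforce PPOAgent on the BTC trading environment from env.gymWrapper
# and then runs a single deterministic evaluation episode on the held-out test data.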
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import os

import numpy as np
from tensorforce.agents import PPOAgent
from tensorforce.execution import Runner

from env.gymWrapper import create_btc_env

# issue 287
LOAD_DIR = os.path.join(os.getcwd(), "model")
SAVE_DIR = os.path.join(LOAD_DIR, "ppo_agent")


# Callback printing episode statistics and checkpointing the agent on profitable episodes.
def episode_finished(r):
    reward = "%.6f" % r.episode_rewards[-1]
    print("Finished episode {ep} after {ts} timesteps (reward: {reward})".format(
        ep=r.episode, ts=r.episode_timestep, reward=reward))
    # Save a checkpoint whenever the latest episode ended with a positive reward.
    if r.episode_rewards[-1] > 0:
        r.agent.save_model(SAVE_DIR, append_timestep=False)
    # Returning True lets the Runner continue; returning False would stop the run early.
    return True


# Lightweight callback for the test run: log only, never save.
def print_simple_log(r):
    print("Finished episode {ep} after {ts} timesteps (reward: {reward})".format(
        ep=r.episode, ts=r.episode_timestep, reward=r.episode_rewards[-1]))
def create_network_spec():
    network_spec = [
        dict(type='flatten'),
        dict(type='dense', size=32, activation='relu'),
        dict(type='dense', size=32, activation='relu'),
        dict(type='internal_lstm', size=32),
    ]
    return network_spec
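
# The policy network above flattens the windowed market observation, applies two
# 32-unit ReLU dense layers and ends in a 32-unit LSTM. A note on the layer name
# (assumed behaviour of the Tensorforce 0.x layer types): 'internal_lstm' carries
# its recurrent state across timesteps as part of the agent's internals, while the
# plain 'lstm' used for the baseline below only runs over the input window.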

def create_baseline_spec():
    baseline_spec = [
        dict(type='lstm', size=32),
        dict(type='dense', size=32, activation='relu'),
        dict(type='dense', size=32, activation='relu'),
    ]
    return baseline_spec
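
# The baseline spec is used below with baseline_mode='states': a separate value
# network (LSTM over the state window followed by two dense layers) trained with
# its own multi-step Adam optimizer, providing the state-value estimates used for
# advantage computation.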

def main():
    # Create one environment for training and one for testing.
    PATH_TRAIN = "./data/train/"
    PATH_TEST = "./data/test/"
    TIMESTEP = 30  # window size
    environment = create_btc_env(window_size=TIMESTEP, path=PATH_TRAIN, train=True)
    test_environment = create_btc_env(window_size=TIMESTEP, path=PATH_TEST, train=False)

    network_spec = create_network_spec()
    baseline_spec = create_baseline_spec()

    agent = PPOAgent(
        discount=0.9999,
        states=environment.states,
        actions=environment.actions,
        network=network_spec,
        # Agent
        states_preprocessing=None,
        actions_exploration=None,
        reward_preprocessing=None,
        # MemoryModel
        update_mode=dict(
            unit='timesteps',  # alternative: 'episodes'
            # batch of 32 timesteps per update
            batch_size=32,
            # update every 10 timesteps
            frequency=10
        ),
        memory=dict(
            type='latest',
            include_next_states=False,
            capacity=50000
        ),
        # DistributionModel
        distributions=None,
        entropy_regularization=0.0,  # alternative: None
        # PGModel
        baseline_mode='states',
        baseline=dict(type='custom', network=baseline_spec),
        baseline_optimizer=dict(
            type='multi_step',
            optimizer=dict(
                type='adam',
                learning_rate=1e-4  # alternative: 3e-4
            ),
            num_steps=5
        ),
        gae_lambda=0,
        # PGLRModel
        likelihood_ratio_clipping=0.2,
        # PPOAgent
        step_optimizer=dict(
            type='adam',
            learning_rate=1e-4
        ),
        subsampling_fraction=0.2,  # alternative: 0.1
        optimization_steps=10,
        execution=dict(
            type='single',
            session_config=None,
            distributed_spec=None
        )
    )
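
    # Update schedule (assumed Tensorforce 0.x semantics): every 10 timesteps the
    # agent takes the latest 32 timesteps from memory and runs 10 PPO optimization
    # steps, each on a random 20% subsample of that batch, with the likelihood
    # ratio clipped at 0.2.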

    train_runner = Runner(agent=agent, environment=environment)
    test_runner = Runner(agent=agent, environment=test_environment)

    train_runner.run(num_episodes=100, max_episode_timesteps=16000, episode_finished=episode_finished)
    print("Learning finished. Total episodes: {ep}. Average reward of last 100 episodes: {ar}.".format(
        ep=train_runner.episode,
        ar=np.mean(train_runner.episode_rewards[-100:])))

    # Evaluate the trained policy once, deterministically, on the held-out test data.
    test_runner.run(num_episodes=1, deterministic=True, testing=True, episode_finished=print_simple_log)


if __name__ == '__main__':
    main()
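
# A minimal restore sketch, not part of the script above, assuming the pinned
# Tensorforce 0.x release provides Agent.restore_model and that the agent is
# rebuilt with the same states/actions/network specification before restoring:
#
#   agent = PPOAgent(states=environment.states, actions=environment.actions,
#                    network=create_network_spec(), ...)
#   agent.restore_model(directory=SAVE_DIR)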