openAIgymNLinkAdvanced.py
You can view and download this file on GitHub: openAIgymNLinkAdvanced.py
1#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2# This is an EXUDYN example
3#
4# Details: This file shows integration with OpenAI gym by testing an inverted quintuple pendulum example
5#
6# Author: Johannes Gerstmayr
7# edited by: Peter Manzl
8# Date: 2022-05-25
9#
10# Update: 2024-08-09: works now with 4 and 5 links using SAC; added truncation time and a disturbance force to avoid training many steps in the straight configuration (and forgetting everything else...)
11# Notes: requires pip install stable-baselines3[extra]
12#
13# Copyright: This file is part of Exudyn. Exudyn is free software. You can redistribute it and/or modify it under the terms of the Exudyn license. See 'LICENSE.txt' for more details.
14#
15#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
16
17# import sys
18# sys.exudynFast = True #this variable is used to signal to load the fast exudyn module
19
20import exudyn as exu
21# from exudyn.utilities import * #includes itemInterface and rigidBodyUtilities
22from exudyn.utilities import ObjectGround, VObjectGround, RigidBodyInertia, HTtranslate, HTtranslateY, HT0,\
23 MarkerNodeCoordinate, LoadCoordinate, LoadSolutionFile, MarkerKinematicTreeRigid, Force
24import exudyn.graphics as graphics #only import if it does not conflict
25from exudyn.robotics import Robot, RobotLink, VRobotLink, RobotBase, VRobotBase, RobotTool, VRobotTool
26from exudyn.artificialIntelligence import OpenAIGymInterfaceEnv, spaces, logger
27#from exudyn.artificialIntelligence import *
28import math
29import numpy as np
30
31import os
32import sys
33os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
34
35import stable_baselines3
36useOldGym = tuple(map(int, stable_baselines3.__version__.split('.'))) <= tuple(map(int, '1.8.0'.split('.')))
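
# Note (illustrative sketch): useOldGym is used in InvertedNPendulumEnv.step() below to decide
# which return signature to use, since stable-baselines3 <= 1.8.0 expects the classic Gym
# 4-tuple, while newer versions expect the Gymnasium-style 5-tuple with separate
# terminated/truncated flags:
#   observation, reward, done, info = env.step(action)                    # old Gym API
#   observation, reward, terminated, truncated, info = env.step(action)   # Gymnasium API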
37
38##%% here the number of links can be changed. Note that for n < 3 the actuator
39# force might need to be increased
40nLinks = 3 #number of inverted links ...
41flagContinuous = True # True: continuous action space (required for SAC); False: discrete actions (A2C)
42
43#%%++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
44#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
45#if you like to visualize the training progress in tensorboard:
46# pip install tensorboard
47# open anaconda prompt (where your corresponding python environment runs)
48# go to directory:
49# solution/
50# call this in anaconda prompt:
51# tensorboard --logdir=./tensorboard_log/
52# open web browser to show progress (reward, loss, ...):
53# http://localhost:6006/
54
55
56#add logger for reward:
57from stable_baselines3.common.callbacks import BaseCallback
58from stable_baselines3.common.logger import Logger
59
60best_model_text = 'invpend_best_model'
61
62hasTensorboard = False
63try:
64 import tensorboard
65 hasTensorboard = True
66 if __name__ == '__main__':
67 print('output written to tensorboard, start tensorboard to see progress!')
68except:
69 pass
70
71#derive class from logger to log additional information
72#this is needed for vectorized environments, where reward is not logged automatically
73class RewardLoggingCallback(BaseCallback):
74 def __init__(self, verbose=0, log_dir = 'solution/tensorboard_log', bestModelName=best_model_text):
75 super(RewardLoggingCallback, self).__init__(verbose)
76 self.log_dir = log_dir
77 self.save_path = os.path.join(log_dir, bestModelName).replace('tensorboard_log/','')
78 self.bestRewardSum = 0
79 self.bestRewardSumPrint = 0
80
81 #log mean values at rollout end
82 def _on_rollout_end(self) -> bool:
83 rewardSum = -1
84 if 'infos' in self.locals:
85 info = self.locals['infos'][-1]
86 # print('infos:', info)
87 if 'rewardMean' in info:
88 self.logger.record("rollout/rewardMean", info['rewardMean'])
89 if 'episodeLen' in info:
90 self.logger.record("rollout/episodeLen", info['episodeLen'])
91 if 'rewardSum' in info:
92 self.logger.record("rollout/rewardSum", info['rewardSum'])
93 rewardSum = info['rewardSum']
94
95
96 # New best model, you could save the agent here
97 if rewardSum > self.bestRewardSum:
98 self.bestRewardSum = rewardSum
99 # Example for saving best model
100 if self.verbose > 0 and rewardSum>100 and rewardSum > 1.2*self.bestRewardSumPrint:
101 self.bestRewardSumPrint = rewardSum
102 print("Saving new best model with reward sum "+str(rewardSum)+" to "+self.save_path)
103 self.model.save(self.save_path)
104
105
106 #log (possibly) every step
107 def _on_step(self) -> bool:
108 #extract local variables to find reward
109 if 'infos' in self.locals:
110 info = self.locals['infos'][-1]
111
112 if 'reward' in info:
113 self.logger.record("train/reward", info['reward'])
114 #for SAC / A2C in non-vectorized envs, per episode:
115 if 'episode' in info and 'r' in info['episode']:
116 self.logger.record("episode/reward", info['episode']['r'])
117 return True
118#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
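
# Usage sketch (illustrative; the actual wiring is done in the __main__ section below):
#   rewardCallback = RewardLoggingCallback(verbose=1, bestModelName=best_model_text+str(nLinks))
#   model.learn(total_timesteps=..., callback=rewardCallback)
# stable-baselines3 then calls _on_step() after every environment step and _on_rollout_end()
# at the end of each rollout, so the recorded values show up in tensorboard.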
119
120
121class InvertedNPendulumEnv(OpenAIGymInterfaceEnv):
122
123 #**classFunction: OVERRIDE this function to create multibody system mbs and setup simulationSettings; call Assemble() at the end!
124 # you may also change SC.visualizationSettings() individually; kwargs may be used for special setup
125 def CreateMBS(self, SC, mbs, simulationSettings, **kwargs):
126
127 #%%++++++++++++++++++++++++++++++++++++++++++++++
128 #this model uses kwargs: thresholdFactor
129
130 global nLinks
131 global flagContinuous
132 self.nLinks = nLinks
133 self.flagContinuous = flagContinuous
134 self.nTotalLinks = nLinks+1
135
136 #added for continuous disturbance (reset)
137 self.testModel = False #flag to show that model is evaluated
138 self.lastReset = 0 #time of last reset
139
140 # self.steps_beyond_done = False
141 thresholdFactor = 1
142
143 gravity = 9.81
144 self.length = 1.
145 width = 0.1*self.length
146 masscart = 1.
147 massarm = 0.1
148 total_mass = massarm + masscart
149 armInertia = self.length**2*massarm/12
150
151 self.randomDisturbanceForce = 0.05
152 self.maxTimeTruncate = 120 #seconds at which training is truncated
153
154 self.randomInitializationValue = 0.05 #default is 0.05, which may be too large for many links
155        # environment variables and force magnitudes are taken from the
156        # paper "Reliability evaluation of reinforcement learning methods for
157        # mechanical systems with increasing complexity", Manzl et al.
158
159 self.force_mag = 40 # 10 #10*2 works for 7e7 steps; must be larger for triple pendulum to be more reactive ...
160 if self.nLinks == 1:
161 self.force_mag = 12
162 if self.nLinks == 3:
163 self.force_mag = self.force_mag*1.5
164 thresholdFactor = 2
165 if self.nLinks >= 4:
166 self.force_mag = self.force_mag*2.5
167 thresholdFactor = 2.5
168 if self.nLinks == 6:
169 self.force_mag = 200
170 thresholdFactor = 4
171
172 if self.flagContinuous:
173 self.force_mag *= 2 #continuous controller can have larger max value
174
175 if 'thresholdFactor' in kwargs:
176 thresholdFactor = kwargs['thresholdFactor']
177
178 self.stepUpdateTime = 0.04 # step size for RL-method; PROBLEM: small steps: no learning progress; large steps: cannot control fast enough
179 nSubSteps = 4 #for increased accuracy, definitely needed for nLinks >= 4
180 if self.nLinks >= 4:
181 # self.stepUpdateTime = 0.04 # step size for RL-method
182 # nSubSteps = 4
183 self.randomInitializationValue = 0.04 #default is 0.05, which may be too large for 4 links
184 if self.nLinks >= 5:
185 self.randomInitializationValue = 0.02
186            self.randomDisturbanceForce = 0.02 # smaller disturbance force for 5 links
187 # self.stepUpdateTime = 0.03 # step size for RL-method
188 # nSubSteps = 3
189 self.randomInitializationValue0 = self.randomInitializationValue
190
191 background = graphics.CheckerBoard(point= [0,0.5*self.length,-2.5*width],
192 normal= [0,0,1], size=16, size2=12, nTiles=20, nTiles2=12)
193
194 oGround=self.mbs.AddObject(ObjectGround(referencePosition= [0,0,0], #x-pos,y-pos,angle
195 visualization=VObjectGround(graphicsData= [background])))
196
197
198 #build kinematic tree with Robot class
199 L = self.length
200 w = width
201 gravity3D = [0.,-gravity,0]
202 graphicsBaseList = [graphics.Brick(size=[L*max(4,1+2*self.nLinks), 0.6*w, 0.6*w], color=graphics.color.grey)] #rail
203        graphicsListCart = [graphics.Brick(size=[L, 1.2*w, 0.8*w], color=graphics.color.orange)] #cart
204
205 #graphicsListTool = [graphics.Brick(size=[w, L, w], color=graphics.color.orange)]
206 graphicsListLink1 = [graphics.Brick(centerPoint=[0,0.5*L,0], size=[w, L, w],
207 color=graphics.color.blue),
208 graphics.Cylinder([0,0,-0.6*w],[0,0,1.2*w],radius=0.6*width,nTiles=32,
209 color=graphics.color.grey)]
210 color2 = graphics.color.red
211 graphicsListLink2 = [graphics.Brick(centerPoint=[0,0.5*L,0], size=[w, L, w],
212 color=color2),
213 graphics.Cylinder([0,0,-0.6*w],[0,0,1.2*w],radius=0.6*width,nTiles=32,
214 color=graphics.color.grey)]
215 graphicsListLinks=[graphicsListLink1,graphicsListLink2]
216
217 newRobot = Robot(gravity=gravity3D,
218 base = RobotBase(visualization=VRobotBase(graphicsData=graphicsBaseList)),
219 tool = RobotTool(HT=HTtranslate([0,0.5*L,0]),
220 #visualization=VRobotTool(graphicsData=graphicsListTool)
221 ),
222 referenceConfiguration = []) #referenceConfiguration created with 0s automatically
223
224 pControl = 140
225 dControl = 0.02*pControl
226 #cart:
227 Jlink = RigidBodyInertia(masscart, np.diag([0.1*masscart,0.1*masscart,0.1*masscart]), [0,0,0], inertiaTensorAtCOM=True)
228 link = RobotLink(Jlink.Mass(), Jlink.COM(), Jlink.InertiaCOM(),
229 jointType='Px', preHT=HT0(),
230 #PDcontrol=(pControl, dControl),
231 visualization=VRobotLink(linkColor=graphics.color.green,
232 showMBSjoint=False,
233 graphicsData = graphicsListCart))
234 newRobot.AddLink(link)
235
236
237
238 for i in range(self.nLinks):
239
240 Jlink = RigidBodyInertia(massarm, np.diag([armInertia,0.1*armInertia,armInertia]), [0,0.5*L,0],
241 inertiaTensorAtCOM=True) #only inertia_ZZ is important
242 #Jlink = Jlink.Translated([0,0.5*L,0])
243 preHT = HT0()
244 if i > 0:
245 preHT = HTtranslateY(L)
246
247 # if i==self.nLinks-1:
248 # pControl=0
249 # dControl=0
250
251 link = RobotLink(Jlink.Mass(), Jlink.COM(), Jlink.InertiaCOM(),
252 jointType='Rz', preHT=preHT,
253 #PDcontrol=(0*pControl, 0.01*dControl),
254 visualization=VRobotLink(linkColor=graphics.color.blue,
255 showMBSjoint=False,
256 graphicsData = graphicsListLinks[i%2]))
257 newRobot.AddLink(link)
258
259 self.Jlink = Jlink
260
261
262 sKT = []
263 dKT = newRobot.CreateKinematicTree(mbs)
264 self.oKT = dKT['objectKinematicTree']
265 self.nKT = dKT['nodeGeneric']
266
267 #print(mbs.GetObject(self.oKT))
268 # sKT+=[mbs.AddSensor(SensorKinematicTree(objectNumber=oKT, linkNumber=self.nTotalLinks-1,
269 # localPosition=locPos, storeInternal=True,
270 # outputVariableType=exu.OutputVariableType.Position))]
271
272 #control force
273 mCartCoordX = self.mbs.AddMarker(MarkerNodeCoordinate(nodeNumber=self.nKT, coordinate=0))
274 self.lControl = self.mbs.AddLoad(LoadCoordinate(markerNumber=mCartCoordX, load=0.))
275
276 #marker on tip
277 mTip = self.mbs.AddMarker(MarkerKinematicTreeRigid(objectNumber = self.oKT,
278 linkNumber = self.nLinks,
279 localPosition=[0,L,0]))
280 self.lTip = self.mbs.AddLoad(Force(markerNumber=mTip, loadVector=[0,0,0]))
281
282 #%%++++++++++++++++++++++++
283 self.mbs.Assemble() #computes initial vector
284
285 self.simulationSettings.timeIntegration.numberOfSteps = nSubSteps #this is the number of solver steps per RL-step
286 self.simulationSettings.timeIntegration.endTime = 0 #will be overwritten in step
287 self.simulationSettings.timeIntegration.verboseMode = 0
288 self.simulationSettings.solutionSettings.writeSolutionToFile = False #set True only for postprocessing
289 #self.simulationSettings.timeIntegration.simulateInRealtime = True
290 self.simulationSettings.solutionSettings.solutionInformation = 'inverted '+str(self.nLinks)+'-link pendulum'
291
292 self.simulationSettings.timeIntegration.newton.useModifiedNewton = True
293
294 self.SC.visualizationSettings.general.drawWorldBasis=True
295 self.SC.visualizationSettings.general.drawCoordinateSystem = False
296 self.SC.visualizationSettings.general.graphicsUpdateInterval = 0.01
297 self.SC.visualizationSettings.openGL.multiSampling=4
298 self.SC.visualizationSettings.bodies.kinematicTree.showCOMframes = False
299 SC.visualizationSettings.bodies.kinematicTree.showJointFrames = False
300 #+++++++++++++++++++++++++++++++++++++++++++++++++++++
301 # Angle at which to fail the episode
302 # these parameters are used in subfunctions
303 self.theta_threshold_radians = thresholdFactor* 18 * 2 * math.pi / 360
304 self.x_threshold = thresholdFactor*3.6
305
306 #must return state size
307 stateSize = (self.nTotalLinks)*2 #the number of states (position/velocity that are used by learning algorithm)
308
309 #to track mean reward:
310 self.rewardCnt = 0
311 self.rewardMean = 0
312
313 return stateSize
314
315 #**classFunction: OVERRIDE this function to set up self.action_space and self.observation_space
316 def SetupSpaces(self):
317
318        #observation space is 2 times larger than the range at which the episode is done
319 high = np.array(
320 [
321 self.x_threshold * 2,
322 ] +
323 [
324 self.theta_threshold_radians * 2,
325 ] * self.nLinks +
326 [
327 np.finfo(np.float32).max,
328 ] * self.nTotalLinks
329 ,
330 dtype=np.float32,
331 )
332
333
334 #+++++++++++++++++++++++++++++++++++++++++++++++++++++
335 #see https://github.com/openai/gym/blob/64b4b31d8245f6972b3d37270faf69b74908a67d/gym/core.py#L16
336 #for Env:
337 if flagContinuous:
338 #action is -1 ... +1, then scaled with self.force_mag
339 self.action_space = spaces.Box(low=np.array([-1 ], dtype=np.float32),
340 high=np.array([1], dtype=np.float32), dtype=np.float32)
341 else:
342 self.action_space = spaces.Discrete(2)
343
344 self.observation_space = spaces.Box(-high, high, dtype=np.float32)
345 #+++++++++++++++++++++++++++++++++++++++++++++++++++++
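    # Sketch of the resulting spaces for nLinks=3 (names chosen for illustration only):
    #   observation = [x_cart, phi1, phi2, phi3, x_cart_t, phi1_t, phi2_t, phi3_t]
    #   bounds      = [2*x_threshold, 2*theta_threshold (x3), +/-inf (x4)]
    #   action      = Box([-1],[1]) scaled by force_mag (continuous), or Discrete(2) mapped to +/- force_mag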
346
347
348    #**classFunction: this function is overwritten to map the action given by the learning algorithm to the multibody system (environment)
349 def MapAction2MBS(self, action):
350 if flagContinuous:
351 force = action[0] * self.force_mag
352 else:
353 force = self.force_mag if action == 1 else -self.force_mag
354 self.mbs.SetLoadParameter(self.lControl, 'load', force)
355
356 time = self.simulationSettings.timeIntegration.endTime
357
358 self.mbs.SetLoadParameter(self.lTip, 'loadVector', [0,0,0])
359
360
361 # if time >= 4 and time < 4.04:
362 # tipForce = 0.05
363 # self.mbs.SetLoadParameter(self.lTip, 'loadVector', [tipForce,0,0])
364        if time > 4 and np.random.randint(25*4) < 1: #on average once every 100 action steps
365 tipForce = self.randomDisturbanceForce*(np.random.rand()*2-1) #+/-0.05 N or smaller
366 if self.testModel:
367 print('time=',time,', tip force=',tipForce)
368 self.mbs.SetLoadParameter(self.lTip, 'loadVector', [tipForce,0,0])
369
370 #self.mbs.SetLoadParameter(self.lTip, 'loadVector', [5,0,0])
371
372
373    #**classFunction: this function is overwritten to collect the output of the simulation and map it to the self.state tuple
374    #**output: returns bool done, which indicates whether the system state is outside the valid range
375 def Output2StateAndDone(self):
376
377 #+++++++++++++++++++++++++
378 statesVector = self.mbs.GetNodeOutput(self.nKT, variableType=exu.OutputVariableType.Coordinates)
379 statesVector_t = self.mbs.GetNodeOutput(self.nKT, variableType=exu.OutputVariableType.Coordinates_t)
380 self.state = tuple(list(statesVector) + list(statesVector_t)) #sorting different from previous implementation
381 cartPosX = statesVector[0]
382
383 done = bool(
384 cartPosX < -self.x_threshold
385 or cartPosX > self.x_threshold
386 or max(statesVector[1:self.nTotalLinks]) > self.theta_threshold_radians
387 or min(statesVector[1:self.nTotalLinks]) < -self.theta_threshold_radians
388 )
389 # print("cartPosX: ", cartPosX, ", done: ", done)
390 return done
391
392
393    #**classFunction: OVERRIDE this function to map the current state to mbs initial values
394 #**output: return [initialValues, initialValues\_t] where initialValues[\_t] are ODE2 vectors of coordinates[\_t] for the mbs
395 def State2InitialValues(self):
396 #+++++++++++++++++++++++++++++++++++++++++++++
397 initialValues = self.state[0:self.nTotalLinks]
398 initialValues_t = self.state[self.nTotalLinks:]
399
400 #set initial values into mbs immediately
401 self.mbs.systemData.SetODE2Coordinates(initialValues, exu.ConfigurationType.Initial)
402 self.mbs.systemData.SetODE2Coordinates_t(initialValues_t, exu.ConfigurationType.Initial)
403
404 #this function is only called at reset(); so, we can use it to reset the mean reward:
405 self.rewardCnt = 0
406 self.rewardMean = 0
407
408 return [initialValues,initialValues_t]
409
410 #**classFunction: this is the objective which the RL method tries to maximize (average expected reward)
411 def getReward(self):
412 #reward = 1 - 0.5 * abs(self.state[0])/self.x_threshold - 0.5 * abs(self.state[1]) / self.theta_threshold_radians
413
414 #avoid larger motion of base
415 reward = 1 - 0.25 * (abs(self.state[0])+(0.5*self.state[0])**2)/self.x_threshold
416
417 #more emphasis on cart for low number of links (for more links, this is less prioritized)
418 if self.nLinks == 1:
419 reward -= 2 * (abs(self.state[0]))/self.x_threshold
420
421
422 #add a penalty for each link deviation
423 for i in range(self.nLinks):
424 reward -= 0.5 * abs(self.state[i+1]) / (self.theta_threshold_radians*self.nLinks)
425
426 #more emphasis on last link:
427 if self.nLinks >= 3:
428 reward -= 0.5 * abs(self.state[self.nTotalLinks-1]) / (self.theta_threshold_radians)
429
430 #penalize velocities of top link with higher number of links
431 if self.nLinks >= 4:
432 reward -= 0.25 * abs(self.state[-1])
433
434
435 if reward < 0: reward = 0
436
437 return reward
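
    # Worked example (sketch): in the perfectly upright resting state (all coordinates and
    # velocities zero) every penalty term vanishes and the reward is 1; as the cart position
    # or any link angle approaches its threshold, the penalties grow and the reward is
    # clipped at 0, so the per-step reward always lies in [0, 1].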
438
439 # #after some time, we could reset state similar to a disturbance
440 # def ResetState(self):
441 # if not self.testModel:
442 # randSize = (self.stateSize)
443
444 # self.state = np.random.uniform(low=-self.randomInitializationValue,
445 # high=self.randomInitializationValue, size=randSize)
446 # initialValues = self.state[0:self.nTotalLinks]
447 # initialValues_t = self.state[self.nTotalLinks:]
448
449 # #set initial values into mbs immediately
450 # self.mbs.systemData.SetODE2Coordinates(initialValues, exu.ConfigurationType.Initial)
451 # self.mbs.systemData.SetODE2Coordinates_t(initialValues_t, exu.ConfigurationType.Initial)
452 # self.mbs.systemData.SetODE2Coordinates(initialValues, exu.ConfigurationType.Current)
453 # self.mbs.systemData.SetODE2Coordinates_t(initialValues_t, exu.ConfigurationType.Current)
454
455
456 #**classFunction: openAI gym interface function which is called to compute one step
457 def step(self, action):
458 err_msg = f"{action!r} ({type(action)}) invalid"
459 assert self.action_space.contains(action), err_msg
460 assert self.state is not None, "Call reset before using step method."
461
462 #++++++++++++++++++++++++++++++++++++++++++++++++++
463 #main steps:
464
465 self.MapAction2MBS(action)
466
467 #this may be time consuming for larger models!
468 self.IntegrateStep()
469
470 done = self.Output2StateAndDone()
471 # print('state:', self.state, 'done: ', done)
472 #++++++++++++++++++++++++++++++++++++++++++++++++++
473 #compute reward and done
474
475 if not done:
476 reward = self.getReward()
477 elif self.steps_beyond_done is None:
478 # system just fell down
479 self.steps_beyond_done = 0
480 reward = self.getReward()
481 else:
482
483 if self.steps_beyond_done == 0:
484 logger.warn(
485 "You are calling 'step()' even though this "
486 "environment has already returned done = True. You "
487 "should always call 'reset()' once you receive 'done = "
488 "True' -- any further steps are undefined behavior."
489 )
490 self.steps_beyond_done += 1
491 reward = 0.0
492
493 self.rewardCnt += 1
494 self.rewardMean += reward
495
496 info = {'reward': reward} #put reward into info for logger
497
498 #for many links: start with low value and gradually increase randomInitializationValue
499 if self.nLinks >= 6:
500 iFact = self.randomInitializationValue / self.randomInitializationValue0
501 iFactDesired = max(iFact,min(self.rewardCnt/200+0.5,4))
502 self.randomInitializationValue = self.randomInitializationValue0 * iFactDesired
503
504 #compute mean values per episode:
505        if self.rewardCnt != 0: #per episode
506 info['rewardMean'] = self.rewardMean / self.rewardCnt
507 info['rewardSum'] = self.rewardMean
508 info['episodeLen'] = self.rewardCnt
509
510        terminated, truncated = done, False #stable-baselines3 > 1.8.0 uses terminated and truncated (gymnasium API)
511 if self.simulationSettings.timeIntegration.endTime > self.maxTimeTruncate: truncated = True
512
513 if useOldGym:
514 return np.array(self.state, dtype=np.float32), reward, terminated, info
515 else:
516 return np.array(self.state, dtype=np.float32), reward, terminated, truncated, info
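
    # Manual rollout sketch (assumes reset() is provided by the OpenAIGymInterfaceEnv base class;
    # shown only to illustrate the two API variants handled above):
    #   env = InvertedNPendulumEnv()
    #   resetResult = env.reset()                # observation (old Gym) or (observation, info) (Gymnasium)
    #   for _ in range(100):
    #       action = env.action_space.sample()   # random action, for illustration only
    #       stepResult = env.step(action)        # 4-tuple (old Gym) or 5-tuple (Gymnasium)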
517
518
519
520
521
522
523#%%+++++++++++++++++++++++++++++++++++++++++++++
524if __name__ == '__main__': #this is only executed when the file is directly called in Python
525 import time
526
527 #%%++++++++++++++++++++++++++++++++++++++++++++++++++
528 #use some learning algorithm:
529 #pip install stable_baselines3
530 from stable_baselines3 import A2C, SAC
531
532 tensorboard_log = None #no logging
533 rewardCallback = None
534 verbose = 0 #turn off to just view in tensorboard
535    if hasTensorboard: #only use if tensorboard is available
536 tensorboard_log = "solution/tensorboard_log/" #dir
537 rewardCallback = RewardLoggingCallback(verbose=1, bestModelName=best_model_text+str(nLinks))
538 else:
539        verbose = 0 #set to 1 to see training output without tensorboard
540
541    # here the model is created (for either a vectorized or scalar environment) using SAC or A2C.
542 def getModel(flagContinuous, myEnv, modelType='SAC', learning_starts = 100,
543 learning_rate=8e-4, batch_size=128, use_sde = True, tau = 0.005, gamma = 0.99):
544
545 if flagContinuous :
546 if modelType=='SAC':
547 model = SAC('MlpPolicy',
548 env = myEnv,
549 learning_rate = learning_rate, #8e-4; default: 3e-4
550 device = 'cpu', #usually cpu is faster for this size of networks
551 batch_size = batch_size, #128; default: 256
552 use_sde = use_sde, #False; default: False
553 learning_starts = learning_starts, #default: 100
554 tensorboard_log = tensorboard_log,
555 verbose = verbose)
556 elif modelType == 'A2C':
557 model = A2C('MlpPolicy',
558 myEnv,
559 device='cpu',
560 tensorboard_log=tensorboard_log,
561 verbose=verbose)
562 else:
563 print('Please specify the modelType.')
564 raise ValueError
565 else:
566 model = A2C('MlpPolicy',
567 myEnv,
568 learning_rate=0.0007*0.5,
569 device='cpu', #usually cpu is faster for this size of networks
570 #device='cuda', #optimal with 64 SubprocVecEnv, torch.set_num_threads(1)
571 tensorboard_log=tensorboard_log,
572 verbose=verbose)
573 return model
574
575
576 modelType = 'SAC' #use A2C or SAC
577 #create model and do reinforcement learning
578 modelName = 'openAIgym{}Link{}'.format(nLinks, 'Continuous'*flagContinuous)
579    if True: #set to False if the model shall only be evaluated
580 if False: #'scalar' environment, slower:
581 env = InvertedNPendulumEnv()
582
583 #first, check if model runs:
584 if False:
585 env.TestModel(numberOfSteps=2000, seed=42, sleepTime=0.02, useRenderer=True)
586 sys.exit()
587
588 model = getModel(flagContinuous, env, modelType=modelType)
589
590 ts = -time.time()
591 model.learn(total_timesteps=int(250e0), #min 250k steps required to start having success to stabilize double pendulum
592 # progress_bar=True, #requires tqdm and rich package; set True to only see progress and set log_interval very high
593 progress_bar=True,
594 log_interval=100, #logs per episode; influences local output and tensorboard
595 callback = rewardCallback,
596 )
597
598 print('*** learning time total =',ts+time.time(),'***')
599
600 #save learned model
601
602 model.save("solution/" + modelName)
603 else: #parallel; faster #set verbose=0 in getModel()!
604 import torch #stable-baselines3 is based on pytorch
605 n_cores= 2*max(1,int(os.cpu_count()/2)) #n_cores should be number of real cores (not threads)
606 n_cores = 24 #override if you like to manually select
607 torch.set_num_threads(n_cores) #seems to be ideal to match the size of subprocVecEnv
608 print('using',n_cores,'cores')
609
610 modelCnt = ''
611
612 #perform some variations, to check if performance increases (or just repeat case)
613 variations = [
614 {},
615 #{'batch_size':256},
616 #{'tau':0.01},
617 #{},
618 ]
619
620 i = -1
621 print('performing '+str(len(variations))+' variations')
622 for d in variations:
623 i += 1
624 modelCnt = '_Trial'+str(i)
625 rewardCallback.save_path = rewardCallback.save_path.replace('_Trial'+str(i-1),'')+modelCnt
626 rewardCallback.bestRewardSum = 0
627 rewardCallback.bestRewardSumPrint = 0
628
629 from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
630 vecEnv = SubprocVecEnv([InvertedNPendulumEnv for i in range(n_cores)])
631
632
633            #main learning task; with 20 cores, 800 000 steps take approximately
634            # 18 minutes in the continuous case (SAC); the discrete case (A2C) takes about 2 minutes.
635 model = getModel(flagContinuous, vecEnv, modelType=modelType, **d)
636
637 ts = -time.time()
638 print('start learning of agent with {}'.format(str(model.policy).split('(')[0])+', using options: '+str(d))
639 print('name='+modelName+modelCnt)
640
641            #max number of steps; NOTE: training does not always converge, run several cases!
642 nSteps = 400_000
643 log_interval=100
644 if nLinks == 2: nSteps = 800_000
645 if nLinks == 3: nSteps = 1_000_000
646 if nLinks == 4: nSteps = 5_000_000
647 if nLinks == 5: nSteps = 40_000_000
648
649 if nSteps <= 1000000: log_interval=25
650
651 model.learn(total_timesteps=int(nSteps), #A2C starts working above 250k; SAC similar
652 progress_bar=True, #requires tqdm and rich package; set True to only see progress and set log_interval very high (100_000_000)
653 log_interval=log_interval, #logs per episode; influences local output and tensorboard
654 callback = rewardCallback,
655 )
656 print('*** learning time total =',ts+time.time(),'***')
657
658 #save learned model
659 model.save("solution/" + modelName+modelCnt)
660
661 if True:
662 #%%++++++++++++++++++++++++++++++++++++++++++++++++++
663 #only load and test
664 if True:
665 #here you have to select the name of the model you like to test:
666 if flagContinuous and modelType == 'A2C':
667 model = A2C.load('solution/tensorboard_log/invpend_best_model'+str(nLinks)+'_Trial0')
668 else:
669 model = SAC.load('solution/tensorboard_log/invpend_best_model'+str(nLinks)+'_Trial0')
670
671
672 env = InvertedNPendulumEnv(thresholdFactor=5) #larger threshold for testing
673 solutionFile='solution/learningCoordinates.txt'
674
675
676 #reduce disturbances for evaluation (for 1-link, you may also increase force by 10)
677 # env.randomInitializationValue *= 0.5
678 # env.randomDisturbanceForce *= 0.5
679 if nLinks == 4:
680 env.randomInitializationValue = 0.02 #smaller for evaluation
681 if nLinks == 5:
682 env.randomInitializationValue = 0.01 #smaller for evaluation
683 env.randomDisturbanceForce = 0.01 #smaller for evaluation
684
685 #env.stepUpdateTime = 0.01
686 env.testModel = True #to avoid reset
687 print('rand init=',env.randomInitializationValue)
688
689 env.TestModel(numberOfSteps=500*1, model=model, solutionFileName=solutionFile,
690 stopIfDone=False, useRenderer=False, sleepTime=0) #just compute solution file
691 #++++++++++++++++++++++++++++++++++++++++++++++
692 #visualize (and make animations) in exudyn:
693 from exudyn.interactive import SolutionViewer
694 env.SC.visualizationSettings.general.autoFitScene = False
695 env.SC.visualizationSettings.window.renderWindowSize = [1600,1200]
696 env.SC.visualizationSettings.openGL.shadow = 0.25
697 env.SC.visualizationSettings.openGL.light0position = [3,8,4,0]
698
699 solution = LoadSolutionFile(solutionFile)
700
701 SolutionViewer(env.mbs, solution, timeout=0.005, rowIncrement=5) #loads solution file via name stored in mbs