openAIgymNLinkContinuous.py
You can view and download this file on GitHub: openAIgymNLinkContinuous.py
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# This is an EXUDYN example
#
# Details:  This file shows integration with OpenAI gym by testing an inverted n-link pendulum example
#
# Author:   Johannes Gerstmayr
# edited by: Peter Manzl
# Date:      2022-05-25
#
# Update:   2024-06-03: now moved to stable-baselines3 version 2.3.2; updated for use with tensorboard
# Notes:    requires pip install stable-baselines3[extra]
#
# Copyright: This file is part of Exudyn. Exudyn is free software. You can redistribute it and/or modify it under the terms of the Exudyn license. See 'LICENSE.txt' for more details.
#
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

# import sys
# sys.exudynFast = True #this variable is used to signal to load the fast exudyn module

import exudyn as exu
# from exudyn.utilities import * #includes itemInterface and rigidBodyUtilities
from exudyn.utilities import ObjectGround, VObjectGround, RigidBodyInertia, HTtranslate, HTtranslateY, HT0,\
    MarkerNodeCoordinate, LoadCoordinate, LoadSolutionFile
import exudyn.graphics as graphics #only import if it does not conflict
from exudyn.robotics import Robot, RobotLink, VRobotLink, RobotBase, VRobotBase, RobotTool, VRobotTool
from exudyn.artificialIntelligence import OpenAIGymInterfaceEnv, spaces, logger
#from exudyn.artificialIntelligence import *
import math
import numpy as np

import os
import sys
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

import stable_baselines3
useOldGym = tuple(map(int, stable_baselines3.__version__.split('.'))) <= tuple(map(int, '1.8.0'.split('.')))
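#note (added comment): useOldGym switches between the classic gym 4-tuple and the gymnasium-style
#5-tuple return value of step(); see the step() method of InvertedNPendulumEnv below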

##%% here the number of links can be changed. Note that for n < 3 the actuator
# force might need to be increased
nLinks = 2 #number of inverted links ...
flagContinuous = True #if True, a continuous action space is used (SAC or A2C agent); if False, discrete actions with A2C

#%%++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#if you like to visualize the training progress in tensorboard:
#  pip install tensorboard
#  open anaconda prompt (where your corresponding python/environment runs)
#  go to directory:
#    solution/
#  call this in anaconda prompt:
#    tensorboard --logdir=./tensorboard_log/
#  open web browser to show progress (reward, loss, ...):
#    http://localhost:6006/


#add logger for reward:
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.logger import Logger

hasTensorboard = False
try:
    import tensorboard
    hasTensorboard = True
    print('output written to tensorboard, start tensorboard to see progress!')
except:
    pass

#derive class from BaseCallback to log additional information
#this is needed for vectorized environments, where reward is not logged automatically
class RewardLoggingCallback(BaseCallback):
    def __init__(self, verbose=0):
        super(RewardLoggingCallback, self).__init__(verbose)

    #log mean values at rollout end
    def _on_rollout_end(self) -> bool:
        if 'infos' in self.locals:
            info = self.locals['infos'][-1]
            # print('infos:', info)
            if 'rewardMean' in info:
                self.logger.record("rollout/rewardMean", info['rewardMean'])
            if 'episodeLen' in info:
                self.logger.record("rollout/episodeLen", info['episodeLen'])
            if 'rewardSum' in info:
                self.logger.record("rollout/rewardSum", info['rewardSum'])

    #log (possibly) every step
    def _on_step(self) -> bool:
        #extract local variables to find reward
        if 'infos' in self.locals:
            info = self.locals['infos'][-1]

            if 'reward' in info:
                self.logger.record("train/reward", info['reward'])
            #for SAC / A2C in non-vectorized envs, per episode:
            if 'episode' in info and 'r' in info['episode']:
                self.logger.record("episode/reward", info['episode']['r'])
        return True
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
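#minimal usage sketch (illustrative; the __main__ section below does exactly this): an instance of
#RewardLoggingCallback is passed to model.learn(..., callback=RewardLoggingCallback()), so the
#reward information that step() puts into info appears in the tensorboard log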


class InvertedNPendulumEnv(OpenAIGymInterfaceEnv):

    #**classFunction: OVERRIDE this function to create multibody system mbs and setup simulationSettings; call Assemble() at the end!
    #                 you may also change SC.visualizationSettings() individually; kwargs may be used for special setup
    def CreateMBS(self, SC, mbs, simulationSettings, **kwargs):

        #%%++++++++++++++++++++++++++++++++++++++++++++++
        #this model uses kwargs: thresholdFactor

        global nLinks
        global flagContinuous
        self.nLinks = nLinks
        self.flagContinuous = flagContinuous
        self.nTotalLinks = nLinks+1
        # self.steps_beyond_done = False
        thresholdFactor = 1

        gravity = 9.81
        self.length = 1.
        width = 0.1*self.length
        masscart = 1.
        massarm = 0.1
        total_mass = massarm + masscart
        armInertia = self.length**2*0.5*massarm #for a rod with equally distributed mass it would correctly read self.length**2*massarm/3; the values of previous research are used here

        # environment variables and force magnitudes are taken from the
        # paper "Reliability evaluation of reinforcement learning methods for
        # mechanical systems with increasing complexity", Manzl et al.

        self.force_mag = 40 # 10 #10*2 works for 7e7 steps; must be larger for triple pendulum to be more reactive ...
        if self.nLinks == 1:
            self.force_mag = 12
        if self.nLinks == 3:
            self.force_mag = self.force_mag*1.5
            thresholdFactor = 2
        if self.nLinks >= 4:
            self.force_mag = self.force_mag*2.5
            thresholdFactor = 2.5

        if self.flagContinuous:
            self.force_mag *= 2 #continuous controller can have larger max value

        if 'thresholdFactor' in kwargs:
            thresholdFactor = kwargs['thresholdFactor']

        self.stepUpdateTime = 0.02 # step size for RL-method

        background = graphics.CheckerBoard(point= [0,0.5*self.length,-0.5*width],
                                           normal= [0,0,1], size=10, size2=6, nTiles=20, nTiles2=12)

        oGround=self.mbs.AddObject(ObjectGround(referencePosition= [0,0,0], #x-pos,y-pos,angle
                                                visualization=VObjectGround(graphicsData= [background])))


        #build kinematic tree with Robot class
        L = self.length
        w = width
        gravity3D = [0.,-gravity,0]
        graphicsBaseList = [graphics.Brick(size=[L*4, 0.8*w, 0.8*w], color=graphics.color.grey)] #rail

        newRobot = Robot(gravity=gravity3D,
                         base = RobotBase(visualization=VRobotBase(graphicsData=graphicsBaseList)),
                         tool = RobotTool(HT=HTtranslate([0,0.5*L,0]), visualization=VRobotTool(graphicsData=[
                             graphics.Brick(size=[w, L, w], color=graphics.color.orange)])),
                         referenceConfiguration = []) #referenceConfiguration created with 0s automatically

        #cart:
        Jlink = RigidBodyInertia(masscart, np.diag([0.1*masscart,0.1*masscart,0.1*masscart]), [0,0,0])
        link = RobotLink(Jlink.Mass(), Jlink.COM(), Jlink.InertiaCOM(),
                         jointType='Px', preHT=HT0(),
                         # PDcontrol=(pControl, dControl),
                         visualization=VRobotLink(linkColor=graphics.color.lawngreen))
        newRobot.AddLink(link)



        for i in range(self.nLinks):

            Jlink = RigidBodyInertia(massarm, np.diag([armInertia,0.1*armInertia,armInertia]), [0,0.5*L,0]) #only inertia_ZZ is important
            #Jlink = Jlink.Translated([0,0.5*L,0])
            preHT = HT0()
            if i > 0:
                preHT = HTtranslateY(L)

            link = RobotLink(Jlink.Mass(), Jlink.COM(), Jlink.InertiaCOM(),
                             jointType='Rz', preHT=preHT,
                             #PDcontrol=(pControl, dControl),
                             visualization=VRobotLink(linkColor=graphics.color.blue))
            newRobot.AddLink(link)

        self.Jlink = Jlink


        sKT = []
        dKT = newRobot.CreateKinematicTree(mbs)
        self.oKT = dKT['objectKinematicTree']
        self.nKT = dKT['nodeGeneric']

        # sKT+=[mbs.AddSensor(SensorKinematicTree(objectNumber=oKT, linkNumber=self.nTotalLinks-1,
        #                                         localPosition=locPos, storeInternal=True,
        #                                         outputVariableType=exu.OutputVariableType.Position))]

        #control force
        mCartCoordX = self.mbs.AddMarker(MarkerNodeCoordinate(nodeNumber=self.nKT, coordinate=0))
        self.lControl = self.mbs.AddLoad(LoadCoordinate(markerNumber=mCartCoordX, load=0.))

        #%%++++++++++++++++++++++++
        self.mbs.Assemble() #computes initial vector

        self.simulationSettings.timeIntegration.numberOfSteps = 1 #this is the number of solver steps per RL-step
        self.simulationSettings.timeIntegration.endTime = 0 #will be overwritten in step
        self.simulationSettings.timeIntegration.verboseMode = 0
        self.simulationSettings.solutionSettings.writeSolutionToFile = False #set True only for postprocessing
        #self.simulationSettings.timeIntegration.simulateInRealtime = True

        self.simulationSettings.timeIntegration.newton.useModifiedNewton = True

        self.SC.visualizationSettings.general.drawWorldBasis=True
        self.SC.visualizationSettings.general.graphicsUpdateInterval = 0.01
        self.SC.visualizationSettings.openGL.multiSampling=4

        #+++++++++++++++++++++++++++++++++++++++++++++++++++++
        # Angle at which to fail the episode
        # these parameters are used in subfunctions
        self.theta_threshold_radians = thresholdFactor* 18 * 2 * math.pi / 360
        self.x_threshold = thresholdFactor*3.6

        #must return state size
        stateSize = (self.nTotalLinks)*2 #the number of states (positions/velocities) that are used by the learning algorithm

        #to track mean reward:
        self.rewardCnt = 0
        self.rewardMean = 0

        return stateSize

    #**classFunction: OVERRIDE this function to set up self.action_space and self.observation_space
    def SetupSpaces(self):

        #space is 2 times larger than the space at which we get done
        high = np.array(
            [
                self.x_threshold * 2,
            ] +
            [
                self.theta_threshold_radians * 2,
            ] * self.nLinks +
            [
                np.finfo(np.float32).max,
            ] * self.nTotalLinks
            ,
            dtype=np.float32,
        )


        #+++++++++++++++++++++++++++++++++++++++++++++++++++++
        #see https://github.com/openai/gym/blob/64b4b31d8245f6972b3d37270faf69b74908a67d/gym/core.py#L16
        #for Env:
        if flagContinuous:
            #action is -1 ... +1, then scaled with self.force_mag
            self.action_space = spaces.Box(low=np.array([-1], dtype=np.float32),
                                           high=np.array([1], dtype=np.float32), dtype=np.float32)
        else:
            self.action_space = spaces.Discrete(2)

        self.observation_space = spaces.Box(-high, high, dtype=np.float32)
        #+++++++++++++++++++++++++++++++++++++++++++++++++++++
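        #illustrative example (added comment), assuming nLinks=2 and thresholdFactor=1:
        #the observation follows the ordering produced in Output2StateAndDone(),
        #  [x, theta1, theta2, x_t, theta1_t, theta2_t]
        #with bounds high = [2*3.6, 2*0.1*pi, 2*0.1*pi, inf, inf, inf];
        #the continuous action is a single value in [-1,1], scaled by force_mag in MapAction2MBS()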


    #**classFunction: this function is overwritten to map the action given by the learning algorithm to the multibody system (environment)
    def MapAction2MBS(self, action):
        if flagContinuous:
            force = action[0] * self.force_mag
        else:
            force = self.force_mag if action == 1 else -self.force_mag
        self.mbs.SetLoadParameter(self.lControl, 'load', force)
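        #example (added comment; assuming nLinks=2 and flagContinuous=True, i.e. force_mag=2*40=80):
        #  action=[0.5] applies a load of 0.5*80 = 40 on the cart coordinate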

    #**classFunction: this function is overwritten to collect the output of the simulation and map it to the self.state tuple
    #**output: return bool done which contains information if system state is outside valid range
    def Output2StateAndDone(self):

        #+++++++++++++++++++++++++
        statesVector = self.mbs.GetNodeOutput(self.nKT, variableType=exu.OutputVariableType.Coordinates)
        statesVector_t = self.mbs.GetNodeOutput(self.nKT, variableType=exu.OutputVariableType.Coordinates_t)
        self.state = tuple(list(statesVector) + list(statesVector_t)) #sorting different from previous implementation
        cartPosX = statesVector[0]

        done = bool(
            cartPosX < -self.x_threshold
            or cartPosX > self.x_threshold
            or max(statesVector[1:self.nTotalLinks]) > self.theta_threshold_radians
            or min(statesVector[1:self.nTotalLinks]) < -self.theta_threshold_radians
            )
        # print("cartPosX: ", cartPosX, ", done: ", done)
        return done


    #**classFunction: OVERRIDE this function to map the current state to mbs initial values
    #**output: return [initialValues, initialValues\_t] where initialValues[\_t] are ODE2 vectors of coordinates[\_t] for the mbs
    def State2InitialValues(self):
        #+++++++++++++++++++++++++++++++++++++++++++++
        initialValues = self.state[0:self.nTotalLinks]
        initialValues_t = self.state[self.nTotalLinks:]

        #set initial values into mbs immediately
        self.mbs.systemData.SetODE2Coordinates(initialValues, exu.ConfigurationType.Initial)
        self.mbs.systemData.SetODE2Coordinates_t(initialValues_t, exu.ConfigurationType.Initial)

        #this function is only called at reset(); so, we can use it to reset the mean reward:
        self.rewardCnt = 0
        self.rewardMean = 0

        return [initialValues,initialValues_t]

    #**classFunction: this is the objective which the RL method tries to maximize (average expected reward)
    def getReward(self):
        #reward = 1 - 0.5 * abs(self.state[0])/self.x_threshold - 0.5 * abs(self.state[1]) / self.theta_threshold_radians

        reward = 1 - 0.5 * abs(self.state[0])/self.x_threshold
        for i in range(self.nLinks):
            reward -= 0.5 * abs(self.state[i+1]) / (self.theta_threshold_radians*self.nLinks)

        if self.nLinks > 2: #larger weight on last link!
            reward -= 5. * 0.5 * abs(self.state[self.nTotalLinks-1]) / self.theta_threshold_radians

        if reward < 0: reward = 0

        return reward
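        #worked example (added comment; nLinks=2, thresholdFactor=1, so x_threshold=3.6, theta_threshold=0.1*pi~=0.314):
        #  state = (0.36, 0.0314, 0.0314, ...velocities...) gives
        #  reward = 1 - 0.5*0.36/3.6 - 0.5*0.0314/(0.314*2) - 0.5*0.0314/(0.314*2) = 1 - 0.05 - 0.025 - 0.025 = 0.90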

    #**classFunction: openAI gym interface function which is called to compute one step
    def step(self, action):
        err_msg = f"{action!r} ({type(action)}) invalid"
        assert self.action_space.contains(action), err_msg
        assert self.state is not None, "Call reset before using step method."

        #++++++++++++++++++++++++++++++++++++++++++++++++++
        #main steps:
        #initial values only set in reset function!
        # [initialValues,initialValues_t] = self.State2InitialValues()
        # self.mbs.systemData.SetODE2Coordinates(initialValues, exu.ConfigurationType.Initial)
        # self.mbs.systemData.SetODE2Coordinates_t(initialValues_t, exu.ConfigurationType.Initial)

        self.MapAction2MBS(action)

        #this may be time consuming for larger models!
        self.IntegrateStep()

        done = self.Output2StateAndDone()
        # print('state:', self.state, 'done: ', done)
        #++++++++++++++++++++++++++++++++++++++++++++++++++
        #compute reward and done

        if not done:
            reward = self.getReward()
        elif self.steps_beyond_done is None:
            # system just fell down
            self.steps_beyond_done = 0
            reward = self.getReward()
        else:

            if self.steps_beyond_done == 0:
                logger.warn(
                    "You are calling 'step()' even though this "
                    "environment has already returned done = True. You "
                    "should always call 'reset()' once you receive 'done = "
                    "True' -- any further steps are undefined behavior."
                )
            self.steps_beyond_done += 1
            reward = 0.0

        self.rewardCnt += 1
        self.rewardMean += reward

        info = {'reward': reward} #put reward into info for logger

        #compute mean values per episode:
        if self.rewardCnt != 0: #per episode
            info['rewardMean'] = self.rewardMean / self.rewardCnt
            info['rewardSum'] = self.rewardMean
            info['episodeLen'] = self.rewardCnt

        terminated, truncated = done, False #since stable-baselines3 > 1.8.0, the interface uses terminated and truncated
        if useOldGym:
            return np.array(self.state, dtype=np.float32), reward, terminated, info
        else:
            return np.array(self.state, dtype=np.float32), reward, terminated, truncated, info
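    #note (added comment): with the old gym interface (useOldGym True) step() returns
    #  observation, reward, done, info
    #while with the newer gymnasium-style interface it returns
    #  observation, reward, terminated, truncated, info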





#%%+++++++++++++++++++++++++++++++++++++++++++++
if __name__ == '__main__': #this is only executed when the file is called directly in Python
    import time

    #%%++++++++++++++++++++++++++++++++++++++++++++++++++
    #use some learning algorithm:
    #pip install stable_baselines3
    from stable_baselines3 import A2C, SAC

    tensorboard_log = None #no logging
    rewardCallback = None
    verbose = 0 #turn off to just view in tensorboard
    if hasTensorboard: #only use if tensorboard is available
        tensorboard_log = "solution/tensorboard_log/" #dir
        rewardCallback = RewardLoggingCallback()
    else:
        verbose = 1 #turn on without tensorboard

    #here the model is created (either for a vectorized or a scalar environment, using SAC or A2C)
    def getModel(flagContinuous, myEnv, modelType='SAC'):

        if flagContinuous:
            if modelType == 'SAC':
                model = SAC('MlpPolicy',
                            env=myEnv,
                            learning_rate=8e-4,
                            device='cpu', #usually cpu is faster for this size of networks
                            batch_size=128,
                            tensorboard_log=tensorboard_log,
                            verbose=verbose)
            elif modelType == 'A2C':
                model = A2C('MlpPolicy',
                            myEnv,
                            device='cpu',
                            tensorboard_log=tensorboard_log,
                            verbose=verbose)
            else:
                print('Please specify the modelType.')
                raise ValueError
        else:
            model = A2C('MlpPolicy',
                        myEnv,
                        device='cpu', #usually cpu is faster for this size of networks
                        #device='cuda', #optimal with 64 SubprocVecEnv, torch.set_num_threads(1)
                        tensorboard_log=tensorboard_log,
                        verbose=verbose)
        return model


    modelType = 'SAC' #use A2C or SAC
    #create model and do reinforcement learning
    modelName = 'openAIgym{}Link{}'.format(nLinks, 'Continuous'*flagContinuous)
    if True: #'scalar' environment, slower:
        env = InvertedNPendulumEnv()

        #first, check if model runs:
        if False:
            env.TestModel(numberOfSteps=2000, seed=42, sleepTime=0.02, useRenderer=True)
            sys.exit()

        model = getModel(flagContinuous, env, modelType=modelType)

        ts = -time.time()
        model.learn(total_timesteps=int(250e3), #min 250k steps required to start having success to stabilize double pendulum
                    # progress_bar=True, #requires tqdm and rich package; set True to only see progress and set log_interval very high
                    log_interval=10, #logs per episode; influences local output and tensorboard
                    callback = rewardCallback,
                    )

        print('*** learning time total =',ts+time.time(),'***')

        #save learned model

        model.save("solution/" + modelName)
    else: #parallel; faster #set verbose=0 in getModel()!
        import torch #stable-baselines3 is based on pytorch
        n_cores = max(1,int(os.cpu_count()/2)) #n_cores should be number of real cores (not threads)
        # n_cores = 2
        torch.set_num_threads(n_cores) #seems to be ideal to match the size of subprocVecEnv

        print('using',n_cores,'cores')

        from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
        vecEnv = SubprocVecEnv([InvertedNPendulumEnv for i in range(n_cores)])


        #main learning task; with 20 cores, 800 000 steps take in the continuous
        # case approximately 18 minutes (SAC); discrete (A2C) takes 2 minutes.
        model = getModel(flagContinuous, vecEnv, modelType=modelType)

        ts = -time.time()
        print('start learning of agent with {}'.format(str(model.policy).split('(')[0]))

        model.learn(total_timesteps=int(250e3), #A2C starts working above 250k; SAC similar
                    progress_bar=True, #requires tqdm and rich package; set True to only see progress and set log_interval very high (100_000_000)
                    log_interval=10, #logs per episode; influences local output and tensorboard
                    callback = rewardCallback,
                    )
        print('*** learning time total =',ts+time.time(),'***')

        #save learned model
        model.save("solution/" + modelName)

    if True:
        #%%++++++++++++++++++++++++++++++++++++++++++++++++++
        #only load and test
        if False:
            if flagContinuous and modelType == 'A2C':
                model = A2C.load("solution/" + modelName)
            else:
                model = SAC.load("solution/" + modelName)

        env = InvertedNPendulumEnv(thresholdFactor=5) #larger threshold for testing
        solutionFile = 'solution/learningCoordinates.txt'
        env.TestModel(numberOfSteps=1000, model=model, solutionFileName=solutionFile,
                      stopIfDone=False, useRenderer=False, sleepTime=0) #just compute solution file

        #++++++++++++++++++++++++++++++++++++++++++++++
        #visualize (and make animations) in exudyn:
        from exudyn.interactive import SolutionViewer
        env.SC.visualizationSettings.general.autoFitScene = False
        solution = LoadSolutionFile(solutionFile)

        SolutionViewer(env.mbs, solution, timeout=0.005, rowIncrement=2) #loads solution file via name stored in mbs