  * 45% Proper design, creation, and debugging of actor and critic networks
  * 25% Proper implementation of the PPO loss function and objective on cart-pole ("CartPole-v0") (a minimal sketch of this objective appears just after this list)
  * 20% Implementation and demonstrated learning of PPO on another domain of your choice (**except** VizDoom)
  * 10% Visualization of policy return as a function of training
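
As a point of reference for the PPO loss item above, the clipped surrogate objective looks roughly like the following. This is only a minimal sketch, **not** our reference implementation: the function name ''ppo_clip_loss'' and its arguments are illustrative, and it assumes you already have the new and old action log-probabilities and the advantage estimates as tensors.

<code python>
import torch

def ppo_clip_loss(new_log_probs, old_log_probs, advantages, epsilon=0.2):
    # Probability ratio r_t = pi_new(a|s) / pi_old(a|s), computed in log space
    ratio = torch.exp(new_log_probs - old_log_probs)
    # Take the pessimistic (minimum) of the unclipped and clipped surrogate
    # terms, then negate because optimizers minimize
    surrogate = torch.min(ratio * advantages,
                          torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantages)
    return -surrogate.mean()
</code>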
----
====Hints and helps:====
| + | |||
| + | **Update**: Here is our | ||
| + | [[https://github.com/joshgreaves/reinforcement-learning|our lab's implementation of PPO]]. NOTE: because this code comes with a complete implementation of running on VizDoom, **you may not use that as your additional test domain.** | ||
| + | |||
| + | Here are some [[https://stackoverflow.com/questions/50667565/how-to-install-vizdoom-using-google-colab|instructions for installing vizdoom on colab]]. | ||
| + | |||
| + | |||
| + | ---- | ||
| + | |||
| + | Here is some code from our reference implementation. Hopefully it will serve as a good outline of what you need to do. | ||
<code python>
....
# (the elided code above provides the imports and the PolicyNetwork / ValueNetwork classes used below)

class AdvantageDataset(Dataset):
    def __init__(self, experience):
        super(AdvantageDataset, self).__init__()
        self._exp = experience
        self._num_runs = len(experience)
        # total number of transitions across all of the collected rollouts
        self._length = reduce(lambda acc, x: acc + len(x), experience, 0)

    def __getitem__(self, index):
        # walk through the rollouts until we reach the one containing this index
        idx = 0
        seen_data = 0
        current_exp = self._exp[0]
        while seen_data + len(current_exp) - 1 < index:
            seen_data += len(current_exp)
            idx += 1
            current_exp = self._exp[idx]
        chosen_exp = current_exp[index - seen_data]
        # elements 0 and 4 of the stored experience tuple (the state and its advantage)
        return chosen_exp[0], chosen_exp[4]

    def __len__(self):
        return self._length


class PolicyDataset(Dataset):
    def __init__(self, experience):
        super(PolicyDataset, self).__init__()
        self._exp = experience
        self._num_runs = len(experience)
        # total number of transitions across all of the collected rollouts
        self._length = reduce(lambda acc, x: acc + len(x), experience, 0)

    def __getitem__(self, index):
        # same indexing scheme, but return the full experience tuple
        idx = 0
        seen_data = 0
        current_exp = self._exp[0]
        while seen_data + len(current_exp) - 1 < index:
            seen_data += len(current_exp)
            idx += 1
            current_exp = self._exp[idx]
        chosen_exp = current_exp[index - seen_data]
        return chosen_exp

    def __len__(self):
        return self._length


def main():
    env = gym.make('CartPole-v0')
    policy = PolicyNetwork(4, 2)   # CartPole-v0: 4 observation dimensions, 2 discrete actions
    value = ValueNetwork(4)

    policy_optim = optim.Adam(policy.parameters(), lr=1e-2, weight_decay=0.01)
    value_optim = optim.Adam(value.parameters(), lr=1e-3, weight_decay=1)

    # ... more stuff here...
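    # ------------------------------------------------------------------
    # The rest of the reference implementation is omitted here. As a very
    # rough, hypothetical sketch (NOT the reference code; 'rollouts' and the
    # batch size are illustrative), the datasets above would typically be
    # consumed with standard PyTorch DataLoaders inside the training loop:
    #
    #     advantage_loader = DataLoader(AdvantageDataset(rollouts), batch_size=256, shuffle=True)
    #     policy_loader = DataLoader(PolicyDataset(rollouts), batch_size=256, shuffle=True)
    #
    # with one inner loop fitting the value network on batches from the
    # advantage loader, and another updating the policy using the clipped
    # PPO objective sketched earlier on this page.
    # ------------------------------------------------------------------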
    # Hyperparameters