Citation

BibTeX format

@inproceedings{Filippi:2010:10.1109/ALLERTON.2010.5706896,
author = {Filippi, S and Cappe, O and Garivier, A},
doi = {10.1109/ALLERTON.2010.5706896},
title = {Optimism in Reinforcement Learning and Kullback-Leibler Divergence},
url = {http://dx.doi.org/10.1109/ALLERTON.2010.5706896},
year = {2010}
}

RIS format (EndNote, RefMan)

TY  - CPAPER
AB  - We consider model-based reinforcement learning in finite Markov Decision Processes (MDPs), focussing on so-called optimistic strategies. In MDPs, optimism can be implemented by carrying out extended value iterations under a constraint of consistency with the estimated model transition probabilities. The UCRL2 algorithm by Auer, Jaksch and Ortner (2009), which follows this strategy, has recently been shown to guarantee near-optimal regret bounds. In this paper, we strongly argue in favor of using the Kullback-Leibler (KL) divergence for this purpose. By studying the linear maximization problem under KL constraints, we provide an efficient algorithm, termed KL-UCRL, for solving KL-optimistic extended value iteration. Using recent deviation bounds on the KL divergence, we prove that KL-UCRL provides the same guarantees as UCRL2 in terms of regret. However, numerical experiments on classical benchmarks show a significantly improved behavior, particularly when the MDP has reduced connectivity. To support this observation, we provide elements of comparison between the two algorithms based on geometric considerations.
AU  - Filippi, S
AU  - Cappe, O
AU  - Garivier, A
DO  - 10.1109/ALLERTON.2010.5706896
PY  - 2010///
TI  - Optimism in Reinforcement Learning and Kullback-Leibler Divergence
UR  - http://dx.doi.org/10.1109/ALLERTON.2010.5706896
UR  - http://arxiv.org/abs/1004.5229v3
ER  -
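
Illustrative sketch of the KL-optimistic step

The abstract's inner step, the "linear maximization problem under KL constraints", asks for the most optimistic transition distribution q that maximizes the expected value q . v while staying within a KL ball around the empirical estimate. The Python snippet below is only a minimal sketch under assumed names (p_hat, v, eps), and it uses a generic constrained solver rather than the efficient Newton-style procedure the paper derives; it is not the authors' implementation.

# Minimal sketch (not the authors' code) of the KL-constrained optimistic step:
# given an estimated transition vector p_hat over next states, a value vector v,
# and a confidence radius eps, find the distribution q maximizing q . v subject
# to KL(p_hat || q) <= eps. Names p_hat, v, eps are illustrative assumptions.
import numpy as np
from scipy.optimize import minimize

def kl_optimistic_q(p_hat, v, eps):
    """Maximize q @ v over the simplex subject to KL(p_hat || q) <= eps."""
    p_hat = np.asarray(p_hat, dtype=float)
    v = np.asarray(v, dtype=float)
    n = len(p_hat)

    def neg_value(q):
        # Negated objective because scipy minimizes.
        return -np.dot(q, v)

    def kl_slack(q):
        # eps - KL(p_hat || q); terms with p_hat == 0 contribute nothing.
        mask = p_hat > 0
        return eps - np.sum(p_hat[mask] * np.log(p_hat[mask] / q[mask]))

    constraints = [
        {"type": "eq", "fun": lambda q: np.sum(q) - 1.0},   # q is a distribution
        {"type": "ineq", "fun": kl_slack},                  # KL constraint holds
    ]
    bounds = [(1e-12, 1.0)] * n                             # keep q strictly positive
    res = minimize(neg_value, p_hat.copy(), bounds=bounds,
                   constraints=constraints, method="SLSQP")
    return res.x

if __name__ == "__main__":
    p_hat = [0.5, 0.3, 0.2]          # empirical transition estimate
    v = [1.0, 0.0, 2.0]              # value estimates of next states
    q = kl_optimistic_q(p_hat, v, eps=0.05)
    print(q, q @ np.asarray(v))      # optimistic q shifts mass toward high-value states

In the optimistic extended value iteration described in the abstract, a step of this form would be solved for each state-action pair, with the resulting q used in place of the plain empirical estimate when backing up values.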