abstract: This note is a survey of various results on the capabilities of unique hard attention transformer encoders (UHATs) to recognize formal languages. We distinguish between masked vs. non-masked attention, finite vs. infinite image, and general vs. bilinear attention score functions. We recall some relations between these models, as well as a lower bound in terms of first-order logic and an upper bound in terms of circuit complexity.
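For readers unfamiliar with the terminology, the following minimal NumPy sketch (an illustration added here, not code from the paper) shows one unique hard attention head with a bilinear score function, and where the masked vs. non-masked distinction enters: each query position attends to exactly one key position, the one with the maximal score. Leftmost tie-breaking is assumed; other conventions exist.

import numpy as np

# Sketch of a single unique hard attention head (illustration only).
# Assumptions: bilinear score s(i, j) = x_i^T W x_j, leftmost tie-breaking.
def unique_hard_attention(X, W, masked=False):
    n = X.shape[0]
    scores = X @ W @ X.T                      # (n, n) bilinear attention scores
    if masked:
        # Masked variant: position i may only attend to positions j <= i.
        allowed = np.tril(np.ones((n, n), dtype=bool))
        scores = np.where(allowed, scores, -np.inf)
    chosen = scores.argmax(axis=1)            # the unique attended position per query
    return X[chosen]                          # output: representation of the attended token

# Toy usage: 4 tokens with 3-dimensional embeddings.
X = np.random.default_rng(0).normal(size=(4, 3))
W = np.eye(3)
print(unique_hard_attention(X, W, masked=True))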


BibTeX entry:
@misc{Ryvkin-Comparison-of-Different-2025,
 abstract = {This note is a survey of various results on the capabilities of unique hard
attention transformer encoders (UHATs) to recognize formal languages. We
distinguish between masked vs. non-masked attention, finite vs. infinite image,
and general vs. bilinear attention score functions. We recall some relations
between these models, as well as a lower bound in terms of first-order logic
and an upper bound in terms of circuit complexity.},
 archiveprefix = {arXiv},
 author = {Ryvkin, Leonid},
 eprint = {2506.03370},
 primaryclass = {cs.LG},
 title = {Comparison of Different {{Unique}} Hard Attention Transformer Models by the Formal Languages They Can Recognize},
 year = {2025}
}