/*
* Support for Intel AES-NI instructions. This file contains glue
* code, the real AES implementation is in intel-aes_asm.S.
*
* Copyright (C) 2008, Intel Corp.
* Author: Huang Ying <ying.huang@intel.com>
*
* Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
* interface for 64-bit kernels.
* Authors: Adrian Hoban <adrian.hoban@intel.com>
* Gabriele Paoloni <gabriele.paoloni@intel.com>
* Tadeusz Struk (tadeusz.struk@intel.com)
* Aidan O'Mahony (aidan.o.mahony@intel.com)
* Copyright (c) 2010, Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <linux/hardirq.h>
#include <linux/types.h>
#include <linux/module.h>
#include <linux/err.h>
#include <crypto/algapi.h>
#include <crypto/aes.h>
#include <crypto/cryptd.h>
#include <crypto/ctr.h>
#include <crypto/b128ops.h>
#include <crypto/gcm.h>
#include <crypto/xts.h>
#include <asm/cpu_device_id.h>
#include <asm/fpu/api.h>
#include <asm/crypto/aes.h>
#include <crypto/scatterwalk.h>
#include <crypto/internal/aead.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <linux/workqueue.h>
#include <linux/spinlock.h>
#ifdef CONFIG_X86_64
#include <asm/crypto/glue_helper.h>
#endif
#define AESNI_ALIGN 16
#define AESNI_ALIGN_ATTR __attribute__ ((__aligned__(AESNI_ALIGN)))
#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE - 1))
#define RFC4106_HASH_SUBKEY_SIZE 16
#define AESNI_ALIGN_EXTRA ((AESNI_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1))
#define CRYPTO_AES_CTX_SIZE (sizeof(struct crypto_aes_ctx) + AESNI_ALIGN_EXTRA)
#define XTS_AES_CTX_SIZE (sizeof(struct aesni_xts_ctx) + AESNI_ALIGN_EXTRA)
/* This data is stored at the end of the crypto_tfm struct.
* It's a type of per "session" data storage location.
* This needs to be 16 byte aligned.
*/
struct aesni_rfc4106_gcm_ctx {
u8 hash_subkey[16] AESNI_ALIGN_ATTR;
struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR;
u8 nonce[4];
};
struct generic_gcmaes_ctx {
u8 hash_subkey[16] AESNI_ALIGN_ATTR;
struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR;
};
struct aesni_xts_ctx {
u8 raw_tweak_ctx[sizeof(struct crypto_aes_ctx)] AESNI_ALIGN_ATTR;
u8 raw_crypt_ctx[sizeof(struct crypto_aes_ctx)] AESNI_ALIGN_ATTR;
};
#define GCM_BLOCK_LEN 16
struct gcm_context_data {
/* init, update and finalize context data */
u8 aad_hash[GCM_BLOCK_LEN];
u64 aad_length;
u64 in_length;
u8 partial_block_enc_key[GCM_BLOCK_LEN];
u8 orig_IV[GCM_BLOCK_LEN];
u8 current_counter[GCM_BLOCK_LEN];
u64 partial_block_len;
u64 unused;
u8 hash_keys[GCM_BLOCK_LEN * 16];
};
asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
unsigned int key_len);
asmlinkage void aesni_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in);
asmlinkage void aesni_dec(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in);
asmlinkage void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len);
asmlinkage void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len);
asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
#define AVX_GEN2_OPTSIZE 640
#define AVX_GEN4_OPTSIZE 4096
#ifdef CONFIG_X86_64
static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
asmlinkage void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, bool enc, u8 *iv);
/* asmlinkage void aesni_gcm_enc()
* void *ctx, AES Key schedule. Starts on a 16 byte boundary.
* struct gcm_context_data. May be uninitialized.
* u8 *out, Ciphertext output. Encrypt in-place is allowed.
* const u8 *in, Plaintext input
* unsigned long plaintext_len, Length of data in bytes for encryption.
* u8 *iv, Pre-counter block j0: 12 byte IV concatenated with 0x00000001.
* 16-byte aligned pointer.
* u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
* const u8 *aad, Additional Authentication Data (AAD)
* unsigned long aad_len, Length of AAD in bytes.
* u8 *auth_tag, Authenticated Tag output.
* unsigned long auth_tag_len), Authenticated Tag Length in bytes.
* Valid values are 16 (most likely), 12 or 8.
*/
asmlinkage void aesni_gcm_enc(void *ctx,
struct gcm_context_data *gdata, u8 *out,
const u8 *in, unsigned long plaintext_len, u8 *iv,
u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len);
/* asmlinkage void aesni_gcm_dec()
* void *ctx, AES Key schedule. Starts on a 16 byte boundary.
* struct gcm_context_data. May be uninitialized.
* u8 *out, Plaintext output. Decrypt in-place is allowed.
* const u8 *in, Ciphertext input
* unsigned long ciphertext_len, Length of data in bytes for decryption.
* u8 *iv, Pre-counter block j0: 12 byte IV concatenated with 0x00000001.
* 16-byte aligned pointer.
* u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
* const u8 *aad, Additional Authentication Data (AAD)
* unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going
* to be 8 or 12 bytes
* u8 *auth_tag, Authenticated Tag output.
* unsigned long auth_tag_len) Authenticated Tag Length in bytes.
* Valid values are 16 (most likely), 12 or 8.
*/
asmlinkage void aesni_gcm_dec(void *ctx,
struct gcm_context_data *gdata, u8 *out,
const u8 *in, unsigned long ciphertext_len, u8 *iv,
u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len);
/* Scatter / Gather routines, with args similar to above */
asmlinkage void aesni_gcm_init(void *ctx,
struct gcm_context_data *gdata,
u8 *iv,
u8 *hash_subkey, const u8 *aad,
unsigned long aad_len);
asmlinkage void aesni_gcm_enc_update(void *ctx,
struct gcm_context_data *gdata, u8 *out,
const u8 *in, unsigned long plaintext_len);
asmlinkage void aesni_gcm_dec_update(void *ctx,
struct gcm_context_data *gdata, u8 *out,
const u8 *in,
unsigned long ciphertext_len);
asmlinkage void aesni_gcm_finalize(void *ctx,
struct gcm_context_data *gdata,
u8 *auth_tag, unsigned long auth_tag_len);
static struct aesni_gcm_tfm_s {
void (*init)(void *ctx,
struct gcm_context_data *gdata,
u8 *iv,
u8 *hash_subkey, const u8 *aad,
unsigned long aad_len);
void (*enc_update)(void *ctx,
struct gcm_context_data *gdata, u8 *out,
const u8 *in,
unsigned long plaintext_len);
void (*dec_update)(void *ctx,
struct gcm_context_data *gdata, u8 *out,
const u8 *in,
unsigned long ciphertext_len);
void (*finalize)(void *ctx,
struct gcm_context_data *gdata,
u8 *auth_tag, unsigned long auth_tag_len);
} *aesni_gcm_tfm;
struct aesni_gcm_tfm_s aesni_gcm_tfm_sse = {
.init = &aesni_gcm_init,
.enc_update = &aesni_gcm_enc_update,
.dec_update = &aesni_gcm_dec_update,
.finalize = &aesni_gcm_finalize,
};
#ifdef CONFIG_AS_AVX
asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv,
void *keys, u8 *out, unsigned int num_bytes);
asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv,
void *keys, u8 *out, unsigned int num_bytes);
asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv,
void *keys, u8 *out, unsigned int num_bytes);
/*
* asmlinkage void aesni_gcm_init_avx_gen2()
* gcm_data *my_ctx_data, context data
* u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
*/
asmlinkage void aesni_gcm_init_avx_gen2(void *my_ctx_data,
struct gcm_context_data *gdata,
u8 *iv,
u8 *hash_subkey,
const u8 *aad,
unsigned long aad_len);
asmlinkage void aesni_gcm_enc_update_avx_gen2(void *ctx,
struct gcm_context_data *gdata, u8 *out,
const u8 *in, unsigned long plaintext_len);
asmlinkage void aesni_gcm_dec_update_avx_gen2(void *ctx,
struct gcm_context_data *gdata, u8 *out,
const u8 *in,
unsigned long ciphertext_len);
asmlinkage void aesni_gcm_finalize_avx_gen2(void *ctx,
struct gcm_context_data *gdata,
u8 *auth_tag, unsigned long auth_tag_len);
asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx,
struct gcm_context_data *gdata, u8 *out,
const u8 *in, unsigned long plaintext_len, u8 *iv,
const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len);
asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx,
struct gcm_context_data *gdata, u8 *out,
const u8 *in, unsigned long ciphertext_len, u8 *iv,
const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len);
struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen2 = {
.init = &aesni_gcm_init_avx_gen2,
.enc_update = &aesni_gcm_enc_update_avx_gen2,
.dec_update = &aesni_gcm_dec_update_avx_gen2,
.finalize = &aesni_gcm_finalize_avx_gen2,
};
#endif
#ifdef CONFIG_AS_AVX2
/*
* asmlinkage void aesni_gcm_init_avx_gen4()
* gcm_data *my_ctx_data, context data
* u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
*/
asmlinkage void aesni_gcm_init_avx_gen4(void *my_ctx_data,
struct gcm_context_data *gdata,
u8 *iv,
u8 *hash_subkey,
const u8 *aad,
unsigned long aad_len);
asmlinkage void aesni_gcm_enc_update_avx_gen4(void *ctx,
struct gcm_context_data *gdata, u8 *out,
const u8 *in, unsigned long plaintext_len);
asmlinkage void aesni_gcm_dec_update_avx_gen4(void *ctx,
struct gcm_context_data *gdata, u8 *out,
const u8 *in,
unsigned long ciphertext_len);
asmlinkage void aesni_gcm_finalize_avx_gen4(void *ctx,
struct gcm_context_data *gdata,
u8 *auth_tag, unsigned long auth_tag_len);
asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx,
struct gcm_context_data *gdata, u8 *out,
const u8 *in, unsigned long plaintext_len, u8 *iv,
const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len);
asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx,
struct gcm_context_data *gdata, u8 *out,
const u8 *in, unsigned long ciphertext_len, u8 *iv,
const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len);
struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen4 = {
.init = &aesni_gcm_init_avx_gen4,
.enc_update = &aesni_gcm_enc_update_avx_gen4,
.dec_update = &aesni_gcm_dec_update_avx_gen4,
.finalize = &aesni_gcm_finalize_avx_gen4,
};
#endif
static inline struct
aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
{
unsigned long align = AESNI_ALIGN;
if (align <= crypto_tfm_ctx_alignment())
align = 1;
return PTR_ALIGN(crypto_aead_ctx(tfm), align);
}
static inline struct
generic_gcmaes_ctx *generic_gcmaes_ctx_get(struct crypto_aead *tfm)
{
unsigned long align = AESNI_ALIGN;
if (align <= crypto_tfm_ctx_alignment())
align = 1;
return PTR_ALIGN(crypto_aead_ctx(tfm), align);
}
#endif
static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
{
unsigned long addr = (unsigned long)raw_ctx; /*covered*/
unsigned long align = AESNI_ALIGN;
if (align <= crypto_tfm_ctx_alignment())
align = 1;
return (struct crypto_aes_ctx *)ALIGN(addr, align);
}
static int aes_set_key_common(struct crypto_tfm *tfm, void *raw_ctx,
const u8 *in_key, unsigned int key_len)
{
struct crypto_aes_ctx *ctx = aes_ctx(raw_ctx); /*covered*/
u32 *flags = &tfm->crt_flags;
int err;
if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 &&
key_len != AES_KEYSIZE_256) {
*flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
return -EINVAL;
}
if (!irq_fpu_usable()) /*covered*/
err = crypto_aes_expand_key(ctx, in_key, key_len);
else {
kernel_fpu_begin(); /*covered*/
err = aesni_set_key(ctx, in_key, key_len);
kernel_fpu_end(); /*covered*/
}
return err;
}
static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
unsigned int key_len)
{
return aes_set_key_common(tfm, crypto_tfm_ctx(tfm), in_key, key_len); /*covered*/
}
static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
if (!irq_fpu_usable())
crypto_aes_encrypt_x86(ctx, dst, src);
else {
kernel_fpu_begin();
aesni_enc(ctx, dst, src);
kernel_fpu_end();
}
}
static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
if (!irq_fpu_usable())
crypto_aes_decrypt_x86(ctx, dst, src);
else {
kernel_fpu_begin();
aesni_dec(ctx, dst, src);
kernel_fpu_end();
}
}
static void __aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
aesni_enc(ctx, dst, src);
}
static void __aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
aesni_dec(ctx, dst, src);
}
static int aesni_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int len)
{
return aes_set_key_common(crypto_skcipher_tfm(tfm),
crypto_skcipher_ctx(tfm), key, len);
}
static int ecb_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
struct skcipher_walk walk;
unsigned int nbytes;
int err;
err = skcipher_walk_virt(&walk, req, true);
kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
nbytes & AES_BLOCK_MASK);
nbytes &= AES_BLOCK_SIZE - 1;
err = skcipher_walk_done(&walk, nbytes);
}
kernel_fpu_end();
return err;
}
static int ecb_decrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
struct skcipher_walk walk;
unsigned int nbytes;
int err;
err = skcipher_walk_virt(&walk, req, true);
kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
nbytes & AES_BLOCK_MASK);
nbytes &= AES_BLOCK_SIZE - 1;
err = skcipher_walk_done(&walk, nbytes);
}
kernel_fpu_end();
return err;
}
static int cbc_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
struct skcipher_walk walk;
unsigned int nbytes;
int err;
err = skcipher_walk_virt(&walk, req, true);
kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
nbytes & AES_BLOCK_MASK, walk.iv);
nbytes &= AES_BLOCK_SIZE - 1;
err = skcipher_walk_done(&walk, nbytes);
}
kernel_fpu_end();
return err;
}
static int cbc_decrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
struct skcipher_walk walk;
unsigned int nbytes;
int err;
err = skcipher_walk_virt(&walk, req, true);
kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
nbytes & AES_BLOCK_MASK, walk.iv);
nbytes &= AES_BLOCK_SIZE - 1;
err = skcipher_walk_done(&walk, nbytes);
}
kernel_fpu_end();
return err;
}
#ifdef CONFIG_X86_64
static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
struct skcipher_walk *walk)
{
u8 *ctrblk = walk->iv;
u8 keystream[AES_BLOCK_SIZE];
u8 *src = walk->src.virt.addr;
u8 *dst = walk->dst.virt.addr;
unsigned int nbytes = walk->nbytes;
aesni_enc(ctx, keystream, ctrblk);
crypto_xor_cpy(dst, keystream, src, nbytes);
crypto_inc(ctrblk, AES_BLOCK_SIZE);
}
#ifdef CONFIG_AS_AVX
static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv)
{
/*
* based on key length, override with the by8 version
* of ctr mode encryption/decryption for improved performance
* aes_set_key_common() ensures that key length is one of
* {128,192,256}
*/
if (ctx->key_length == AES_KEYSIZE_128)
aes_ctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len);
else if (ctx->key_length == AES_KEYSIZE_192)
aes_ctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len);
else
aes_ctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len);
}
#endif
static int ctr_crypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
struct skcipher_walk walk;
unsigned int nbytes;
int err;
err = skcipher_walk_virt(&walk, req, true);
kernel_fpu_begin();
while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
nbytes & AES_BLOCK_MASK, walk.iv);
nbytes &= AES_BLOCK_SIZE - 1;
err = skcipher_walk_done(&walk, nbytes);
}
if (walk.nbytes) {
ctr_crypt_final(ctx, &walk);
err = skcipher_walk_done(&walk, 0);
}
kernel_fpu_end();
return err;
}
static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keylen)
{
struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
int err;
err = xts_verify_key(tfm, key, keylen);
if (err)
return err;
keylen /= 2;
/* first half of xts-key is for crypt */
err = aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_crypt_ctx,
key, keylen);
if (err)
return err;
/* second half of xts-key is for tweak */
return aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_tweak_ctx,
key + keylen, keylen);
}
static void aesni_xts_tweak(void *ctx, u8 *out, const u8 *in)
{
aesni_enc(ctx, out, in);
}
static void aesni_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{
glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_enc));
}
static void aesni_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{
glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_dec));
}
static void aesni_xts_enc8(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{
aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, true, (u8 *)iv);
}
static void aesni_xts_dec8(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{
aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, false, (u8 *)iv);
}
static const struct common_glue_ctx aesni_enc_xts = {
.num_funcs = 2,
.fpu_blocks_limit = 1,
.funcs = { {
.num_blocks = 8,
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc8) }
}, {
.num_blocks = 1,
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc) }
} }
};
static const struct common_glue_ctx aesni_dec_xts = {
.num_funcs = 2,
.fpu_blocks_limit = 1,
.funcs = { {
.num_blocks = 8,
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec8) }
}, {
.num_blocks = 1,
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec) }
} }
};
static int xts_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
return glue_xts_req_128bit(&aesni_enc_xts, req,
XTS_TWEAK_CAST(aesni_xts_tweak),
aes_ctx(ctx->raw_tweak_ctx),
aes_ctx(ctx->raw_crypt_ctx));
}
static int xts_decrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
return glue_xts_req_128bit(&aesni_dec_xts, req,
XTS_TWEAK_CAST(aesni_xts_tweak),
aes_ctx(ctx->raw_tweak_ctx),
aes_ctx(ctx->raw_crypt_ctx));
}
static int rfc4106_init(struct crypto_aead *aead)
{
struct cryptd_aead *cryptd_tfm;
struct cryptd_aead **ctx = crypto_aead_ctx(aead);
cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni",
CRYPTO_ALG_INTERNAL,
CRYPTO_ALG_INTERNAL);
if (IS_ERR(cryptd_tfm))
return PTR_ERR(cryptd_tfm);
*ctx = cryptd_tfm;
crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base));
return 0;
}
static void rfc4106_exit(struct crypto_aead *aead)
{
struct cryptd_aead **ctx = crypto_aead_ctx(aead);
cryptd_free_aead(*ctx);
}
static int
rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
{
struct crypto_cipher *tfm;
int ret;
tfm = crypto_alloc_cipher("aes", 0, 0);
if (IS_ERR(tfm))
return PTR_ERR(tfm);
ret = crypto_cipher_setkey(tfm, key, key_len);
if (ret)
goto out_free_cipher;
/* Clear the data in the hash sub key container to zero.*/
/* We want to cipher all zeros to create the hash sub key. */
memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE);
crypto_cipher_encrypt_one(tfm, hash_subkey, hash_subkey);
out_free_cipher:
crypto_free_cipher(tfm);
return ret;
}
static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key,
unsigned int key_len)
{
struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(aead);
if (key_len < 4) {
crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
return -EINVAL;
}
/*Account for 4 byte nonce at the end.*/
key_len -= 4;
memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce));
return aes_set_key_common(crypto_aead_tfm(aead),
&ctx->aes_key_expanded, key, key_len) ?:
rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
}
static int gcmaes_wrapper_set_key(struct crypto_aead *parent, const u8 *key,
unsigned int key_len)
{
struct cryptd_aead **ctx = crypto_aead_ctx(parent);
struct cryptd_aead *cryptd_tfm = *ctx;
return crypto_aead_setkey(&cryptd_tfm->base, key, key_len);
}
static int common_rfc4106_set_authsize(struct crypto_aead *aead,
unsigned int authsize)
{
switch (authsize) {
case 8:
case 12:
case 16:
break;
default:
return -EINVAL;
}
return 0;
}
/* This is the Integrity Check Value (aka the authentication tag length and can
* be 8, 12 or 16 bytes long. */
static int gcmaes_wrapper_set_authsize(struct crypto_aead *parent,
unsigned int authsize)
{
struct cryptd_aead **ctx = crypto_aead_ctx(parent);
struct cryptd_aead *cryptd_tfm = *ctx;
return crypto_aead_setauthsize(&cryptd_tfm->base, authsize);
}
static int generic_gcmaes_set_authsize(struct crypto_aead *tfm,
unsigned int authsize)
{
switch (authsize) {
case 4:
case 8:
case 12:
case 13:
case 14:
case 15:
case 16:
break;
default:
return -EINVAL;
}
return 0;
}
static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
unsigned int assoclen, u8 *hash_subkey,
u8 *iv, void *aes_ctx)
{
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
unsigned long auth_tag_len = crypto_aead_authsize(tfm);
struct aesni_gcm_tfm_s *gcm_tfm = aesni_gcm_tfm;
struct gcm_context_data data AESNI_ALIGN_ATTR;
struct scatter_walk dst_sg_walk = {};
unsigned long left = req->cryptlen;
unsigned long len, srclen, dstlen;
struct scatter_walk assoc_sg_walk;
struct scatter_walk src_sg_walk;
struct scatterlist src_start[2];
struct scatterlist dst_start[2];
struct scatterlist *src_sg;
struct scatterlist *dst_sg;
u8 *src, *dst, *assoc;
u8 *assocmem = NULL;
u8 authTag[16];
if (!enc)
left -= auth_tag_len;
#ifdef CONFIG_AS_AVX2
if (left < AVX_GEN4_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen4)
gcm_tfm = &aesni_gcm_tfm_avx_gen2;
#endif
#ifdef CONFIG_AS_AVX
if (left < AVX_GEN2_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen2)
gcm_tfm = &aesni_gcm_tfm_sse;
#endif
/* Linearize assoc, if not already linear */
if (req->src->length >= assoclen && req->src->length &&
(!PageHighMem(sg_page(req->src)) ||
req->src->offset + req->src->length <= PAGE_SIZE)) {
scatterwalk_start(&assoc_sg_walk, req->src);
assoc = scatterwalk_map(&assoc_sg_walk);
} else {
/* assoc can be any length, so must be on heap */
assocmem = kmalloc(assoclen, GFP_ATOMIC);
if (unlikely(!assocmem))
return -ENOMEM;
assoc = assocmem;
scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0);
}
src_sg = scatterwalk_ffwd(src_start, req->src, req->assoclen);
scatterwalk_start(&src_sg_walk, src_sg);
if (req->src != req->dst) {
dst_sg = scatterwalk_ffwd(dst_start, req->dst, req->assoclen);
scatterwalk_start(&dst_sg_walk, dst_sg);
}
kernel_fpu_begin();
gcm_tfm->init(aes_ctx, &data, iv,
hash_subkey, assoc, assoclen);
if (req->src != req->dst) {
while (left) {
src = scatterwalk_map(&src_sg_walk);
dst = scatterwalk_map(&dst_sg_walk);
srclen = scatterwalk_clamp(&src_sg_walk, left);
dstlen = scatterwalk_clamp(&dst_sg_walk, left);
len = min(srclen, dstlen);
if (len) {
if (enc)
gcm_tfm->enc_update(aes_ctx, &data,
dst, src, len);
else
gcm_tfm->dec_update(aes_ctx, &data,
dst, src, len);
}
left -= len;
scatterwalk_unmap(src);
scatterwalk_unmap(dst);
scatterwalk_advance(&src_sg_walk, len);
scatterwalk_advance(&dst_sg_walk, len);
scatterwalk_done(&src_sg_walk, 0, left);
scatterwalk_done(&dst_sg_walk, 1, left);
}
} else {
while (left) {
dst = src = scatterwalk_map(&src_sg_walk);
len = scatterwalk_clamp(&src_sg_walk, left);
if (len) {
if (enc)
gcm_tfm->enc_update(aes_ctx, &data,
src, src, len);
else
gcm_tfm->dec_update(aes_ctx, &data,
src, src, len);
}
left -= len;
scatterwalk_unmap(src);
scatterwalk_advance(&src_sg_walk, len);
scatterwalk_done(&src_sg_walk, 1, left);
}
}
gcm_tfm->finalize(aes_ctx, &data, authTag, auth_tag_len);
kernel_fpu_end();
if (!assocmem)
scatterwalk_unmap(assoc);
else
kfree(assocmem);
if (!enc) {
u8 authTagMsg[16];
/* Copy out original authTag */
scatterwalk_map_and_copy(authTagMsg, req->src,
req->assoclen + req->cryptlen -
auth_tag_len,
auth_tag_len, 0);
/* Compare generated tag with passed in tag. */
return crypto_memneq(authTagMsg, authTag, auth_tag_len) ?
-EBADMSG : 0;
}
/* Copy in the authTag */
scatterwalk_map_and_copy(authTag, req->dst,
req->assoclen + req->cryptlen,
auth_tag_len, 1);
return 0;
}
static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen,
u8 *hash_subkey, u8 *iv, void *aes_ctx)
{
return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv,
aes_ctx);
}
static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen,
u8 *hash_subkey, u8 *iv, void *aes_ctx)
{
return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv,
aes_ctx);
}
static int helper_rfc4106_encrypt(struct aead_request *req)
{
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
void *aes_ctx = &(ctx->aes_key_expanded);
u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN)));
unsigned int i;
__be32 counter = cpu_to_be32(1);
/* Assuming we are supporting rfc4106 64-bit extended */
/* sequence numbers We need to have the AAD length equal */
/* to 16 or 20 bytes */
if (unlikely(req->assoclen != 16 && req->assoclen != 20))
return -EINVAL;
/* IV below built */
for (i = 0; i < 4; i++)
*(iv+i) = ctx->nonce[i];
for (i = 0; i < 8; i++)
*(iv+4+i) = req->iv[i];
*((__be32 *)(iv+12)) = counter;
return gcmaes_encrypt(req, req->assoclen - 8, ctx->hash_subkey, iv,
aes_ctx);
}
static int helper_rfc4106_decrypt(struct aead_request *req)
{
__be32 counter = cpu_to_be32(1);
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
void *aes_ctx = &(ctx->aes_key_expanded);
u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN)));
unsigned int i;
if (unlikely(req->assoclen != 16 && req->assoclen != 20))
return -EINVAL;
/* Assuming we are supporting rfc4106 64-bit extended */
/* sequence numbers We need to have the AAD length */
/* equal to 16 or 20 bytes */
/* IV below built */
for (i = 0; i < 4; i++)
*(iv+i) = ctx->nonce[i];
for (i = 0; i < 8; i++)
*(iv+4+i) = req->iv[i];
*((__be32 *)(iv+12)) = counter;
return gcmaes_decrypt(req, req->assoclen - 8, ctx->hash_subkey, iv,
aes_ctx);
}
static int gcmaes_wrapper_encrypt(struct aead_request *req)
{
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct cryptd_aead **ctx = crypto_aead_ctx(tfm);
struct cryptd_aead *cryptd_tfm = *ctx;
tfm = &cryptd_tfm->base;
if (irq_fpu_usable() && (!in_atomic() ||
!cryptd_aead_queued(cryptd_tfm)))
tfm = cryptd_aead_child(cryptd_tfm);
aead_request_set_tfm(req, tfm);
return crypto_aead_encrypt(req);
}
static int gcmaes_wrapper_decrypt(struct aead_request *req)
{
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct cryptd_aead **ctx = crypto_aead_ctx(tfm);
struct cryptd_aead *cryptd_tfm = *ctx;
tfm = &cryptd_tfm->base;
if (irq_fpu_usable() && (!in_atomic() ||
!cryptd_aead_queued(cryptd_tfm)))
tfm = cryptd_aead_child(cryptd_tfm);
aead_request_set_tfm(req, tfm);
return crypto_aead_decrypt(req);
}
#endif
static struct crypto_alg aesni_algs[] = { {
.cra_name = "aes",
.cra_driver_name = "aes-aesni",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_CIPHER,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = CRYPTO_AES_CTX_SIZE,
.cra_module = THIS_MODULE,
.cra_u = {
.cipher = {
.cia_min_keysize = AES_MIN_KEY_SIZE,
.cia_max_keysize = AES_MAX_KEY_SIZE,
.cia_setkey = aes_set_key,
.cia_encrypt = aes_encrypt,
.cia_decrypt = aes_decrypt
}
}
}, {
.cra_name = "__aes",
.cra_driver_name = "__aes-aesni",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_CIPHER | CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = CRYPTO_AES_CTX_SIZE,
.cra_module = THIS_MODULE,
.cra_u = {
.cipher = {
.cia_min_keysize = AES_MIN_KEY_SIZE,
.cia_max_keysize = AES_MAX_KEY_SIZE,
.cia_setkey = aes_set_key,
.cia_encrypt = __aes_encrypt,
.cia_decrypt = __aes_decrypt
}
}
} };
static struct skcipher_alg aesni_skciphers[] = {
{
.base = {
.cra_name = "__ecb(aes)",
.cra_driver_name = "__ecb-aes-aesni",
.cra_priority = 400,
.cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = CRYPTO_AES_CTX_SIZE,
.cra_module = THIS_MODULE,
},
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.setkey = aesni_skcipher_setkey,
.encrypt = ecb_encrypt,
.decrypt = ecb_decrypt,
}, {
.base = {
.cra_name = "__cbc(aes)",
.cra_driver_name = "__cbc-aes-aesni",
.cra_priority = 400,
.cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = CRYPTO_AES_CTX_SIZE,
.cra_module = THIS_MODULE,
},
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.setkey = aesni_skcipher_setkey,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
#ifdef CONFIG_X86_64
}, {
.base = {
.cra_name = "__ctr(aes)",
.cra_driver_name = "__ctr-aes-aesni",
.cra_priority = 400,
.cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = CRYPTO_AES_CTX_SIZE,
.cra_module = THIS_MODULE,
},
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.chunksize = AES_BLOCK_SIZE,
.setkey = aesni_skcipher_setkey,
.encrypt = ctr_crypt,
.decrypt = ctr_crypt,
}, {
.base = {
.cra_name = "__xts(aes)",
.cra_driver_name = "__xts-aes-aesni",
.cra_priority = 401,
.cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = XTS_AES_CTX_SIZE,
.cra_module = THIS_MODULE,
},
.min_keysize = 2 * AES_MIN_KEY_SIZE,
.max_keysize = 2 * AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.setkey = xts_aesni_setkey,
.encrypt = xts_encrypt,
.decrypt = xts_decrypt,
#endif
}
};
static
struct simd_skcipher_alg *aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)];
#ifdef CONFIG_X86_64
static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key,
unsigned int key_len)
{
struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(aead);
return aes_set_key_common(crypto_aead_tfm(aead),
&ctx->aes_key_expanded, key, key_len) ?:
rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
}
static int generic_gcmaes_encrypt(struct aead_request *req)
{
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm);
void *aes_ctx = &(ctx->aes_key_expanded);
u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN)));
__be32 counter = cpu_to_be32(1);
memcpy(iv, req->iv, 12);
*((__be32 *)(iv+12)) = counter;
return gcmaes_encrypt(req, req->assoclen, ctx->hash_subkey, iv,
aes_ctx);
}
static int generic_gcmaes_decrypt(struct aead_request *req)
{
__be32 counter = cpu_to_be32(1);
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm);
void *aes_ctx = &(ctx->aes_key_expanded);
u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN)));
memcpy(iv, req->iv, 12);
*((__be32 *)(iv+12)) = counter;
return gcmaes_decrypt(req, req->assoclen, ctx->hash_subkey, iv,
aes_ctx);
}
static int generic_gcmaes_init(struct crypto_aead *aead)
{
struct cryptd_aead *cryptd_tfm;
struct cryptd_aead **ctx = crypto_aead_ctx(aead);
cryptd_tfm = cryptd_alloc_aead("__driver-generic-gcm-aes-aesni",
CRYPTO_ALG_INTERNAL,
CRYPTO_ALG_INTERNAL);
if (IS_ERR(cryptd_tfm))
return PTR_ERR(cryptd_tfm);
*ctx = cryptd_tfm;
crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base));
return 0;
}
static void generic_gcmaes_exit(struct crypto_aead *aead)
{
struct cryptd_aead **ctx = crypto_aead_ctx(aead);
cryptd_free_aead(*ctx);
}
static struct aead_alg aesni_aead_algs[] = { {
.setkey = common_rfc4106_set_key,
.setauthsize = common_rfc4106_set_authsize,
.encrypt = helper_rfc4106_encrypt,
.decrypt = helper_rfc4106_decrypt,
.ivsize = GCM_RFC4106_IV_SIZE,
.maxauthsize = 16,
.base = {
.cra_name = "__gcm-aes-aesni",
.cra_driver_name = "__driver-gcm-aes-aesni",
.cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx),
.cra_alignmask = AESNI_ALIGN - 1,
.cra_module = THIS_MODULE,
},
}, {
.init = rfc4106_init,
.exit = rfc4106_exit,
.setkey = gcmaes_wrapper_set_key,
.setauthsize = gcmaes_wrapper_set_authsize,
.encrypt = gcmaes_wrapper_encrypt,
.decrypt = gcmaes_wrapper_decrypt,
.ivsize = GCM_RFC4106_IV_SIZE,
.maxauthsize = 16,
.base = {
.cra_name = "rfc4106(gcm(aes))",
.cra_driver_name = "rfc4106-gcm-aesni",
.cra_priority = 400,
.cra_flags = CRYPTO_ALG_ASYNC,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct cryptd_aead *),
.cra_module = THIS_MODULE,
},
}, {
.setkey = generic_gcmaes_set_key,
.setauthsize = generic_gcmaes_set_authsize,
.encrypt = generic_gcmaes_encrypt,
.decrypt = generic_gcmaes_decrypt,
.ivsize = GCM_AES_IV_SIZE,
.maxauthsize = 16,
.base = {
.cra_name = "__generic-gcm-aes-aesni",
.cra_driver_name = "__driver-generic-gcm-aes-aesni",
.cra_priority = 0,
.cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct generic_gcmaes_ctx),
.cra_alignmask = AESNI_ALIGN - 1,
.cra_module = THIS_MODULE,
},
}, {
.init = generic_gcmaes_init,
.exit = generic_gcmaes_exit,
.setkey = gcmaes_wrapper_set_key,
.setauthsize = gcmaes_wrapper_set_authsize,
.encrypt = gcmaes_wrapper_encrypt,
.decrypt = gcmaes_wrapper_decrypt,
.ivsize = GCM_AES_IV_SIZE,
.maxauthsize = 16,
.base = {
.cra_name = "gcm(aes)",
.cra_driver_name = "generic-gcm-aesni",
.cra_priority = 400,
.cra_flags = CRYPTO_ALG_ASYNC,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct cryptd_aead *),
.cra_module = THIS_MODULE,
},
} };
#else
static struct aead_alg aesni_aead_algs[0];
#endif
static const struct x86_cpu_id aesni_cpu_id[] = {
X86_FEATURE_MATCH(X86_FEATURE_AES),
{}
};
MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);
static void aesni_free_simds(void)
{
int i;
for (i = 0; i < ARRAY_SIZE(aesni_simd_skciphers) &&
aesni_simd_skciphers[i]; i++)
simd_skcipher_free(aesni_simd_skciphers[i]);
}
static int __init aesni_init(void)
{
struct simd_skcipher_alg *simd;
const char *basename;
const char *algname;
const char *drvname;
int err;
int i;
if (!x86_match_cpu(aesni_cpu_id))
return -ENODEV;
#ifdef CONFIG_X86_64
#ifdef CONFIG_AS_AVX2
if (boot_cpu_has(X86_FEATURE_AVX2)) {
pr_info("AVX2 version of gcm_enc/dec engaged.\n");
aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen4;
} else
#endif
#ifdef CONFIG_AS_AVX
if (boot_cpu_has(X86_FEATURE_AVX)) {
pr_info("AVX version of gcm_enc/dec engaged.\n");
aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen2;
} else
#endif
{
pr_info("SSE version of gcm_enc/dec engaged.\n");
aesni_gcm_tfm = &aesni_gcm_tfm_sse;
}
aesni_ctr_enc_tfm = aesni_ctr_enc;
#ifdef CONFIG_AS_AVX
if (boot_cpu_has(X86_FEATURE_AVX)) {
/* optimize performance of ctr mode encryption transform */
aesni_ctr_enc_tfm = aesni_ctr_enc_avx_tfm;
pr_info("AES CTR mode by8 optimization enabled\n");
}
#endif
#endif
err = crypto_register_algs(aesni_algs, ARRAY_SIZE(aesni_algs));
if (err)
return err;
err = crypto_register_skciphers(aesni_skciphers,
ARRAY_SIZE(aesni_skciphers));
if (err)
goto unregister_algs;
err = crypto_register_aeads(aesni_aead_algs,
ARRAY_SIZE(aesni_aead_algs));
if (err)
goto unregister_skciphers;
for (i = 0; i < ARRAY_SIZE(aesni_skciphers); i++) {
algname = aesni_skciphers[i].base.cra_name + 2;
drvname = aesni_skciphers[i].base.cra_driver_name + 2;
basename = aesni_skciphers[i].base.cra_driver_name;
simd = simd_skcipher_create_compat(algname, drvname, basename);
err = PTR_ERR(simd);
if (IS_ERR(simd))
goto unregister_simds;
aesni_simd_skciphers[i] = simd;
}
return 0;
unregister_simds:
aesni_free_simds();
crypto_unregister_aeads(aesni_aead_algs, ARRAY_SIZE(aesni_aead_algs));
unregister_skciphers:
crypto_unregister_skciphers(aesni_skciphers,
ARRAY_SIZE(aesni_skciphers));
unregister_algs:
crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs));
return err;
}
static void __exit aesni_exit(void)
{
aesni_free_simds();
crypto_unregister_aeads(aesni_aead_algs, ARRAY_SIZE(aesni_aead_algs));
crypto_unregister_skciphers(aesni_skciphers,
ARRAY_SIZE(aesni_skciphers));
crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs));
}
late_initcall(aesni_init);
module_exit(aesni_exit);
MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, Intel AES-NI instructions optimized");
MODULE_LICENSE("GPL");
MODULE_ALIAS_CRYPTO("aes");
/*
* Using hardware provided CRC32 instruction to accelerate the CRC32 disposal.
* CRC32C polynomial:0x1EDC6F41(BE)/0x82F63B78(LE)
* CRC32 is a new instruction in Intel SSE4.2, the reference can be found at:
* http://www.intel.com/products/processor/manuals/
* Intel(R) 64 and IA-32 Architectures Software Developer's Manual
* Volume 2A: Instruction Set Reference, A-M
*
* Copyright (C) 2008 Intel Corporation
* Authors: Austin Zhang <austin_zhang@linux.intel.com>
* Kent Liu <kent.liu@intel.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
*
*/
#include <linux/init.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <crypto/internal/hash.h>
#include <asm/cpufeatures.h>
#include <asm/cpu_device_id.h>
#include <asm/fpu/internal.h>
#define CHKSUM_BLOCK_SIZE 1
#define CHKSUM_DIGEST_SIZE 4
#define SCALE_F sizeof(unsigned long)
#ifdef CONFIG_X86_64
#define REX_PRE "0x48, "
#else
#define REX_PRE
#endif
#ifdef CONFIG_X86_64
/*
* use carryless multiply version of crc32c when buffer
* size is >= 512 to account
* for fpu state save/restore overhead.
*/
#define CRC32C_PCL_BREAKEVEN 512
asmlinkage unsigned int crc_pcl(const u8 *buffer, int len,
unsigned int crc_init);
#endif /* CONFIG_X86_64 */
static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
{
while (length--) {
__asm__ __volatile__(
".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1"
:"=S"(crc)
:"0"(crc), "c"(*data) /*covered*/
);
data++;
}
return crc;
}
static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len)
{
unsigned int iquotient = len / SCALE_F;
unsigned int iremainder = len % SCALE_F;
unsigned long *ptmp = (unsigned long *)p;
while (iquotient--) {
__asm__ __volatile__(
".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;"
:"=S"(crc)
:"0"(crc), "c"(*ptmp) /*covered*/
);
ptmp++;
}
if (iremainder) /*covered*/
crc = crc32c_intel_le_hw_byte(crc, (unsigned char *)ptmp, /*covered*/
iremainder);
return crc;
}
/*
* Setting the seed allows arbitrary accumulators and flexible XOR policy
* If your algorithm starts with ~0, then XOR with ~0 before you set
* the seed.
*/
static int crc32c_intel_setkey(struct crypto_shash *hash, const u8 *key,
unsigned int keylen)
{
u32 *mctx = crypto_shash_ctx(hash);
if (keylen != sizeof(u32)) {
crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
return -EINVAL;
}
*mctx = le32_to_cpup((__le32 *)key);
return 0;
}
static int crc32c_intel_init(struct shash_desc *desc)
{
u32 *mctx = crypto_shash_ctx(desc->tfm);
u32 *crcp = shash_desc_ctx(desc);
*crcp = *mctx;
return 0;
}
static int crc32c_intel_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
u32 *crcp = shash_desc_ctx(desc);
*crcp = crc32c_intel_le_hw(*crcp, data, len);
return 0;
}
static int __crc32c_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
u8 *out)
{
*(__le32 *)out = ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
return 0;
}
static int crc32c_intel_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
return __crc32c_intel_finup(shash_desc_ctx(desc), data, len, out);
}
static int crc32c_intel_final(struct shash_desc *desc, u8 *out)
{
u32 *crcp = shash_desc_ctx(desc);
*(__le32 *)out = ~cpu_to_le32p(crcp);
return 0;
}
static int crc32c_intel_digest(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
return __crc32c_intel_finup(crypto_shash_ctx(desc->tfm), data, len,
out);
}
static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
{
u32 *key = crypto_tfm_ctx(tfm);
*key = ~0;
return 0;
}
#ifdef CONFIG_X86_64
static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, /*covered*/
unsigned int len)
{
u32 *crcp = shash_desc_ctx(desc);
/*
* use faster PCL version if datasize is large enough to
* overcome kernel fpu state save/restore overhead
*/
if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) { /*covered*/
kernel_fpu_begin(); /*covered*/
*crcp = crc_pcl(data, len, *crcp);
kernel_fpu_end();
} else
*crcp = crc32c_intel_le_hw(*crcp, data, len); /*covered*/
return 0; /*covered*/
}
static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
u8 *out)
{
if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) {
kernel_fpu_begin();
*(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp));
kernel_fpu_end();
} else
*(__le32 *)out =
~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
return 0;
}
static int crc32c_pcl_intel_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
return __crc32c_pcl_intel_finup(shash_desc_ctx(desc), data, len, out);
}
static int crc32c_pcl_intel_digest(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
return __crc32c_pcl_intel_finup(crypto_shash_ctx(desc->tfm), data, len,
out);
}
#endif /* CONFIG_X86_64 */
static struct shash_alg alg = {
.setkey = crc32c_intel_setkey,
.init = crc32c_intel_init,
.update = crc32c_intel_update,
.final = crc32c_intel_final,
.finup = crc32c_intel_finup,
.digest = crc32c_intel_digest,
.descsize = sizeof(u32),
.digestsize = CHKSUM_DIGEST_SIZE,
.base = {
.cra_name = "crc32c",
.cra_driver_name = "crc32c-intel",
.cra_priority = 200,
.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
.cra_blocksize = CHKSUM_BLOCK_SIZE,
.cra_ctxsize = sizeof(u32),
.cra_module = THIS_MODULE,
.cra_init = crc32c_intel_cra_init,
}
};
static const struct x86_cpu_id crc32c_cpu_id[] = {
X86_FEATURE_MATCH(X86_FEATURE_XMM4_2),
{}
};
MODULE_DEVICE_TABLE(x86cpu, crc32c_cpu_id);
static int __init crc32c_intel_mod_init(void)
{
if (!x86_match_cpu(crc32c_cpu_id))
return -ENODEV;
#ifdef CONFIG_X86_64
if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
alg.update = crc32c_pcl_intel_update;
alg.finup = crc32c_pcl_intel_finup;
alg.digest = crc32c_pcl_intel_digest;
}
#endif
return crypto_register_shash(&alg);
}
static void __exit crc32c_intel_mod_fini(void)
{
crypto_unregister_shash(&alg);
}
module_init(crc32c_intel_mod_init);
module_exit(crc32c_intel_mod_fini);
MODULE_AUTHOR("Austin Zhang <austin.zhang@intel.com>, Kent Liu <kent.liu@intel.com>");
MODULE_DESCRIPTION("CRC32c (Castagnoli) optimization using Intel Hardware.");
MODULE_LICENSE("GPL");
MODULE_ALIAS_CRYPTO("crc32c");
MODULE_ALIAS_CRYPTO("crc32c-intel");
/*
* Cryptographic API.
*
* Glue code for the SHA1 Secure Hash Algorithm assembler implementation using
* Supplemental SSE3 instructions.
*
* This file is based on sha1_generic.c
*
* Copyright (c) Alan Smithee.
* Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
* Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
* Copyright (c) Mathias Krause <minipli@googlemail.com>
* Copyright (c) Chandramouli Narayanan <mouli@linux.intel.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <crypto/internal/hash.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/cryptohash.h>
#include <linux/types.h>
#include <crypto/sha.h>
#include <crypto/sha1_base.h>
#include <asm/fpu/api.h>
typedef void (sha1_transform_fn)(u32 *digest, const char *data,
unsigned int rounds);
static int sha1_update(struct shash_desc *desc, const u8 *data, /*covered*/
unsigned int len, sha1_transform_fn *sha1_xform)
{
struct sha1_state *sctx = shash_desc_ctx(desc);
if (!irq_fpu_usable() || /*covered*/
(sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE) /*covered*/
return crypto_sha1_update(desc, data, len); /*covered*/
/* make sure casting to sha1_block_fn() is safe */
BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0);
kernel_fpu_begin(); /*covered*/
sha1_base_do_update(desc, data, len, /*covered*/
(sha1_block_fn *)sha1_xform);
kernel_fpu_end(); /*covered*/
return 0;
}
static int sha1_finup(struct shash_desc *desc, const u8 *data, /*covered*/
unsigned int len, u8 *out, sha1_transform_fn *sha1_xform)
{
if (!irq_fpu_usable()) /*covered*/
return crypto_sha1_finup(desc, data, len, out);
kernel_fpu_begin(); /*covered*/
if (len)
sha1_base_do_update(desc, data, len,
(sha1_block_fn *)sha1_xform);
sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_xform); /*covered*/
kernel_fpu_end();
return sha1_base_finish(desc, out); /*covered*/
}
asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data,
unsigned int rounds);
static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
return sha1_update(desc, data, len,
(sha1_transform_fn *) sha1_transform_ssse3);
}
static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
return sha1_finup(desc, data, len, out,
(sha1_transform_fn *) sha1_transform_ssse3);
}
/* Add padding and return the message digest. */
static int sha1_ssse3_final(struct shash_desc *desc, u8 *out)
{
return sha1_ssse3_finup(desc, NULL, 0, out);
}
static struct shash_alg sha1_ssse3_alg = {
.digestsize = SHA1_DIGEST_SIZE,
.init = sha1_base_init,
.update = sha1_ssse3_update,
.final = sha1_ssse3_final,
.finup = sha1_ssse3_finup,
.descsize = sizeof(struct sha1_state),
.base = {
.cra_name = "sha1",
.cra_driver_name = "sha1-ssse3",
.cra_priority = 150,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
};
static int register_sha1_ssse3(void)
{
if (boot_cpu_has(X86_FEATURE_SSSE3))
return crypto_register_shash(&sha1_ssse3_alg);
return 0;
}
static void unregister_sha1_ssse3(void)
{
if (boot_cpu_has(X86_FEATURE_SSSE3))
crypto_unregister_shash(&sha1_ssse3_alg);
}
#ifdef CONFIG_AS_AVX
asmlinkage void sha1_transform_avx(u32 *digest, const char *data,
unsigned int rounds);
static int sha1_avx_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
return sha1_update(desc, data, len,
(sha1_transform_fn *) sha1_transform_avx);
}
static int sha1_avx_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
return sha1_finup(desc, data, len, out,
(sha1_transform_fn *) sha1_transform_avx);
}
static int sha1_avx_final(struct shash_desc *desc, u8 *out)
{
return sha1_avx_finup(desc, NULL, 0, out);
}
static struct shash_alg sha1_avx_alg = {
.digestsize = SHA1_DIGEST_SIZE,
.init = sha1_base_init,
.update = sha1_avx_update,
.final = sha1_avx_final,
.finup = sha1_avx_finup,
.descsize = sizeof(struct sha1_state),
.base = {
.cra_name = "sha1",
.cra_driver_name = "sha1-avx",
.cra_priority = 160,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
};
static bool avx_usable(void)
{
if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
if (boot_cpu_has(X86_FEATURE_AVX))
pr_info("AVX detected but unusable.\n");
return false;
}
return true;
}
static int register_sha1_avx(void)
{
if (avx_usable())
return crypto_register_shash(&sha1_avx_alg);
return 0;
}
static void unregister_sha1_avx(void)
{
if (avx_usable())
crypto_unregister_shash(&sha1_avx_alg);
}
#else /* CONFIG_AS_AVX */
static inline int register_sha1_avx(void) { return 0; }
static inline void unregister_sha1_avx(void) { }
#endif /* CONFIG_AS_AVX */
#if defined(CONFIG_AS_AVX2) && (CONFIG_AS_AVX)
#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */
asmlinkage void sha1_transform_avx2(u32 *digest, const char *data,
unsigned int rounds);
static bool avx2_usable(void)
{
if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2)
&& boot_cpu_has(X86_FEATURE_BMI1)
&& boot_cpu_has(X86_FEATURE_BMI2))
return true;
return false;
}
static void sha1_apply_transform_avx2(u32 *digest, const char *data,
unsigned int rounds)
{
/* Select the optimal transform based on data block size */
if (rounds >= SHA1_AVX2_BLOCK_OPTSIZE) /*covered*/
sha1_transform_avx2(digest, data, rounds); /*covered*/
else
sha1_transform_avx(digest, data, rounds); /*covered*/
} /*covered*/
static int sha1_avx2_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
return sha1_update(desc, data, len, /*covered*/
(sha1_transform_fn *) sha1_apply_transform_avx2);
}
static int sha1_avx2_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
return sha1_finup(desc, data, len, out, /*covered*/
(sha1_transform_fn *) sha1_apply_transform_avx2);
}
static int sha1_avx2_final(struct shash_desc *desc, u8 *out)
{
return sha1_avx2_finup(desc, NULL, 0, out); /*covered*/
}
static struct shash_alg sha1_avx2_alg = {
.digestsize = SHA1_DIGEST_SIZE,
.init = sha1_base_init,
.update = sha1_avx2_update,
.final = sha1_avx2_final,
.finup = sha1_avx2_finup,
.descsize = sizeof(struct sha1_state),
.base = {
.cra_name = "sha1",
.cra_driver_name = "sha1-avx2",
.cra_priority = 170,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
};
static int register_sha1_avx2(void)
{
if (avx2_usable())
return crypto_register_shash(&sha1_avx2_alg);
return 0;
}
static void unregister_sha1_avx2(void)
{
if (avx2_usable())
crypto_unregister_shash(&sha1_avx2_alg);
}
#else
static inline int register_sha1_avx2(void) { return 0; }
static inline void unregister_sha1_avx2(void) { }
#endif
#ifdef CONFIG_AS_SHA1_NI
asmlinkage void sha1_ni_transform(u32 *digest, const char *data,
unsigned int rounds);
static int sha1_ni_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
return sha1_update(desc, data, len,
(sha1_transform_fn *) sha1_ni_transform);
}
static int sha1_ni_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
return sha1_finup(desc, data, len, out,
(sha1_transform_fn *) sha1_ni_transform);
}
static int sha1_ni_final(struct shash_desc *desc, u8 *out)
{
return sha1_ni_finup(desc, NULL, 0, out);
}
static struct shash_alg sha1_ni_alg = {
.digestsize = SHA1_DIGEST_SIZE,
.init = sha1_base_init,
.update = sha1_ni_update,
.final = sha1_ni_final,
.finup = sha1_ni_finup,
.descsize = sizeof(struct sha1_state),
.base = {
.cra_name = "sha1",
.cra_driver_name = "sha1-ni",
.cra_priority = 250,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
};
static int register_sha1_ni(void)
{
if (boot_cpu_has(X86_FEATURE_SHA_NI))
return crypto_register_shash(&sha1_ni_alg);
return 0;
}
static void unregister_sha1_ni(void)
{
if (boot_cpu_has(X86_FEATURE_SHA_NI))
crypto_unregister_shash(&sha1_ni_alg);
}
#else
static inline int register_sha1_ni(void) { return 0; }
static inline void unregister_sha1_ni(void) { }
#endif
static int __init sha1_ssse3_mod_init(void)
{
if (register_sha1_ssse3())
goto fail;
if (register_sha1_avx()) {
unregister_sha1_ssse3();
goto fail;
}
if (register_sha1_avx2()) {
unregister_sha1_avx();
unregister_sha1_ssse3();
goto fail;
}
if (register_sha1_ni()) {
unregister_sha1_avx2();
unregister_sha1_avx();
unregister_sha1_ssse3();
goto fail;
}
return 0;
fail:
return -ENODEV;
}
static void __exit sha1_ssse3_mod_fini(void)
{
unregister_sha1_ni();
unregister_sha1_avx2();
unregister_sha1_avx();
unregister_sha1_ssse3();
}
module_init(sha1_ssse3_mod_init);
module_exit(sha1_ssse3_mod_fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, Supplemental SSE3 accelerated");
MODULE_ALIAS_CRYPTO("sha1");
MODULE_ALIAS_CRYPTO("sha1-ssse3");
MODULE_ALIAS_CRYPTO("sha1-avx");
MODULE_ALIAS_CRYPTO("sha1-avx2");
#ifdef CONFIG_AS_SHA1_NI
MODULE_ALIAS_CRYPTO("sha1-ni");
#endif
/*
* Cryptographic API.
*
* Glue code for the SHA256 Secure Hash Algorithm assembler
* implementation using supplemental SSE3 / AVX / AVX2 instructions.
*
* This file is based on sha256_generic.c
*
* Copyright (C) 2013 Intel Corporation.
*
* Author:
* Tim Chen <tim.c.chen@linux.intel.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <crypto/internal/hash.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/cryptohash.h>
#include <linux/types.h>
#include <crypto/sha.h>
#include <crypto/sha256_base.h>
#include <asm/fpu/api.h>
#include <linux/string.h>
asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data,
u64 rounds);
typedef void (sha256_transform_fn)(u32 *digest, const char *data, u64 rounds);
static int sha256_update(struct shash_desc *desc, const u8 *data, /*covered*/
unsigned int len, sha256_transform_fn *sha256_xform)
{
struct sha256_state *sctx = shash_desc_ctx(desc);
if (!irq_fpu_usable() || /*covered*/
(sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE) /*covered*/
return crypto_sha256_update(desc, data, len); /*covered*/
/* make sure casting to sha256_block_fn() is safe */
BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0);
kernel_fpu_begin(); /*covered*/
sha256_base_do_update(desc, data, len, /*covered*/
(sha256_block_fn *)sha256_xform);
kernel_fpu_end(); /*covered*/
return 0;
}
static int sha256_finup(struct shash_desc *desc, const u8 *data, /*covered*/
unsigned int len, u8 *out, sha256_transform_fn *sha256_xform)
{
if (!irq_fpu_usable()) /*covered*/
return crypto_sha256_finup(desc, data, len, out);
kernel_fpu_begin(); /*covered*/
if (len)
sha256_base_do_update(desc, data, len,
(sha256_block_fn *)sha256_xform);
sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_xform); /*covered*/
kernel_fpu_end();
return sha256_base_finish(desc, out); /*covered*/
}
static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
return sha256_update(desc, data, len, sha256_transform_ssse3);
}
static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
return sha256_finup(desc, data, len, out, sha256_transform_ssse3);
}
/* Add padding and return the message digest. */
static int sha256_ssse3_final(struct shash_desc *desc, u8 *out)
{
return sha256_ssse3_finup(desc, NULL, 0, out);
}
static struct shash_alg sha256_ssse3_algs[] = { {
.digestsize = SHA256_DIGEST_SIZE,
.init = sha256_base_init,
.update = sha256_ssse3_update,
.final = sha256_ssse3_final,
.finup = sha256_ssse3_finup,
.descsize = sizeof(struct sha256_state),
.base = {
.cra_name = "sha256",
.cra_driver_name = "sha256-ssse3",
.cra_priority = 150,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
}, {
.digestsize = SHA224_DIGEST_SIZE,
.init = sha224_base_init,
.update = sha256_ssse3_update,
.final = sha256_ssse3_final,
.finup = sha256_ssse3_finup,
.descsize = sizeof(struct sha256_state),
.base = {
.cra_name = "sha224",
.cra_driver_name = "sha224-ssse3",
.cra_priority = 150,
.cra_blocksize = SHA224_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
} };
static int register_sha256_ssse3(void)
{
if (boot_cpu_has(X86_FEATURE_SSSE3))
return crypto_register_shashes(sha256_ssse3_algs,
ARRAY_SIZE(sha256_ssse3_algs));
return 0;
}
static void unregister_sha256_ssse3(void)
{
if (boot_cpu_has(X86_FEATURE_SSSE3))
crypto_unregister_shashes(sha256_ssse3_algs,
ARRAY_SIZE(sha256_ssse3_algs));
}
#ifdef CONFIG_AS_AVX
asmlinkage void sha256_transform_avx(u32 *digest, const char *data,
u64 rounds);
static int sha256_avx_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
return sha256_update(desc, data, len, sha256_transform_avx);
}
static int sha256_avx_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
return sha256_finup(desc, data, len, out, sha256_transform_avx);
}
static int sha256_avx_final(struct shash_desc *desc, u8 *out)
{
return sha256_avx_finup(desc, NULL, 0, out);
}
static struct shash_alg sha256_avx_algs[] = { {
.digestsize = SHA256_DIGEST_SIZE,
.init = sha256_base_init,
.update = sha256_avx_update,
.final = sha256_avx_final,
.finup = sha256_avx_finup,
.descsize = sizeof(struct sha256_state),
.base = {
.cra_name = "sha256",
.cra_driver_name = "sha256-avx",
.cra_priority = 160,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
}, {
.digestsize = SHA224_DIGEST_SIZE,
.init = sha224_base_init,
.update = sha256_avx_update,
.final = sha256_avx_final,
.finup = sha256_avx_finup,
.descsize = sizeof(struct sha256_state),
.base = {
.cra_name = "sha224",
.cra_driver_name = "sha224-avx",
.cra_priority = 160,
.cra_blocksize = SHA224_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
} };
static bool avx_usable(void)
{
if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
if (boot_cpu_has(X86_FEATURE_AVX))
pr_info("AVX detected but unusable.\n");
return false;
}
return true;
}
static int register_sha256_avx(void)
{
if (avx_usable())
return crypto_register_shashes(sha256_avx_algs,
ARRAY_SIZE(sha256_avx_algs));
return 0;
}
static void unregister_sha256_avx(void)
{
if (avx_usable())
crypto_unregister_shashes(sha256_avx_algs,
ARRAY_SIZE(sha256_avx_algs));
}
#else
static inline int register_sha256_avx(void) { return 0; }
static inline void unregister_sha256_avx(void) { }
#endif
#if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX)
asmlinkage void sha256_transform_rorx(u32 *digest, const char *data,
u64 rounds);
static int sha256_avx2_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
return sha256_update(desc, data, len, sha256_transform_rorx); /*covered*/
}
static int sha256_avx2_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
return sha256_finup(desc, data, len, out, sha256_transform_rorx); /*covered*/
}
static int sha256_avx2_final(struct shash_desc *desc, u8 *out)
{
return sha256_avx2_finup(desc, NULL, 0, out); /*covered*/
}
static struct shash_alg sha256_avx2_algs[] = { {
.digestsize = SHA256_DIGEST_SIZE,
.init = sha256_base_init,
.update = sha256_avx2_update,
.final = sha256_avx2_final,
.finup = sha256_avx2_finup,
.descsize = sizeof(struct sha256_state),
.base = {
.cra_name = "sha256",
.cra_driver_name = "sha256-avx2",
.cra_priority = 170,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
}, {
.digestsize = SHA224_DIGEST_SIZE,
.init = sha224_base_init,
.update = sha256_avx2_update,
.final = sha256_avx2_final,
.finup = sha256_avx2_finup,
.descsize = sizeof(struct sha256_state),
.base = {
.cra_name = "sha224",
.cra_driver_name = "sha224-avx2",
.cra_priority = 170,
.cra_blocksize = SHA224_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
} };
static bool avx2_usable(void)
{
if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) &&
boot_cpu_has(X86_FEATURE_BMI2))
return true;
return false;
}
static int register_sha256_avx2(void)
{
if (avx2_usable())
return crypto_register_shashes(sha256_avx2_algs,
ARRAY_SIZE(sha256_avx2_algs));
return 0;
}
static void unregister_sha256_avx2(void)
{
if (avx2_usable())
crypto_unregister_shashes(sha256_avx2_algs,
ARRAY_SIZE(sha256_avx2_algs));
}
#else
static inline int register_sha256_avx2(void) { return 0; }
static inline void unregister_sha256_avx2(void) { }
#endif
#ifdef CONFIG_AS_SHA256_NI
asmlinkage void sha256_ni_transform(u32 *digest, const char *data,
u64 rounds); /*unsigned int rounds);*/
static int sha256_ni_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
return sha256_update(desc, data, len, sha256_ni_transform);
}
static int sha256_ni_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
return sha256_finup(desc, data, len, out, sha256_ni_transform);
}
static int sha256_ni_final(struct shash_desc *desc, u8 *out)
{
return sha256_ni_finup(desc, NULL, 0, out);
}
static struct shash_alg sha256_ni_algs[] = { {
.digestsize = SHA256_DIGEST_SIZE,
.init = sha256_base_init,
.update = sha256_ni_update,
.final = sha256_ni_final,
.finup = sha256_ni_finup,
.descsize = sizeof(struct sha256_state),
.base = {
.cra_name = "sha256",
.cra_driver_name = "sha256-ni",
.cra_priority = 250,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
}, {
.digestsize = SHA224_DIGEST_SIZE,
.init = sha224_base_init,
.update = sha256_ni_update,
.final = sha256_ni_final,
.finup = sha256_ni_finup,
.descsize = sizeof(struct sha256_state),
.base = {
.cra_name = "sha224",
.cra_driver_name = "sha224-ni",
.cra_priority = 250,
.cra_blocksize = SHA224_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
} };
static int register_sha256_ni(void)
{
if (boot_cpu_has(X86_FEATURE_SHA_NI))
return crypto_register_shashes(sha256_ni_algs,
ARRAY_SIZE(sha256_ni_algs));
return 0;
}
static void unregister_sha256_ni(void)
{
if (boot_cpu_has(X86_FEATURE_SHA_NI))
crypto_unregister_shashes(sha256_ni_algs,
ARRAY_SIZE(sha256_ni_algs));
}
#else
static inline int register_sha256_ni(void) { return 0; }
static inline void unregister_sha256_ni(void) { }
#endif
static int __init sha256_ssse3_mod_init(void)
{
if (register_sha256_ssse3())
goto fail;
if (register_sha256_avx()) {
unregister_sha256_ssse3();
goto fail;
}
if (register_sha256_avx2()) {
unregister_sha256_avx();
unregister_sha256_ssse3();
goto fail;
}
if (register_sha256_ni()) {
unregister_sha256_avx2();
unregister_sha256_avx();
unregister_sha256_ssse3();
goto fail;
}
return 0;
fail:
return -ENODEV;
}
static void __exit sha256_ssse3_mod_fini(void)
{
unregister_sha256_ni();
unregister_sha256_avx2();
unregister_sha256_avx();
unregister_sha256_ssse3();
}
module_init(sha256_ssse3_mod_init);
module_exit(sha256_ssse3_mod_fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated");
MODULE_ALIAS_CRYPTO("sha256");
MODULE_ALIAS_CRYPTO("sha256-ssse3");
MODULE_ALIAS_CRYPTO("sha256-avx");
MODULE_ALIAS_CRYPTO("sha256-avx2");
MODULE_ALIAS_CRYPTO("sha224");
MODULE_ALIAS_CRYPTO("sha224-ssse3");
MODULE_ALIAS_CRYPTO("sha224-avx");
MODULE_ALIAS_CRYPTO("sha224-avx2");
#ifdef CONFIG_AS_SHA256_NI
MODULE_ALIAS_CRYPTO("sha256-ni");
MODULE_ALIAS_CRYPTO("sha224-ni");
#endif
/*
* common.c - C code for kernel entry and exit
* Copyright (c) 2015 Andrew Lutomirski
* GPL v2
*
* Based on asm and ptrace code by many authors. The code here originated
* in ptrace.c and signal.c.
*/
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/tracehook.h>
#include <linux/audit.h>
#include <linux/seccomp.h>
#include <linux/signal.h>
#include <linux/export.h>
#include <linux/context_tracking.h>
#include <linux/user-return-notifier.h>
#include <linux/nospec.h>
#include <linux/uprobes.h>
#include <linux/livepatch.h>
#include <linux/syscalls.h>
#include <asm/desc.h>
#include <asm/traps.h>
#include <asm/vdso.h>
#include <linux/uaccess.h>
#include <asm/cpufeature.h>
#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
#ifdef CONFIG_CONTEXT_TRACKING
/* Called on entry from user mode with IRQs off. */
__visible inline void enter_from_user_mode(void)
{
CT_WARN_ON(ct_state() != CONTEXT_USER);
user_exit_irqoff();
}
#else
static inline void enter_from_user_mode(void) {}
#endif
static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
{
#ifdef CONFIG_X86_64
if (arch == AUDIT_ARCH_X86_64) { /*covered*/
audit_syscall_entry(regs->orig_ax, regs->di, /*covered*/
regs->si, regs->dx, regs->r10);
} else
#endif
{
audit_syscall_entry(regs->orig_ax, regs->bx,
regs->cx, regs->dx, regs->si);
}
}
/*
* Returns the syscall nr to run (which should match regs->orig_ax) or -1
* to skip the syscall.
*/
static long syscall_trace_enter(struct pt_regs *regs)
{
u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; /*covered*/
struct thread_info *ti = current_thread_info(); /*covered*/
unsigned long ret = 0;
bool emulated = false;
u32 work;
if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
BUG_ON(regs != task_pt_regs(current));
work = READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY;
if (unlikely(work & _TIF_SYSCALL_EMU))
emulated = true;
if ((emulated || (work & _TIF_SYSCALL_TRACE)) && /*covered*/
tracehook_report_syscall_entry(regs))
return -1L;
if (emulated)
return -1L;
#ifdef CONFIG_SECCOMP
/*
* Do seccomp after ptrace, to catch any tracer changes.
*/
if (work & _TIF_SECCOMP) { /*covered*/
struct seccomp_data sd;
sd.arch = arch;
sd.nr = regs->orig_ax;
sd.instruction_pointer = regs->ip;
#ifdef CONFIG_X86_64
if (arch == AUDIT_ARCH_X86_64) {
sd.args[0] = regs->di;
sd.args[1] = regs->si;
sd.args[2] = regs->dx;
sd.args[3] = regs->r10;
sd.args[4] = regs->r8;
sd.args[5] = regs->r9;
} else
#endif
{
sd.args[0] = regs->bx;
sd.args[1] = regs->cx;
sd.args[2] = regs->dx;
sd.args[3] = regs->si;
sd.args[4] = regs->di;
sd.args[5] = regs->bp;
}
ret = __secure_computing(&sd);
if (ret == -1)
return ret;
}
#endif
if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) /*covered*/
trace_sys_enter(regs, regs->orig_ax); /*covered*/
do_audit_syscall_entry(regs, arch); /*covered*/
return ret ?: regs->orig_ax; /*covered*/
}
#define EXIT_TO_USERMODE_LOOP_FLAGS \
(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
_TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING)
static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
{
/*
* In order to return to user mode, we need to have IRQs off with
* none of EXIT_TO_USERMODE_LOOP_FLAGS set. Several of these flags
* can be set at any time on preemptible kernels if we have IRQs on,
* so we need to loop. Disabling preemption wouldn't help: doing the
* work to clear some of the flags can sleep.
*/
while (true) {
/* We have work to do. */
local_irq_enable(); /*covered*/
if (cached_flags & _TIF_NEED_RESCHED)
schedule(); /*covered*/
if (cached_flags & _TIF_UPROBE) /*covered*/
uprobe_notify_resume(regs);
if (cached_flags & _TIF_PATCH_PENDING)
klp_update_patch_state(current); /*covered*/
/* deal with pending signal delivery */
if (cached_flags & _TIF_SIGPENDING) /*covered*/
do_signal(regs);
if (cached_flags & _TIF_NOTIFY_RESUME) { /*covered*/
clear_thread_flag(TIF_NOTIFY_RESUME); /*covered*/
tracehook_notify_resume(regs); /*covered*/
rseq_handle_notify_resume(NULL, regs);
}
if (cached_flags & _TIF_USER_RETURN_NOTIFY) /*covered*/
fire_user_return_notifiers();
/* Disable IRQs and retry */
local_irq_disable(); /*covered*/
cached_flags = READ_ONCE(current_thread_info()->flags);
if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
break;
}
} /*covered*/
/* Called with IRQs disabled. */
__visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
{
struct thread_info *ti = current_thread_info(); /*covered*/
u32 cached_flags;
addr_limit_user_check(); /*covered*/
lockdep_assert_irqs_disabled(); /*covered*/
lockdep_sys_exit(); /*covered*/
cached_flags = READ_ONCE(ti->flags);
if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
exit_to_usermode_loop(regs, cached_flags); /*covered*/
#ifdef CONFIG_COMPAT
/*
* Compat syscalls set TS_COMPAT. Make sure we clear it before
* returning to user mode. We need to clear it *after* signal
* handling, because syscall restart has a fixup for compat
* syscalls. The fixup is exercised by the ptrace_syscall_32
* selftest.
*
* We also need to clear TS_REGS_POKED_I386: the 32-bit tracer
* special case only applies after poking regs and before the
* very next return to user mode.
*/
ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED); /*covered*/
#endif
user_enter_irqoff();
}
#define SYSCALL_EXIT_WORK_FLAGS \
(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
_TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)
static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags)
{
bool step;
audit_syscall_exit(regs); /*covered*/
if (cached_flags & _TIF_SYSCALL_TRACEPOINT) /*covered*/
trace_sys_exit(regs, regs->ax); /*covered*/
/*
* If TIF_SYSCALL_EMU is set, we only get here because of
* TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
* We already reported this syscall instruction in
* syscall_trace_enter().
*/
step = unlikely( /*covered*/
(cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU))
== _TIF_SINGLESTEP);
if (step || cached_flags & _TIF_SYSCALL_TRACE) /*covered*/
tracehook_report_syscall_exit(regs, step);
} /*covered*/
/*
* Called with IRQs on and fully valid regs. Returns with IRQs off in a
* state such that we can immediately switch to user mode.
*/
__visible inline void syscall_return_slowpath(struct pt_regs *regs)
{
struct thread_info *ti = current_thread_info(); /*covered*/
u32 cached_flags = READ_ONCE(ti->flags);
CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
WARN(irqs_disabled(), "syscall %ld left IRQs disabled", regs->orig_ax)) /*covered*/
local_irq_enable();
rseq_syscall(regs);
/*
* First do one-time work. If these work items are enabled, we
* want to run them exactly once per syscall exit with IRQs on.
*/
if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS)) /*covered*/
syscall_slow_exit_work(regs, cached_flags); /*covered*/
local_irq_disable(); /*covered*/
prepare_exit_to_usermode(regs); /*covered*/
}
#ifdef CONFIG_X86_64
__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
{
struct thread_info *ti;
enter_from_user_mode();
local_irq_enable(); /*covered*/
ti = current_thread_info();
if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
nr = syscall_trace_enter(regs); /*covered*/
/*
* NB: Native and x32 syscalls are dispatched from the same
* table. The only functional difference is the x32 bit in
* regs->orig_ax, which changes the behavior of some syscalls.
*/
nr &= __SYSCALL_MASK; /*covered*/
if (likely(nr < NR_syscalls)) {
nr = array_index_nospec(nr, NR_syscalls); /*covered*/
regs->ax = sys_call_table[nr](regs);
}
syscall_return_slowpath(regs); /*covered*/
}
#endif
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
/*
* Does a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. Does
* all entry and exit work and returns with IRQs off. This function is
* extremely hot in workloads that use it, and it's usually called from
* do_fast_syscall_32, so forcibly inline it to improve performance.
*/
static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
{
struct thread_info *ti = current_thread_info();
unsigned int nr = (unsigned int)regs->orig_ax;
#ifdef CONFIG_IA32_EMULATION
ti->status |= TS_COMPAT;
#endif
if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
/*
* Subtlety here: if ptrace pokes something larger than
* 2^32-1 into orig_ax, this truncates it. This may or
* may not be necessary, but it matches the old asm
* behavior.
*/
nr = syscall_trace_enter(regs);
}
if (likely(nr < IA32_NR_syscalls)) {
nr = array_index_nospec(nr, IA32_NR_syscalls);
#ifdef CONFIG_IA32_EMULATION
regs->ax = ia32_sys_call_table[nr](regs);
#else
/*
* It's possible that a 32-bit syscall implementation
* takes a 64-bit parameter but nonetheless assumes that
* the high bits are zero. Make sure we zero-extend all
* of the args.
*/
regs->ax = ia32_sys_call_table[nr](
(unsigned int)regs->bx, (unsigned int)regs->cx,
(unsigned int)regs->dx, (unsigned int)regs->si,
(unsigned int)regs->di, (unsigned int)regs->bp);
#endif /* CONFIG_IA32_EMULATION */
}
syscall_return_slowpath(regs);
}
/* Handles int $0x80 */
__visible void do_int80_syscall_32(struct pt_regs *regs)
{
enter_from_user_mode();
local_irq_enable();
do_syscall_32_irqs_on(regs);
}
/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible long do_fast_syscall_32(struct pt_regs *regs)
{
/*
* Called using the internal vDSO SYSENTER/SYSCALL32 calling
* convention. Adjust regs so it looks like we entered using int80.
*/
unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
vdso_image_32.sym_int80_landing_pad;
/*
* SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
* so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
* Fix it up.
*/
regs->ip = landing_pad;
enter_from_user_mode();
local_irq_enable();
/* Fetch EBP from where the vDSO stashed it. */
if (
#ifdef CONFIG_X86_64
/*
* Micro-optimization: the pointer we're following is explicitly
* 32 bits, so it can't be out of range.
*/
__get_user(*(u32 *)®s->bp,
(u32 __user __force *)(unsigned long)(u32)regs->sp)
#else
get_user(*(u32 *)®s->bp,
(u32 __user __force *)(unsigned long)(u32)regs->sp)
#endif
) {
/* User code screwed up. */
local_irq_disable();
regs->ax = -EFAULT;
prepare_exit_to_usermode(regs);
return 0; /* Keep it simple: use IRET. */
}
/* Now this is just like a normal syscall. */
do_syscall_32_irqs_on(regs);
#ifdef CONFIG_X86_64
/*
* Opportunistic SYSRETL: if possible, try to return using SYSRETL.
* SYSRETL is available on all 64-bit CPUs, so we don't need to
* bother with SYSEXIT.
*
* Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
* because the ECX fixup above will ensure that this is essentially
* never the case.
*/
return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
regs->ip == landing_pad &&
(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
#else
/*
* Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
*
* Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
* because the ECX fixup above will ensure that this is essentially
* never the case.
*
* We don't allow syscalls at all from VM86 mode, but we still
* need to check VM, because we might be returning from sys_vm86.
*/
return static_cpu_has(X86_FEATURE_SEP) &&
regs->cs == __USER_CS && regs->ss == __USER_DS &&
regs->ip == landing_pad &&
(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
#endif
}
#endif
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2012-2014 Andy Lutomirski <luto@amacapital.net>
*
* Based on the original implementation which is:
* Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
* Copyright 2003 Andi Kleen, SuSE Labs.
*
* Parts of the original code have been moved to arch/x86/vdso/vma.c
*
* This file implements vsyscall emulation. vsyscalls are a legacy ABI:
* Userspace can request certain kernel services by calling fixed
* addresses. This concept is problematic:
*
* - It interferes with ASLR.
* - It's awkward to write code that lives in kernel addresses but is
* callable by userspace at fixed addresses.
* - The whole concept is impossible for 32-bit compat userspace.
* - UML cannot easily virtualize a vsyscall.
*
* As of mid-2014, I believe that there is no new userspace code that
* will use a vsyscall if the vDSO is present. I hope that there will
* soon be no new userspace code that will ever use a vsyscall.
*
* The code in this file emulates vsyscalls when notified of a page
* fault to a vsyscall address.
*/
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/sched/signal.h>
#include <linux/mm_types.h>
#include <linux/syscalls.h>
#include <linux/ratelimit.h>
#include <asm/vsyscall.h>
#include <asm/unistd.h>
#include <asm/fixmap.h>
#include <asm/traps.h>
#include <asm/paravirt.h>
#define CREATE_TRACE_POINTS
#include "vsyscall_trace.h"
static enum { EMULATE, NONE } vsyscall_mode =
#ifdef CONFIG_LEGACY_VSYSCALL_NONE
NONE;
#else
EMULATE;
#endif
static int __init vsyscall_setup(char *str)
{
if (str) {
if (!strcmp("emulate", str))
vsyscall_mode = EMULATE;
else if (!strcmp("none", str))
vsyscall_mode = NONE;
else
return -EINVAL;
return 0;
}
return -EINVAL;
}
early_param("vsyscall", vsyscall_setup);
static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
const char *message)
{
if (!show_unhandled_signals)
return;
printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
level, current->comm, task_pid_nr(current),
message, regs->ip, regs->cs,
regs->sp, regs->ax, regs->si, regs->di);
}
static int addr_to_vsyscall_nr(unsigned long addr)
{
int nr;
if ((addr & ~0xC00UL) != VSYSCALL_ADDR)
return -EINVAL;
nr = (addr & 0xC00UL) >> 10;
if (nr >= 3)
return -EINVAL;
return nr;
}
static bool write_ok_or_segv(unsigned long ptr, size_t size)
{
/*
* XXX: if access_ok, get_user, and put_user handled
* sig_on_uaccess_err, this could go away.
*/
if (!access_ok((void __user *)ptr, size)) {
struct thread_struct *thread = ¤t->thread;
thread->error_code = X86_PF_USER | X86_PF_WRITE;
thread->cr2 = ptr;
thread->trap_nr = X86_TRAP_PF;
force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)ptr, current);
return false;
} else {
return true;
}
}
bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
{
struct task_struct *tsk;
unsigned long caller;
int vsyscall_nr, syscall_nr, tmp;
int prev_sig_on_uaccess_err;
long ret;
unsigned long orig_dx;
/*
* No point in checking CS -- the only way to get here is a user mode
* trap to a high address, which means that we're in 64-bit user code.
*/
WARN_ON_ONCE(address != regs->ip);
if (vsyscall_mode == NONE) {
warn_bad_vsyscall(KERN_INFO, regs,
"vsyscall attempted with vsyscall=none");
return false;
}
vsyscall_nr = addr_to_vsyscall_nr(address);
trace_emulate_vsyscall(vsyscall_nr);
if (vsyscall_nr < 0) {
warn_bad_vsyscall(KERN_WARNING, regs,
"misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround");
goto sigsegv;
}
if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
warn_bad_vsyscall(KERN_WARNING, regs,
"vsyscall with bad stack (exploit attempt?)");
goto sigsegv;
}
tsk = current;
/*
* Check for access_ok violations and find the syscall nr.
*
* NULL is a valid user pointer (in the access_ok sense) on 32-bit and
* 64-bit, so we don't need to special-case it here. For all the
* vsyscalls, NULL means "don't write anything" not "write it at
* address 0".
*/
switch (vsyscall_nr) {
case 0:
if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
!write_ok_or_segv(regs->si, sizeof(struct timezone))) {
ret = -EFAULT;
goto check_fault;
}
syscall_nr = __NR_gettimeofday;
break;
case 1:
if (!write_ok_or_segv(regs->di, sizeof(time_t))) {
ret = -EFAULT;
goto check_fault;
}
syscall_nr = __NR_time;
break;
case 2:
if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
!write_ok_or_segv(regs->si, sizeof(unsigned))) {
ret = -EFAULT;
goto check_fault;
}
syscall_nr = __NR_getcpu;
break;
}
/*
* Handle seccomp. regs->ip must be the original value.
* See seccomp_send_sigsys and Documentation/userspace-api/seccomp_filter.rst.
*
* We could optimize the seccomp disabled case, but performance
* here doesn't matter.
*/
regs->orig_ax = syscall_nr;
regs->ax = -ENOSYS;
tmp = secure_computing(NULL);
if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
warn_bad_vsyscall(KERN_DEBUG, regs,
"seccomp tried to change syscall nr or ip");
do_exit(SIGSYS);
}
regs->orig_ax = -1;
if (tmp)
goto do_ret; /* skip requested */
/*
* With a real vsyscall, page faults cause SIGSEGV. We want to
* preserve that behavior to make writing exploits harder.
*/
prev_sig_on_uaccess_err = current->thread.sig_on_uaccess_err;
current->thread.sig_on_uaccess_err = 1;
ret = -EFAULT;
switch (vsyscall_nr) {
case 0:
/* this decodes regs->di and regs->si on its own */
ret = __x64_sys_gettimeofday(regs);
break;
case 1:
/* this decodes regs->di on its own */
ret = __x64_sys_time(regs);
break;
case 2:
/* while we could clobber regs->dx, we didn't in the past... */
orig_dx = regs->dx;
regs->dx = 0;
/* this decodes regs->di, regs->si and regs->dx on its own */
ret = __x64_sys_getcpu(regs);
regs->dx = orig_dx;
break;
}
current->thread.sig_on_uaccess_err = prev_sig_on_uaccess_err;
check_fault:
if (ret == -EFAULT) {
/* Bad news -- userspace fed a bad pointer to a vsyscall. */
warn_bad_vsyscall(KERN_INFO, regs,
"vsyscall fault (exploit attempt?)");
/*
* If we failed to generate a signal for any reason,
* generate one here. (This should be impossible.)
*/
if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
!sigismember(&tsk->pending.signal, SIGSEGV)))
goto sigsegv;
return true; /* Don't emulate the ret. */
}
regs->ax = ret;
do_ret:
/* Emulate a ret instruction. */
regs->ip = caller;
regs->sp += 8;
return true;
sigsegv:
force_sig(SIGSEGV, current);
return true;
}
/*
* A pseudo VMA to allow ptrace access for the vsyscall page. This only
* covers the 64bit vsyscall page now. 32bit has a real VMA now and does
* not need special handling anymore:
*/
static const char *gate_vma_name(struct vm_area_struct *vma)
{
return "[vsyscall]";
}
static const struct vm_operations_struct gate_vma_ops = {
.name = gate_vma_name,
};
static struct vm_area_struct gate_vma = {
.vm_start = VSYSCALL_ADDR,
.vm_end = VSYSCALL_ADDR + PAGE_SIZE,
.vm_page_prot = PAGE_READONLY_EXEC,
.vm_flags = VM_READ | VM_EXEC,
.vm_ops = &gate_vma_ops,
};
struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
#ifdef CONFIG_COMPAT
if (!mm || mm->context.ia32_compat) /*covered*/
return NULL;
#endif
if (vsyscall_mode == NONE) /*covered*/
return NULL;
return &gate_vma;
}
int in_gate_area(struct mm_struct *mm, unsigned long addr)
{
struct vm_area_struct *vma = get_gate_vma(mm); /*covered*/
if (!vma)
return 0;
return (addr >= vma->vm_start) && (addr < vma->vm_end); /*covered*/
}
/*
* Use this when you have no reliable mm, typically from interrupt
* context. It is less reliable than using a task's mm and may give
* false positives.
*/
int in_gate_area_no_mm(unsigned long addr)
{
return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR;
}
/*
* The VSYSCALL page is the only user-accessible page in the kernel address
* range. Normally, the kernel page tables can have _PAGE_USER clear, but
* the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls
* are enabled.
*
* Some day we may create a "minimal" vsyscall mode in which we emulate
* vsyscalls but leave the page not present. If so, we skip calling
* this.
*/
void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
p4d = p4d_offset(pgd, VSYSCALL_ADDR);
#if CONFIG_PGTABLE_LEVELS >= 5
set_p4d(p4d, __p4d(p4d_val(*p4d) | _PAGE_USER));
#endif
pud = pud_offset(p4d, VSYSCALL_ADDR);
set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER));
pmd = pmd_offset(pud, VSYSCALL_ADDR);
set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER));
}
void __init map_vsyscall(void)
{
extern char __vsyscall_page;
unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
if (vsyscall_mode != NONE) {
__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
PAGE_KERNEL_VVAR);
set_vsyscall_pgtable_user_bits(swapper_pg_dir);
}
BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
(unsigned long)VSYSCALL_ADDR);
}
/*
* Performance events x86 architecture code
*
* Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
* Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
* Copyright (C) 2009 Jaswinder Singh Rajput
* Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
* Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
* Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
* Copyright (C) 2009 Google, Inc., Stephane Eranian
*
* For licencing details see kernel-base/COPYING
*/
#include <linux/perf_event.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/kdebug.h>
#include <linux/sched/mm.h>
#include <linux/sched/clock.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/device.h>
#include <linux/nospec.h>
#include <asm/apic.h>
#include <asm/stacktrace.h>
#include <asm/nmi.h>
#include <asm/smp.h>
#include <asm/alternative.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
#include <asm/timer.h>
#include <asm/desc.h>
#include <asm/ldt.h>
#include <asm/unwind.h>
#include "perf_event.h"
struct x86_pmu x86_pmu __read_mostly;
DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
.enabled = 1,
};
DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key);
u64 __read_mostly hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX];
u64 __read_mostly hw_cache_extra_regs
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX];
/*
* Propagate event elapsed time into the generic event.
* Can only be executed on the CPU where the event is active.
* Returns the delta events processed.
*/
u64 x86_perf_event_update(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
int shift = 64 - x86_pmu.cntval_bits;
u64 prev_raw_count, new_raw_count;
int idx = hwc->idx;
u64 delta;
if (idx == INTEL_PMC_IDX_FIXED_BTS)
return 0;
/*
* Careful: an NMI might modify the previous event value.
*
* Our tactic to handle this is to first atomically read and
* exchange a new raw count - then add that new-prev delta
* count to the generic event atomically:
*/
again:
prev_raw_count = local64_read(&hwc->prev_count);
rdpmcl(hwc->event_base_rdpmc, new_raw_count);
if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
new_raw_count) != prev_raw_count)
goto again;
/*
* Now we have the new raw value and have updated the prev
* timestamp already. We can now calculate the elapsed delta
* (event-)time and add that to the generic event.
*
* Careful, not all hw sign-extends above the physical width
* of the count.
*/
delta = (new_raw_count << shift) - (prev_raw_count << shift);
delta >>= shift;
local64_add(delta, &event->count);
local64_sub(delta, &hwc->period_left);
return new_raw_count;
}
/*
* Find and validate any extra registers to set up.
*/
static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
{
struct hw_perf_event_extra *reg;
struct extra_reg *er;
reg = &event->hw.extra_reg;
if (!x86_pmu.extra_regs)
return 0;
for (er = x86_pmu.extra_regs; er->msr; er++) {
if (er->event != (config & er->config_mask))
continue;
if (event->attr.config1 & ~er->valid_mask)
return -EINVAL;
/* Check if the extra msrs can be safely accessed*/
if (!er->extra_msr_access)
return -ENXIO;
reg->idx = er->idx;
reg->config = event->attr.config1;
reg->reg = er->msr;
break;
}
return 0;
}
static atomic_t active_events;
static atomic_t pmc_refcount;
static DEFINE_MUTEX(pmc_reserve_mutex);
#ifdef CONFIG_X86_LOCAL_APIC
static bool reserve_pmc_hardware(void)
{
int i;
for (i = 0; i < x86_pmu.num_counters; i++) {
if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
goto perfctr_fail;
}
for (i = 0; i < x86_pmu.num_counters; i++) {
if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
goto eventsel_fail;
}
return true;
eventsel_fail:
for (i--; i >= 0; i--)
release_evntsel_nmi(x86_pmu_config_addr(i));
i = x86_pmu.num_counters;
perfctr_fail:
for (i--; i >= 0; i--)
release_perfctr_nmi(x86_pmu_event_addr(i));
return false;
}
static void release_pmc_hardware(void)
{
int i;
for (i = 0; i < x86_pmu.num_counters; i++) {
release_perfctr_nmi(x86_pmu_event_addr(i));
release_evntsel_nmi(x86_pmu_config_addr(i));
}
}
#else
static bool reserve_pmc_hardware(void) { return true; }
static void release_pmc_hardware(void) {}
#endif
static bool check_hw_exists(void)
{
u64 val, val_fail = -1, val_new= ~0;
int i, reg, reg_fail = -1, ret = 0;
int bios_fail = 0;
int reg_safe = -1;
/*
* Check to see if the BIOS enabled any of the counters, if so
* complain and bail.
*/
for (i = 0; i < x86_pmu.num_counters; i++) {
reg = x86_pmu_config_addr(i);
ret = rdmsrl_safe(reg, &val);
if (ret)
goto msr_fail;
if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
bios_fail = 1;
val_fail = val;
reg_fail = reg;
} else {
reg_safe = i;
}
}
if (x86_pmu.num_counters_fixed) {
reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
ret = rdmsrl_safe(reg, &val);
if (ret)
goto msr_fail;
for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
if (val & (0x03 << i*4)) {
bios_fail = 1;
val_fail = val;
reg_fail = reg;
}
}
}
/*
* If all the counters are enabled, the below test will always
* fail. The tools will also become useless in this scenario.
* Just fail and disable the hardware counters.
*/
if (reg_safe == -1) {
reg = reg_safe;
goto msr_fail;
}
/*
* Read the current value, change it and read it back to see if it
* matches, this is needed to detect certain hardware emulators
* (qemu/kvm) that don't trap on the MSR access and always return 0s.
*/
reg = x86_pmu_event_addr(reg_safe);
if (rdmsrl_safe(reg, &val))
goto msr_fail;
val ^= 0xffffUL;
ret = wrmsrl_safe(reg, val);
ret |= rdmsrl_safe(reg, &val_new);
if (ret || val != val_new)
goto msr_fail;
/*
* We still allow the PMU driver to operate:
*/
if (bios_fail) {
pr_cont("Broken BIOS detected, complain to your hardware vendor.\n");
pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n",
reg_fail, val_fail);
}
return true;
msr_fail:
if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
pr_cont("PMU not available due to virtualization, using software events only.\n");
} else {
pr_cont("Broken PMU hardware detected, using software events only.\n");
pr_err("Failed to access perfctr msr (MSR %x is %Lx)\n",
reg, val_new);
}
return false;
}
static void hw_perf_event_destroy(struct perf_event *event)
{
x86_release_hardware();
atomic_dec(&active_events);
}
void hw_perf_lbr_event_destroy(struct perf_event *event)
{
hw_perf_event_destroy(event);
/* undo the lbr/bts event accounting */
x86_del_exclusive(x86_lbr_exclusive_lbr);
}
static inline int x86_pmu_initialized(void)
{
return x86_pmu.handle_irq != NULL;
}
static inline int
set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
{
struct perf_event_attr *attr = &event->attr;
unsigned int cache_type, cache_op, cache_result;
u64 config, val;
config = attr->config;
cache_type = (config >> 0) & 0xff;
if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
return -EINVAL;
cache_type = array_index_nospec(cache_type, PERF_COUNT_HW_CACHE_MAX);
cache_op = (config >> 8) & 0xff;
if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
return -EINVAL;
cache_op = array_index_nospec(cache_op, PERF_COUNT_HW_CACHE_OP_MAX);
cache_result = (config >> 16) & 0xff;
if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
return -EINVAL;
cache_result = array_index_nospec(cache_result, PERF_COUNT_HW_CACHE_RESULT_MAX);
val = hw_cache_event_ids[cache_type][cache_op][cache_result];
if (val == 0)
return -ENOENT;
if (val == -1)
return -EINVAL;
hwc->config |= val;
attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
return x86_pmu_extra_regs(val, event);
}
int x86_reserve_hardware(void)
{
int err = 0;
if (!atomic_inc_not_zero(&pmc_refcount)) {
mutex_lock(&pmc_reserve_mutex);
if (atomic_read(&pmc_refcount) == 0) {
if (!reserve_pmc_hardware())
err = -EBUSY;
else
reserve_ds_buffers();
}
if (!err)
atomic_inc(&pmc_refcount);
mutex_unlock(&pmc_reserve_mutex);
}
return err;
}
void x86_release_hardware(void)
{
if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
release_pmc_hardware();
release_ds_buffers();
mutex_unlock(&pmc_reserve_mutex);
}
}
/*
* Check if we can create event of a certain type (that no conflicting events
* are present).
*/
int x86_add_exclusive(unsigned int what)
{
int i;
/*
* When lbr_pt_coexist we allow PT to coexist with either LBR or BTS.
* LBR and BTS are still mutually exclusive.
*/
if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
return 0;
if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) {
mutex_lock(&pmc_reserve_mutex);
for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
goto fail_unlock;
}
atomic_inc(&x86_pmu.lbr_exclusive[what]);
mutex_unlock(&pmc_reserve_mutex);
}
atomic_inc(&active_events);
return 0;
fail_unlock:
mutex_unlock(&pmc_reserve_mutex);
return -EBUSY;
}
void x86_del_exclusive(unsigned int what)
{
if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
return;
atomic_dec(&x86_pmu.lbr_exclusive[what]);
atomic_dec(&active_events);
}
int x86_setup_perfctr(struct perf_event *event)
{
struct perf_event_attr *attr = &event->attr;
struct hw_perf_event *hwc = &event->hw;
u64 config;
if (!is_sampling_event(event)) {
hwc->sample_period = x86_pmu.max_period;
hwc->last_period = hwc->sample_period;
local64_set(&hwc->period_left, hwc->sample_period);
}
if (attr->type == PERF_TYPE_RAW)
return x86_pmu_extra_regs(event->attr.config, event);
if (attr->type == PERF_TYPE_HW_CACHE)
return set_ext_hw_attr(hwc, event);
if (attr->config >= x86_pmu.max_events)
return -EINVAL;
attr->config = array_index_nospec((unsigned long)attr->config, x86_pmu.max_events);
/*
* The generic map:
*/
config = x86_pmu.event_map(attr->config);
if (config == 0)
return -ENOENT;
if (config == -1LL)
return -EINVAL;
hwc->config |= config;
return 0;
}
/*
* check that branch_sample_type is compatible with
* settings needed for precise_ip > 1 which implies
* using the LBR to capture ALL taken branches at the
* priv levels of the measurement
*/
static inline int precise_br_compat(struct perf_event *event)
{
u64 m = event->attr.branch_sample_type;
u64 b = 0;
/* must capture all branches */
if (!(m & PERF_SAMPLE_BRANCH_ANY))
return 0;
m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;
if (!event->attr.exclude_user)
b |= PERF_SAMPLE_BRANCH_USER;
if (!event->attr.exclude_kernel)
b |= PERF_SAMPLE_BRANCH_KERNEL;
/*
* ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
*/
return m == b;
}
int x86_pmu_max_precise(void)
{
int precise = 0;
/* Support for constant skid */
if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
precise++;
/* Support for IP fixup */
if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
precise++;
if (x86_pmu.pebs_prec_dist)
precise++;
}
return precise;
}
int x86_pmu_hw_config(struct perf_event *event)
{
if (event->attr.precise_ip) {
int precise = x86_pmu_max_precise();
if (event->attr.precise_ip > precise)
return -EOPNOTSUPP;
/* There's no sense in having PEBS for non sampling events: */
if (!is_sampling_event(event))
return -EINVAL;
}
/*
* check that PEBS LBR correction does not conflict with
* whatever the user is asking with attr->branch_sample_type
*/
if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
u64 *br_type = &event->attr.branch_sample_type;
if (has_branch_stack(event)) {
if (!precise_br_compat(event))
return -EOPNOTSUPP;
/* branch_sample_type is compatible */
} else {
/*
* user did not specify branch_sample_type
*
* For PEBS fixups, we capture all
* the branches at the priv level of the
* event.
*/
*br_type = PERF_SAMPLE_BRANCH_ANY;
if (!event->attr.exclude_user)
*br_type |= PERF_SAMPLE_BRANCH_USER;
if (!event->attr.exclude_kernel)
*br_type |= PERF_SAMPLE_BRANCH_KERNEL;
}
}
if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
event->attach_state |= PERF_ATTACH_TASK_DATA;
/*
* Generate PMC IRQs:
* (keep 'enabled' bit clear for now)
*/
event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
/*
* Count user and OS events unless requested not to
*/
if (!event->attr.exclude_user)
event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
if (!event->attr.exclude_kernel)
event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
if (event->attr.type == PERF_TYPE_RAW)
event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
if (event->attr.sample_period && x86_pmu.limit_period) {
if (x86_pmu.limit_period(event, event->attr.sample_period) >
event->attr.sample_period)
return -EINVAL;
}
return x86_setup_perfctr(event);
}
/*
* Setup the hardware configuration for a given attr_type
*/
static int __x86_pmu_event_init(struct perf_event *event)
{
int err;
if (!x86_pmu_initialized())
return -ENODEV;
err = x86_reserve_hardware();
if (err)
return err;
atomic_inc(&active_events);
event->destroy = hw_perf_event_destroy;
event->hw.idx = -1;
event->hw.last_cpu = -1;
event->hw.last_tag = ~0ULL;
/* mark unused */
event->hw.extra_reg.idx = EXTRA_REG_NONE;
event->hw.branch_reg.idx = EXTRA_REG_NONE;
return x86_pmu.hw_config(event);
}
void x86_pmu_disable_all(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int idx;
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
u64 val;
if (!test_bit(idx, cpuc->active_mask))
continue;
rdmsrl(x86_pmu_config_addr(idx), val);
if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
continue;
val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
wrmsrl(x86_pmu_config_addr(idx), val);
}
}
/*
* There may be PMI landing after enabled=0. The PMI hitting could be before or
* after disable_all.
*
* If PMI hits before disable_all, the PMU will be disabled in the NMI handler.
* It will not be re-enabled in the NMI handler again, because enabled=0. After
* handling the NMI, disable_all will be called, which will not change the
* state either. If PMI hits after disable_all, the PMU is already disabled
* before entering NMI handler. The NMI handler will not change the state
* either.
*
* So either situation is harmless.
*/
static void x86_pmu_disable(struct pmu *pmu)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (!x86_pmu_initialized())
return;
if (!cpuc->enabled)
return;
cpuc->n_added = 0;
cpuc->enabled = 0;
barrier();
x86_pmu.disable_all();
}
void x86_pmu_enable_all(int added)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int idx;
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
if (!test_bit(idx, cpuc->active_mask))
continue;
__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
}
}
static struct pmu pmu;
static inline int is_x86_event(struct perf_event *event)
{
return event->pmu == &pmu;
}
/*
* Event scheduler state:
*
* Assign events iterating over all events and counters, beginning
* with events with least weights first. Keep the current iterator
* state in struct sched_state.
*/
struct sched_state {
int weight;
int event; /* event index */
int counter; /* counter index */
int unassigned; /* number of events to be assigned left */
int nr_gp; /* number of GP counters used */
unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
};
/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
#define SCHED_STATES_MAX 2
struct perf_sched {
int max_weight;
int max_events;
int max_gp;
int saved_states;
struct event_constraint **constraints;
struct sched_state state;
struct sched_state saved[SCHED_STATES_MAX];
};
/*
* Initialize interator that runs through all events and counters.
*/
static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
int num, int wmin, int wmax, int gpmax)
{
int idx;
memset(sched, 0, sizeof(*sched));
sched->max_events = num;
sched->max_weight = wmax;
sched->max_gp = gpmax;
sched->constraints = constraints;
for (idx = 0; idx < num; idx++) {
if (constraints[idx]->weight == wmin)
break;
}
sched->state.event = idx; /* start with min weight */
sched->state.weight = wmin;
sched->state.unassigned = num;
}
static void perf_sched_save_state(struct perf_sched *sched)
{
if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
return;
sched->saved[sched->saved_states] = sched->state;
sched->saved_states++;
}
static bool perf_sched_restore_state(struct perf_sched *sched)
{
if (!sched->saved_states)
return false;
sched->saved_states--;
sched->state = sched->saved[sched->saved_states];
/* continue with next counter: */
clear_bit(sched->state.counter++, sched->state.used);
return true;
}
/*
* Select a counter for the current event to schedule. Return true on
* success.
*/
static bool __perf_sched_find_counter(struct perf_sched *sched)
{
struct event_constraint *c;
int idx;
if (!sched->state.unassigned)
return false;
if (sched->state.event >= sched->max_events)
return false;
c = sched->constraints[sched->state.event];
/* Prefer fixed purpose counters */
if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
idx = INTEL_PMC_IDX_FIXED;
for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
if (!__test_and_set_bit(idx, sched->state.used))
goto done;
}
}
/* Grab the first unused counter starting with idx */
idx = sched->state.counter;
for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
if (!__test_and_set_bit(idx, sched->state.used)) {
if (sched->state.nr_gp++ >= sched->max_gp)
return false;
goto done;
}
}
return false;
done:
sched->state.counter = idx;
if (c->overlap)
perf_sched_save_state(sched);
return true;
}
static bool perf_sched_find_counter(struct perf_sched *sched)
{
while (!__perf_sched_find_counter(sched)) {
if (!perf_sched_restore_state(sched))
return false;
}
return true;
}
/*
* Go through all unassigned events and find the next one to schedule.
* Take events with the least weight first. Return true on success.
*/
static bool perf_sched_next_event(struct perf_sched *sched)
{
struct event_constraint *c;
if (!sched->state.unassigned || !--sched->state.unassigned)
return false;
do {
/* next event */
sched->state.event++;
if (sched->state.event >= sched->max_events) {
/* next weight */
sched->state.event = 0;
sched->state.weight++;
if (sched->state.weight > sched->max_weight)
return false;
}
c = sched->constraints[sched->state.event];
} while (c->weight != sched->state.weight);
sched->state.counter = 0; /* start with first counter */
return true;
}
/*
* Assign a counter for each event.
*/
int perf_assign_events(struct event_constraint **constraints, int n,
int wmin, int wmax, int gpmax, int *assign)
{
struct perf_sched sched;
perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
do {
if (!perf_sched_find_counter(&sched))
break; /* failed */
if (assign)
assign[sched.state.event] = sched.state.counter;
} while (perf_sched_next_event(&sched));
return sched.state.unassigned;
}
EXPORT_SYMBOL_GPL(perf_assign_events);
int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
struct event_constraint *c;
unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
struct perf_event *e;
int i, wmin, wmax, unsched = 0;
struct hw_perf_event *hwc;
bitmap_zero(used_mask, X86_PMC_IDX_MAX);
if (x86_pmu.start_scheduling)
x86_pmu.start_scheduling(cpuc);
for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
cpuc->event_constraint[i] = NULL;
c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
cpuc->event_constraint[i] = c;
wmin = min(wmin, c->weight);
wmax = max(wmax, c->weight);
}
/*
* fastpath, try to reuse previous register
*/
for (i = 0; i < n; i++) {
hwc = &cpuc->event_list[i]->hw;
c = cpuc->event_constraint[i];
/* never assigned */
if (hwc->idx == -1)
break;
/* constraint still honored */
if (!test_bit(hwc->idx, c->idxmsk))
break;
/* not already used */
if (test_bit(hwc->idx, used_mask))
break;
__set_bit(hwc->idx, used_mask);
if (assign)
assign[i] = hwc->idx;
}
/* slow path */
if (i != n) {
int gpmax = x86_pmu.num_counters;
/*
* Do not allow scheduling of more than half the available
* generic counters.
*
* This helps avoid counter starvation of sibling thread by
* ensuring at most half the counters cannot be in exclusive
* mode. There is no designated counters for the limits. Any
* N/2 counters can be used. This helps with events with
* specific counter constraints.
*/
if (is_ht_workaround_enabled() && !cpuc->is_fake &&
READ_ONCE(cpuc->excl_cntrs->exclusive_present))
gpmax /= 2;
unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
wmax, gpmax, assign);
}
/*
* In case of success (unsched = 0), mark events as committed,
* so we do not put_constraint() in case new events are added
* and fail to be scheduled
*
* We invoke the lower level commit callback to lock the resource
*
* We do not need to do all of this in case we are called to
* validate an event group (assign == NULL)
*/
if (!unsched && assign) {
for (i = 0; i < n; i++) {
e = cpuc->event_list[i];
e->hw.flags |= PERF_X86_EVENT_COMMITTED;
if (x86_pmu.commit_scheduling)
x86_pmu.commit_scheduling(cpuc, i, assign[i]);
}
} else {
for (i = 0; i < n; i++) {
e = cpuc->event_list[i];
/*
* do not put_constraint() on comitted events,
* because they are good to go
*/
if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
continue;
/*
* release events that failed scheduling
*/
if (x86_pmu.put_event_constraints)
x86_pmu.put_event_constraints(cpuc, e);
}
}
if (x86_pmu.stop_scheduling)
x86_pmu.stop_scheduling(cpuc);
return unsched ? -EINVAL : 0;
}
/*
* dogrp: true if must collect siblings events (group)
* returns total number of events and error code
*/
static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
{
struct perf_event *event;
int n, max_count;
max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
/* current number of events already accepted */
n = cpuc->n_events;
if (is_x86_event(leader)) {
if (n >= max_count)
return -EINVAL;
cpuc->event_list[n] = leader;
n++;
}
if (!dogrp)
return n;
for_each_sibling_event(event, leader) {
if (!is_x86_event(event) ||
event->state <= PERF_EVENT_STATE_OFF)
continue;
if (n >= max_count)
return -EINVAL;
cpuc->event_list[n] = event;
n++;
}
return n;
}
static inline void x86_assign_hw_event(struct perf_event *event,
struct cpu_hw_events *cpuc, int i)
{
struct hw_perf_event *hwc = &event->hw;
hwc->idx = cpuc->assign[i];
hwc->last_cpu = smp_processor_id();
hwc->last_tag = ++cpuc->tags[i];
if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) {
hwc->config_base = 0;
hwc->event_base = 0;
} else if (hwc->idx >= INTEL_PMC_IDX_FIXED) {
hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED);
hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30;
} else {
hwc->config_base = x86_pmu_config_addr(hwc->idx);
hwc->event_base = x86_pmu_event_addr(hwc->idx);
hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
}
}
/**
* x86_perf_rdpmc_index - Return PMC counter used for event
* @event: the perf_event to which the PMC counter was assigned
*
* The counter assigned to this performance event may change if interrupts
* are enabled. This counter should thus never be used while interrupts are
* enabled. Before this function is used to obtain the assigned counter the
* event should be checked for validity using, for example,
* perf_event_read_local(), within the same interrupt disabled section in
* which this counter is planned to be used.
*
* Return: The index of the performance monitoring counter assigned to
* @perf_event.
*/
int x86_perf_rdpmc_index(struct perf_event *event)
{
lockdep_assert_irqs_disabled();
return event->hw.event_base_rdpmc;
}
static inline int match_prev_assignment(struct hw_perf_event *hwc,
struct cpu_hw_events *cpuc,
int i)
{
return hwc->idx == cpuc->assign[i] &&
hwc->last_cpu == smp_processor_id() &&
hwc->last_tag == cpuc->tags[i];
}
static void x86_pmu_start(struct perf_event *event, int flags);
static void x86_pmu_enable(struct pmu *pmu)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct perf_event *event;
struct hw_perf_event *hwc;
int i, added = cpuc->n_added;
if (!x86_pmu_initialized())
return;
if (cpuc->enabled)
return;
if (cpuc->n_added) {
int n_running = cpuc->n_events - cpuc->n_added;
/*
* apply assignment obtained either from
* hw_perf_group_sched_in() or x86_pmu_enable()
*
* step1: save events moving to new counters
*/
for (i = 0; i < n_running; i++) {
event = cpuc->event_list[i];
hwc = &event->hw;
/*
* we can avoid reprogramming counter if:
* - assigned same counter as last time
* - running on same CPU as last time
* - no other event has used the counter since
*/
if (hwc->idx == -1 ||
match_prev_assignment(hwc, cpuc, i))
continue;
/*
* Ensure we don't accidentally enable a stopped
* counter simply because we rescheduled.
*/
if (hwc->state & PERF_HES_STOPPED)
hwc->state |= PERF_HES_ARCH;
x86_pmu_stop(event, PERF_EF_UPDATE);
}
/*
* step2: reprogram moved events into new counters
*/
for (i = 0; i < cpuc->n_events; i++) {
event = cpuc->event_list[i];
hwc = &event->hw;
if (!match_prev_assignment(hwc, cpuc, i))
x86_assign_hw_event(event, cpuc, i);
else if (i < n_running)
continue;
if (hwc->state & PERF_HES_ARCH)
continue;
x86_pmu_start(event, PERF_EF_RELOAD);
}
cpuc->n_added = 0;
perf_events_lapic_init();
}
cpuc->enabled = 1;
barrier();
x86_pmu.enable_all(added);
}
static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
/*
* Set the next IRQ period, based on the hwc->period_left value.
* To be called with the event disabled in hw:
*/
int x86_perf_event_set_period(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
s64 left = local64_read(&hwc->period_left);
s64 period = hwc->sample_period;
int ret = 0, idx = hwc->idx;
if (idx == INTEL_PMC_IDX_FIXED_BTS)
return 0;
/*
* If we are way outside a reasonable range then just skip forward:
*/
if (unlikely(left <= -period)) {
left = period;
local64_set(&hwc->period_left, left);
hwc->last_period = period;
ret = 1;
}
if (unlikely(left <= 0)) {
left += period;
local64_set(&hwc->period_left, left);
hwc->last_period = period;
ret = 1;
}
/*
* Quirk: certain CPUs dont like it if just 1 hw_event is left:
*/
if (unlikely(left < 2))
left = 2;
if (left > x86_pmu.max_period)
left = x86_pmu.max_period;
if (x86_pmu.limit_period)
left = x86_pmu.limit_period(event, left);
per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
/*
* The hw event starts counting from this event offset,
* mark it to be able to extra future deltas:
*/
local64_set(&hwc->prev_count, (u64)-left);
wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
/*
* Due to erratum on certan cpu we need
* a second write to be sure the register
* is updated properly
*/
if (x86_pmu.perfctr_second_write) {
wrmsrl(hwc->event_base,
(u64)(-left) & x86_pmu.cntval_mask);
}
perf_event_update_userpage(event);
return ret;
}
void x86_pmu_enable_event(struct perf_event *event)
{
if (__this_cpu_read(cpu_hw_events.enabled))
__x86_pmu_enable_event(&event->hw,
ARCH_PERFMON_EVENTSEL_ENABLE);
}
/*
* Add a single event to the PMU.
*
* The event is added to the group of enabled events
* but only if it can be scehduled with existing events.
*/
static int x86_pmu_add(struct perf_event *event, int flags)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc;
int assign[X86_PMC_IDX_MAX];
int n, n0, ret;
hwc = &event->hw;
n0 = cpuc->n_events;
ret = n = collect_events(cpuc, event, false);
if (ret < 0)
goto out;
hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
if (!(flags & PERF_EF_START))
hwc->state |= PERF_HES_ARCH;
/*
* If group events scheduling transaction was started,
* skip the schedulability test here, it will be performed
* at commit time (->commit_txn) as a whole.
*
* If commit fails, we'll call ->del() on all events
* for which ->add() was called.
*/
if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
goto done_collect;
ret = x86_pmu.schedule_events(cpuc, n, assign);
if (ret)
goto out;
/*
* copy new assignment, now we know it is possible
* will be used by hw_perf_enable()
*/
memcpy(cpuc->assign, assign, n*sizeof(int));
done_collect:
/*
* Commit the collect_events() state. See x86_pmu_del() and
* x86_pmu_*_txn().
*/
cpuc->n_events = n;
cpuc->n_added += n - n0;
cpuc->n_txn += n - n0;
if (x86_pmu.add) {
/*
* This is before x86_pmu_enable() will call x86_pmu_start(),
* so we enable LBRs before an event needs them etc..
*/
x86_pmu.add(event);
}
ret = 0;
out:
return ret;
}
static void x86_pmu_start(struct perf_event *event, int flags)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int idx = event->hw.idx;
if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
return;
if (WARN_ON_ONCE(idx == -1))
return;
if (flags & PERF_EF_RELOAD) {
WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
x86_perf_event_set_period(event);
}
event->hw.state = 0;
cpuc->events[idx] = event;
__set_bit(idx, cpuc->active_mask);
__set_bit(idx, cpuc->running);
x86_pmu.enable(event);
perf_event_update_userpage(event);
}
void perf_event_print_debug(void)
{
u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
u64 pebs, debugctl;
struct cpu_hw_events *cpuc;
unsigned long flags;
int cpu, idx;
if (!x86_pmu.num_counters)
return;
local_irq_save(flags);
cpu = smp_processor_id();
cpuc = &per_cpu(cpu_hw_events, cpu);
if (x86_pmu.version >= 2) {
rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
pr_info("\n");
pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
pr_info("CPU#%d: status: %016llx\n", cpu, status);
pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
if (x86_pmu.pebs_constraints) {
rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
}
if (x86_pmu.lbr_nr) {
rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl);
}
}
pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
rdmsrl(x86_pmu_event_addr(idx), pmc_count);
prev_left = per_cpu(pmc_prev_left[idx], cpu);
pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
cpu, idx, pmc_ctrl);
pr_info("CPU#%d: gen-PMC%d count: %016llx\n",
cpu, idx, pmc_count);
pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
cpu, idx, prev_left);
}
for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
cpu, idx, pmc_count);
}
local_irq_restore(flags);
}
void x86_pmu_stop(struct perf_event *event, int flags)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
x86_pmu.disable(event);
cpuc->events[hwc->idx] = NULL;
WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
hwc->state |= PERF_HES_STOPPED;
}
if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
/*
* Drain the remaining delta count out of a event
* that we are disabling:
*/
x86_perf_event_update(event);
hwc->state |= PERF_HES_UPTODATE;
}
}
static void x86_pmu_del(struct perf_event *event, int flags)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int i;
/*
* event is descheduled
*/
event->hw.flags &= ~PERF_X86_EVENT_COMMITTED;
/*
* If we're called during a txn, we only need to undo x86_pmu.add.
* The events never got scheduled and ->cancel_txn will truncate
* the event_list.
*
* XXX assumes any ->del() called during a TXN will only be on
* an event added during that same TXN.
*/
if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
goto do_del;
/*
* Not a TXN, therefore cleanup properly.
*/
x86_pmu_stop(event, PERF_EF_UPDATE);
for (i = 0; i < cpuc->n_events; i++) {
if (event == cpuc->event_list[i])
break;
}
if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */
return;
/* If we have a newly added event; make sure to decrease n_added. */
if (i >= cpuc->n_events - cpuc->n_added)
--cpuc->n_added;
if (x86_pmu.put_event_constraints)
x86_pmu.put_event_constraints(cpuc, event);
/* Delete the array entry. */
while (++i < cpuc->n_events) {
cpuc->event_list[i-1] = cpuc->event_list[i];
cpuc->event_constraint[i-1] = cpuc->event_constraint[i];
}
--cpuc->n_events;
perf_event_update_userpage(event);
do_del:
if (x86_pmu.del) {
/*
* This is after x86_pmu_stop(); so we disable LBRs after any
* event can need them etc..
*/
x86_pmu.del(event);
}
}
int x86_pmu_handle_irq(struct pt_regs *regs)
{
struct perf_sample_data data;
struct cpu_hw_events *cpuc;
struct perf_event *event;
int idx, handled = 0;
u64 val;
cpuc = this_cpu_ptr(&cpu_hw_events);
/*
* Some chipsets need to unmask the LVTPC in a particular spot
* inside the nmi handler. As a result, the unmasking was pushed
* into all the nmi handlers.
*
* This generic handler doesn't seem to have any issues where the
* unmasking occurs so it was left at the top.
*/
apic_write(APIC_LVTPC, APIC_DM_NMI);
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
if (!test_bit(idx, cpuc->active_mask)) {
/*
* Though we deactivated the counter some cpus
* might still deliver spurious interrupts still
* in flight. Catch them:
*/
if (__test_and_clear_bit(idx, cpuc->running))
handled++;
continue;
}
event = cpuc->events[idx];
val = x86_perf_event_update(event);
if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
continue;
/*
* event overflow
*/
handled++;
perf_sample_data_init(&data, 0, event->hw.last_period);
if (!x86_perf_event_set_period(event))
continue;
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
}
if (handled)
inc_irq_stat(apic_perf_irqs);
return handled;
}
void perf_events_lapic_init(void)
{
if (!x86_pmu.apic || !x86_pmu_initialized())
return;
/*
* Always use NMI for PMU
*/
apic_write(APIC_LVTPC, APIC_DM_NMI);
}
static int
perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
u64 start_clock;
u64 finish_clock;
int ret;
/*
* All PMUs/events that share this PMI handler should make sure to
* increment active_events for their events.
*/
if (!atomic_read(&active_events))
return NMI_DONE;
start_clock = sched_clock();
ret = x86_pmu.handle_irq(regs);
finish_clock = sched_clock();
perf_sample_event_took(finish_clock - start_clock);
return ret;
}
NOKPROBE_SYMBOL(perf_event_nmi_handler);
struct event_constraint emptyconstraint;
struct event_constraint unconstrained;
static int x86_pmu_prepare_cpu(unsigned int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
int i;
for (i = 0 ; i < X86_PERF_KFREE_MAX; i++)
cpuc->kfree_on_online[i] = NULL;
if (x86_pmu.cpu_prepare)
return x86_pmu.cpu_prepare(cpu);
return 0;
}
static int x86_pmu_dead_cpu(unsigned int cpu)
{
if (x86_pmu.cpu_dead)
x86_pmu.cpu_dead(cpu);
return 0;
}
static int x86_pmu_online_cpu(unsigned int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
int i;
for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) {
kfree(cpuc->kfree_on_online[i]);
cpuc->kfree_on_online[i] = NULL;
}
return 0;
}
static int x86_pmu_starting_cpu(unsigned int cpu)
{
if (x86_pmu.cpu_starting)
x86_pmu.cpu_starting(cpu);
return 0;
}
static int x86_pmu_dying_cpu(unsigned int cpu)
{
if (x86_pmu.cpu_dying)
x86_pmu.cpu_dying(cpu);
return 0;
}
static void __init pmu_check_apic(void)
{
if (boot_cpu_has(X86_FEATURE_APIC))
return;
x86_pmu.apic = 0;
pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
pr_info("no hardware sampling interrupt available.\n");
/*
* If we have a PMU initialized but no APIC
* interrupts, we cannot sample hardware
* events (user-space has to fall back and
* sample via a hrtimer based software event):
*/
pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
}
static struct attribute_group x86_pmu_format_group __ro_after_init = {
.name = "format",
.attrs = NULL,
};
/*
* Remove all undefined events (x86_pmu.event_map(id) == 0)
* out of events_attr attributes.
*/
static void __init filter_events(struct attribute **attrs)
{
struct device_attribute *d;
struct perf_pmu_events_attr *pmu_attr;
int offset = 0;
int i, j;
for (i = 0; attrs[i]; i++) {
d = (struct device_attribute *)attrs[i];
pmu_attr = container_of(d, struct perf_pmu_events_attr, attr);
/* str trumps id */
if (pmu_attr->event_str)
continue;
if (x86_pmu.event_map(i + offset))
continue;
for (j = i; attrs[j]; j++)
attrs[j] = attrs[j + 1];
/* Check the shifted attr. */
i--;
/*
* event_map() is index based, the attrs array is organized
* by increasing event index. If we shift the events, then
* we need to compensate for the event_map(), otherwise
* we are looking up the wrong event in the map
*/
offset++;
}
}
/* Merge two pointer arrays */
__init struct attribute **merge_attr(struct attribute **a, struct attribute **b)
{
struct attribute **new;
int j, i;
for (j = 0; a && a[j]; j++)
;
for (i = 0; b && b[i]; i++)
j++;
j++;
new = kmalloc_array(j, sizeof(struct attribute *), GFP_KERNEL);
if (!new)
return NULL;
j = 0;
for (i = 0; a && a[i]; i++)
new[j++] = a[i];
for (i = 0; b && b[i]; i++)
new[j++] = b[i];
new[j] = NULL;
return new;
}
ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page)
{
struct perf_pmu_events_attr *pmu_attr = \
container_of(attr, struct perf_pmu_events_attr, attr);
u64 config = x86_pmu.event_map(pmu_attr->id);
/* string trumps id */
if (pmu_attr->event_str)
return sprintf(page, "%s", pmu_attr->event_str);
return x86_pmu.events_sysfs_show(page, config);
}
EXPORT_SYMBOL_GPL(events_sysfs_show);
ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr,
char *page)
{
struct perf_pmu_events_ht_attr *pmu_attr =
container_of(attr, struct perf_pmu_events_ht_attr, attr);
/*
* Report conditional events depending on Hyper-Threading.
*
* This is overly conservative as usually the HT special
* handling is not needed if the other CPU thread is idle.
*
* Note this does not (and cannot) handle the case when thread
* siblings are invisible, for example with virtualization
* if they are owned by some other guest. The user tool
* has to re-read when a thread sibling gets onlined later.
*/
return sprintf(page, "%s",
topology_max_smt_threads() > 1 ?
pmu_attr->event_str_ht :
pmu_attr->event_str_noht);
}
EVENT_ATTR(cpu-cycles, CPU_CYCLES );
EVENT_ATTR(instructions, INSTRUCTIONS );
EVENT_ATTR(cache-references, CACHE_REFERENCES );
EVENT_ATTR(cache-misses, CACHE_MISSES );
EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS );
EVENT_ATTR(branch-misses, BRANCH_MISSES );
EVENT_ATTR(bus-cycles, BUS_CYCLES );
EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND );
EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND );
EVENT_ATTR(ref-cycles, REF_CPU_CYCLES );
static struct attribute *empty_attrs;
static struct attribute *events_attr[] = {
EVENT_PTR(CPU_CYCLES),
EVENT_PTR(INSTRUCTIONS),
EVENT_PTR(CACHE_REFERENCES),
EVENT_PTR(CACHE_MISSES),
EVENT_PTR(BRANCH_INSTRUCTIONS),
EVENT_PTR(BRANCH_MISSES),
EVENT_PTR(BUS_CYCLES),
EVENT_PTR(STALLED_CYCLES_FRONTEND),
EVENT_PTR(STALLED_CYCLES_BACKEND),
EVENT_PTR(REF_CPU_CYCLES),
NULL,
};
static struct attribute_group x86_pmu_events_group __ro_after_init = {
.name = "events",
.attrs = events_attr,
};
ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
{
u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE);
bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL);
bool any = (config & ARCH_PERFMON_EVENTSEL_ANY);
bool inv = (config & ARCH_PERFMON_EVENTSEL_INV);
ssize_t ret;
/*
* We have whole page size to spend and just little data
* to write, so we can safely use sprintf.
*/
ret = sprintf(page, "event=0x%02llx", event);
if (umask)
ret += sprintf(page + ret, ",umask=0x%02llx", umask);
if (edge)
ret += sprintf(page + ret, ",edge");
if (pc)
ret += sprintf(page + ret, ",pc");
if (any)
ret += sprintf(page + ret, ",any");
if (inv)
ret += sprintf(page + ret, ",inv");
if (cmask)
ret += sprintf(page + ret, ",cmask=0x%02llx", cmask);
ret += sprintf(page + ret, "\n");
return ret;
}
static struct attribute_group x86_pmu_attr_group;
static struct attribute_group x86_pmu_caps_group;
static int __init init_hw_perf_events(void)
{
struct x86_pmu_quirk *quirk;
int err;
pr_info("Performance Events: ");
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_INTEL:
err = intel_pmu_init();
break;
case X86_VENDOR_AMD:
err = amd_pmu_init();
break;
case X86_VENDOR_HYGON:
err = amd_pmu_init();
x86_pmu.name = "HYGON";
break;
default:
err = -ENOTSUPP;
}
if (err != 0) {
pr_cont("no PMU driver, software events only.\n");
return 0;
}
pmu_check_apic();
/* sanity check that the hardware exists or is emulated */
if (!check_hw_exists())
return 0;
pr_cont("%s PMU driver.\n", x86_pmu.name);
x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
quirk->func();
if (!x86_pmu.intel_ctrl)
x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
perf_events_lapic_init();
register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");
unconstrained = (struct event_constraint)
__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
0, x86_pmu.num_counters, 0, 0);
x86_pmu_format_group.attrs = x86_pmu.format_attrs;
if (x86_pmu.caps_attrs) {
struct attribute **tmp;
tmp = merge_attr(x86_pmu_caps_group.attrs, x86_pmu.caps_attrs);
if (!WARN_ON(!tmp))
x86_pmu_caps_group.attrs = tmp;
}
if (x86_pmu.event_attrs)
x86_pmu_events_group.attrs = x86_pmu.event_attrs;
if (!x86_pmu.events_sysfs_show)
x86_pmu_events_group.attrs = &empty_attrs;
else
filter_events(x86_pmu_events_group.attrs);
if (x86_pmu.cpu_events) {
struct attribute **tmp;
tmp = merge_attr(x86_pmu_events_group.attrs, x86_pmu.cpu_events);
if (!WARN_ON(!tmp))
x86_pmu_events_group.attrs = tmp;
}
if (x86_pmu.attrs) {
struct attribute **tmp;
tmp = merge_attr(x86_pmu_attr_group.attrs, x86_pmu.attrs);
if (!WARN_ON(!tmp))
x86_pmu_attr_group.attrs = tmp;
}
pr_info("... version: %d\n", x86_pmu.version);
pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
pr_info("... generic registers: %d\n", x86_pmu.num_counters);
pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
pr_info("... max period: %016Lx\n", x86_pmu.max_period);
pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
/*
* Install callbacks. Core will call them for each online
* cpu.
*/
err = cpuhp_setup_state(CPUHP_PERF_X86_PREPARE, "perf/x86:prepare",
x86_pmu_prepare_cpu, x86_pmu_dead_cpu);
if (err)
return err;
err = cpuhp_setup_state(CPUHP_AP_PERF_X86_STARTING,
"perf/x86:starting", x86_pmu_starting_cpu,
x86_pmu_dying_cpu);
if (err)
goto out;
err = cpuhp_setup_state(CPUHP_AP_PERF_X86_ONLINE, "perf/x86:online",
x86_pmu_online_cpu, NULL);
if (err)
goto out1;
err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
if (err)
goto out2;
return 0;
out2:
cpuhp_remove_state(CPUHP_AP_PERF_X86_ONLINE);
out1:
cpuhp_remove_state(CPUHP_AP_PERF_X86_STARTING);
out:
cpuhp_remove_state(CPUHP_PERF_X86_PREPARE);
return err;
}
early_initcall(init_hw_perf_events);
static inline void x86_pmu_read(struct perf_event *event)
{
if (x86_pmu.read)
return x86_pmu.read(event);
x86_perf_event_update(event);
}
/*
* Start group events scheduling transaction
* Set the flag to make pmu::enable() not perform the
* schedulability test, it will be performed at commit time
*
* We only support PERF_PMU_TXN_ADD transactions. Save the
* transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD
* transactions.
*/
static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
WARN_ON_ONCE(cpuc->txn_flags); /* txn already in flight */
cpuc->txn_flags = txn_flags;
if (txn_flags & ~PERF_PMU_TXN_ADD)
return;
perf_pmu_disable(pmu);
__this_cpu_write(cpu_hw_events.n_txn, 0);
}
/*
* Stop group events scheduling transaction
* Clear the flag and pmu::enable() will perform the
* schedulability test.
*/
static void x86_pmu_cancel_txn(struct pmu *pmu)
{
unsigned int txn_flags;
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
txn_flags = cpuc->txn_flags;
cpuc->txn_flags = 0;
if (txn_flags & ~PERF_PMU_TXN_ADD)
return;
/*
* Truncate collected array by the number of events added in this
* transaction. See x86_pmu_add() and x86_pmu_*_txn().
*/
__this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
__this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
perf_pmu_enable(pmu);
}
/*
* Commit group events scheduling transaction
* Perform the group schedulability test as a whole
* Return 0 if success
*
* Does not cancel the transaction on failure; expects the caller to do this.
*/
static int x86_pmu_commit_txn(struct pmu *pmu)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int assign[X86_PMC_IDX_MAX];
int n, ret;
WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) {
cpuc->txn_flags = 0;
return 0;
}
n = cpuc->n_events;
if (!x86_pmu_initialized())
return -EAGAIN;
ret = x86_pmu.schedule_events(cpuc, n, assign);
if (ret)
return ret;
/*
* copy new assignment, now we know it is possible
* will be used by hw_perf_enable()
*/
memcpy(cpuc->assign, assign, n*sizeof(int));
cpuc->txn_flags = 0;
perf_pmu_enable(pmu);
return 0;
}
/*
* a fake_cpuc is used to validate event groups. Due to
* the extra reg logic, we need to also allocate a fake
* per_core and per_cpu structure. Otherwise, group events
* using extra reg may conflict without the kernel being
* able to catch this when the last event gets added to
* the group.
*/
static void free_fake_cpuc(struct cpu_hw_events *cpuc)
{
kfree(cpuc->shared_regs);
kfree(cpuc);
}
static struct cpu_hw_events *allocate_fake_cpuc(void)
{
struct cpu_hw_events *cpuc;
int cpu = raw_smp_processor_id();
cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
if (!cpuc)
return ERR_PTR(-ENOMEM);
/* only needed, if we have extra_regs */
if (x86_pmu.extra_regs) {
cpuc->shared_regs = allocate_shared_regs(cpu);
if (!cpuc->shared_regs)
goto error;
}
cpuc->is_fake = 1;
return cpuc;
error:
free_fake_cpuc(cpuc);
return ERR_PTR(-ENOMEM);
}
/*
* validate that we can schedule this event
*/
static int validate_event(struct perf_event *event)
{
struct cpu_hw_events *fake_cpuc;
struct event_constraint *c;
int ret = 0;
fake_cpuc = allocate_fake_cpuc();
if (IS_ERR(fake_cpuc))
return PTR_ERR(fake_cpuc);
c = x86_pmu.get_event_constraints(fake_cpuc, -1, event);
if (!c || !c->weight)
ret = -EINVAL;
if (x86_pmu.put_event_constraints)
x86_pmu.put_event_constraints(fake_cpuc, event);
free_fake_cpuc(fake_cpuc);
return ret;
}
/*
* validate a single event group
*
* validation include:
* - check events are compatible which each other
* - events do not compete for the same counter
* - number of events <= number of counters
*
* validation ensures the group can be loaded onto the
* PMU if it was the only group available.
*/
static int validate_group(struct perf_event *event)
{
struct perf_event *leader = event->group_leader;
struct cpu_hw_events *fake_cpuc;
int ret = -EINVAL, n;
fake_cpuc = allocate_fake_cpuc();
if (IS_ERR(fake_cpuc))
return PTR_ERR(fake_cpuc);
/*
* the event is not yet connected with its
* siblings therefore we must first collect
* existing siblings, then add the new event
* before we can simulate the scheduling
*/
n = collect_events(fake_cpuc, leader, true);
if (n < 0)
goto out;
fake_cpuc->n_events = n;
n = collect_events(fake_cpuc, event, false);
if (n < 0)
goto out;
fake_cpuc->n_events = n;
ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
out:
free_fake_cpuc(fake_cpuc);
return ret;
}
static int x86_pmu_event_init(struct perf_event *event)
{
struct pmu *tmp;
int err;
switch (event->attr.type) {
case PERF_TYPE_RAW:
case PERF_TYPE_HARDWARE:
case PERF_TYPE_HW_CACHE:
break;
default:
return -ENOENT;
}
err = __x86_pmu_event_init(event);
if (!err) {
/*
* we temporarily connect event to its pmu
* such that validate_group() can classify
* it as an x86 event using is_x86_event()
*/
tmp = event->pmu;
event->pmu = &pmu;
if (event->group_leader != event)
err = validate_group(event);
else
err = validate_event(event);
event->pmu = tmp;
}
if (err) {
if (event->destroy)
event->destroy(event);
}
if (READ_ONCE(x86_pmu.attr_rdpmc) &&
!(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS))
event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;
return err;
}
static void refresh_pce(void *ignored)
{
load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm));
}
static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
{
if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
return;
/*
* This function relies on not being called concurrently in two
* tasks in the same mm. Otherwise one task could observe
* perf_rdpmc_allowed > 1 and return all the way back to
* userspace with CR4.PCE clear while another task is still
* doing on_each_cpu_mask() to propagate CR4.PCE.
*
* For now, this can't happen because all callers hold mmap_sem
* for write. If this changes, we'll need a different solution.
*/
lockdep_assert_held_exclusive(&mm->mmap_sem);
if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1)
on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1);
}
static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
{
if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
return;
if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))
on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1);
}
static int x86_pmu_event_idx(struct perf_event *event)
{
int idx = event->hw.idx;
if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
return 0;
if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) {
idx -= INTEL_PMC_IDX_FIXED;
idx |= 1 << 30;
}
return idx + 1;
}
static ssize_t get_attr_rdpmc(struct device *cdev,
struct device_attribute *attr,
char *buf)
{
return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
}
static ssize_t set_attr_rdpmc(struct device *cdev,
struct device_attribute *attr,
const char *buf, size_t count)
{
unsigned long val;
ssize_t ret;
ret = kstrtoul(buf, 0, &val);
if (ret)
return ret;
if (val > 2)
return -EINVAL;
if (x86_pmu.attr_rdpmc_broken)
return -ENOTSUPP;
if ((val == 2) != (x86_pmu.attr_rdpmc == 2)) {
/*
* Changing into or out of always available, aka
* perf-event-bypassing mode. This path is extremely slow,
* but only root can trigger it, so it's okay.
*/
if (val == 2)
static_branch_inc(&rdpmc_always_available_key);
else
static_branch_dec(&rdpmc_always_available_key);
on_each_cpu(refresh_pce, NULL, 1);
}
x86_pmu.attr_rdpmc = val;
return count;
}
static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
static struct attribute *x86_pmu_attrs[] = {
&dev_attr_rdpmc.attr,
NULL,
};
static struct attribute_group x86_pmu_attr_group __ro_after_init = {
.attrs = x86_pmu_attrs,
};
static ssize_t max_precise_show(struct device *cdev,
struct device_attribute *attr,
char *buf)
{
return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise());
}
static DEVICE_ATTR_RO(max_precise);
static struct attribute *x86_pmu_caps_attrs[] = {
&dev_attr_max_precise.attr,
NULL
};
static struct attribute_group x86_pmu_caps_group __ro_after_init = {
.name = "caps",
.attrs = x86_pmu_caps_attrs,
};
static const struct attribute_group *x86_pmu_attr_groups[] = {
&x86_pmu_attr_group,
&x86_pmu_format_group,
&x86_pmu_events_group,
&x86_pmu_caps_group,
NULL,
};
static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
{
if (x86_pmu.sched_task)
x86_pmu.sched_task(ctx, sched_in);
}
void perf_check_microcode(void)
{
if (x86_pmu.check_microcode)
x86_pmu.check_microcode();
}
static struct pmu pmu = {
.pmu_enable = x86_pmu_enable,
.pmu_disable = x86_pmu_disable,
.attr_groups = x86_pmu_attr_groups,
.event_init = x86_pmu_event_init,
.event_mapped = x86_pmu_event_mapped,
.event_unmapped = x86_pmu_event_unmapped,
.add = x86_pmu_add,
.del = x86_pmu_del,
.start = x86_pmu_start,
.stop = x86_pmu_stop,
.read = x86_pmu_read,
.start_txn = x86_pmu_start_txn,
.cancel_txn = x86_pmu_cancel_txn,
.commit_txn = x86_pmu_commit_txn,
.event_idx = x86_pmu_event_idx,
.sched_task = x86_pmu_sched_task,
.task_ctx_size = sizeof(struct x86_perf_task_context),
};
void arch_perf_update_userpage(struct perf_event *event,
struct perf_event_mmap_page *userpg, u64 now)
{
struct cyc2ns_data data;
u64 offset;
userpg->cap_user_time = 0;
userpg->cap_user_time_zero = 0;
userpg->cap_user_rdpmc =
!!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
userpg->pmc_width = x86_pmu.cntval_bits;
if (!using_native_sched_clock() || !sched_clock_stable())
return;
cyc2ns_read_begin(&data);
offset = data.cyc2ns_offset + __sched_clock_offset;
/*
* Internal timekeeping for enabled/running/stopped times
* is always in the local_clock domain.
*/
userpg->cap_user_time = 1;
userpg->time_mult = data.cyc2ns_mul;
userpg->time_shift = data.cyc2ns_shift;
userpg->time_offset = offset - now;
/*
* cap_user_time_zero doesn't make sense when we're using a different
* time base for the records.
*/
if (!event->attr.use_clockid) {
userpg->cap_user_time_zero = 1;
userpg->time_zero = offset;
}
cyc2ns_read_end();
}
void
perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
{
struct unwind_state state;
unsigned long addr;
if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { /*covered*/
/* TODO: We don't support guest os callchain now */
return;
}
if (perf_callchain_store(entry, regs->ip)) /*covered*/
return;
for (unwind_start(&state, current, regs, NULL); !unwind_done(&state); /*covered*/
unwind_next_frame(&state)) {
addr = unwind_get_return_address(&state); /*covered*/
if (!addr || perf_callchain_store(entry, addr)) /*covered*/
return;
}
}
static inline int
valid_user_frame(const void __user *fp, unsigned long size)
{
return (__range_not_ok(fp, size, TASK_SIZE) == 0); /*covered*/
}
static unsigned long get_segment_base(unsigned int segment)
{
struct desc_struct *desc;
unsigned int idx = segment >> 3;
if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
#ifdef CONFIG_MODIFY_LDT_SYSCALL
struct ldt_struct *ldt;
/* IRQs are off, so this synchronizes with smp_store_release */
ldt = READ_ONCE(current->active_mm->context.ldt);
if (!ldt || idx >= ldt->nr_entries)
return 0;
desc = &ldt->entries[idx];
#else
return 0;
#endif
} else {
if (idx >= GDT_ENTRIES)
return 0;
desc = raw_cpu_ptr(gdt_page.gdt) + idx;
}
return get_desc_base(desc);
}
#ifdef CONFIG_IA32_EMULATION
#include <linux/compat.h>
static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
{
/* 32-bit process in 64-bit kernel. */
unsigned long ss_base, cs_base;
struct stack_frame_ia32 frame;
const void __user *fp;
if (!test_thread_flag(TIF_IA32)) /*covered*/
return 0;
cs_base = get_segment_base(regs->cs);
ss_base = get_segment_base(regs->ss);
fp = compat_ptr(ss_base + regs->bp);
pagefault_disable();
while (entry->nr < entry->max_stack) {
unsigned long bytes;
frame.next_frame = 0;
frame.return_address = 0;
if (!valid_user_frame(fp, sizeof(frame)))
break;
bytes = __copy_from_user_nmi(&frame.next_frame, fp, 4);
if (bytes != 0)
break;
bytes = __copy_from_user_nmi(&frame.return_address, fp+4, 4);
if (bytes != 0)
break;
perf_callchain_store(entry, cs_base + frame.return_address);
fp = compat_ptr(ss_base + frame.next_frame);
}
pagefault_enable();
return 1;
}
#else
static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
{
return 0;
}
#endif
void
perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
{
struct stack_frame frame;
const unsigned long __user *fp;
if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { /*covered*/
/* TODO: We don't support guest os callchain now */
return; /*covered*/
}
/*
* We don't know what to do with VM86 stacks.. ignore them for now.
*/
if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM)) /*covered*/
return;
fp = (unsigned long __user *)regs->bp; /*covered*/
perf_callchain_store(entry, regs->ip); /*covered*/
if (!nmi_uaccess_okay()) /*covered*/
return;
if (perf_callchain_user32(regs, entry)) /*covered*/
return;
pagefault_disable();
while (entry->nr < entry->max_stack) {
unsigned long bytes;
frame.next_frame = NULL; /*covered*/
frame.return_address = 0;
if (!valid_user_frame(fp, sizeof(frame))) /*covered*/
break;
bytes = __copy_from_user_nmi(&frame.next_frame, fp, sizeof(*fp)); /*covered*/
if (bytes != 0)
break;
bytes = __copy_from_user_nmi(&frame.return_address, fp + 1, sizeof(*fp)); /*covered*/
if (bytes != 0)
break;
perf_callchain_store(entry, frame.return_address); /*covered*/
fp = (void __user *)frame.next_frame;
}
pagefault_enable(); /*covered*/
}
/*
* Deal with code segment offsets for the various execution modes:
*
* VM86 - the good olde 16 bit days, where the linear address is
* 20 bits and we use regs->ip + 0x10 * regs->cs.
*
* IA32 - Where we need to look at GDT/LDT segment descriptor tables
* to figure out what the 32bit base address is.
*
* X32 - has TIF_X32 set, but is running in x86_64
*
* X86_64 - CS,DS,SS,ES are all zero based.
*/
static unsigned long code_segment_base(struct pt_regs *regs)
{
/*
* For IA32 we look at the GDT/LDT segment base to convert the
* effective IP to a linear address.
*/
#ifdef CONFIG_X86_32
/*
* If we are in VM86 mode, add the segment offset to convert to a
* linear address.
*/
if (regs->flags & X86_VM_MASK)
return 0x10 * regs->cs;
if (user_mode(regs) && regs->cs != __USER_CS)
return get_segment_base(regs->cs);
#else
if (user_mode(regs) && !user_64bit_mode(regs) &&
regs->cs != __USER32_CS)
return get_segment_base(regs->cs); /*covered*/
#endif
return 0;
}
unsigned long perf_instruction_pointer(struct pt_regs *regs)
{
if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) /*covered*/
return perf_guest_cbs->get_guest_ip();
return regs->ip + code_segment_base(regs); /*covered*/
}
unsigned long perf_misc_flags(struct pt_regs *regs)
{
int misc = 0;
if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { /*covered*/
if (perf_guest_cbs->is_user_mode())
misc |= PERF_RECORD_MISC_GUEST_USER;
else
misc |= PERF_RECORD_MISC_GUEST_KERNEL;
} else {
if (user_mode(regs)) /*covered*/
misc |= PERF_RECORD_MISC_USER;
else
misc |= PERF_RECORD_MISC_KERNEL;
}
if (regs->flags & PERF_EFLAGS_EXACT) /*covered*/
misc |= PERF_RECORD_MISC_EXACT_IP;
return misc; /*covered*/
}
void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
{
cap->version = x86_pmu.version;
cap->num_counters_gp = x86_pmu.num_counters;
cap->num_counters_fixed = x86_pmu.num_counters_fixed;
cap->bit_width_gp = x86_pmu.cntval_bits;
cap->bit_width_fixed = x86_pmu.cntval_bits;
cap->events_mask = (unsigned int)x86_pmu.events_maskl;
cap->events_mask_len = x86_pmu.events_mask_len;
}
EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
/*
* Support Intel RAPL energy consumption counters
* Copyright (C) 2013 Google, Inc., Stephane Eranian
*
* Intel RAPL interface is specified in the IA-32 Manual Vol3b
* section 14.7.1 (September 2013)
*
* RAPL provides more controls than just reporting energy consumption
* however here we only expose the 3 energy consumption free running
* counters (pp0, pkg, dram).
*
* Each of those counters increments in a power unit defined by the
* RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
* but it can vary.
*
* Counter to rapl events mappings:
*
* pp0 counter: consumption of all physical cores (power plane 0)
* event: rapl_energy_cores
* perf code: 0x1
*
* pkg counter: consumption of the whole processor package
* event: rapl_energy_pkg
* perf code: 0x2
*
* dram counter: consumption of the dram domain (servers only)
* event: rapl_energy_dram
* perf code: 0x3
*
* gpu counter: consumption of the builtin-gpu domain (client only)
* event: rapl_energy_gpu
* perf code: 0x4
*
* psys counter: consumption of the builtin-psys domain (client only)
* event: rapl_energy_psys
* perf code: 0x5
*
* We manage those counters as free running (read-only). They may be
* use simultaneously by other tools, such as turbostat.
*
* The events only support system-wide mode counting. There is no
* sampling support because it does not make sense and is not
* supported by the RAPL hardware.
*
* Because we want to avoid floating-point operations in the kernel,
* the events are all reported in fixed point arithmetic (32.32).
* Tools must adjust the counts to convert them to Watts using
* the duration of the measurement. Tools may use a function such as
* ldexp(raw_count, -32);
*/
#define pr_fmt(fmt) "RAPL PMU: " fmt
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
#include "../perf_event.h"
MODULE_LICENSE("GPL");
/*
* RAPL energy status counters
*/
#define RAPL_IDX_PP0_NRG_STAT 0 /* all cores */
#define INTEL_RAPL_PP0 0x1 /* pseudo-encoding */
#define RAPL_IDX_PKG_NRG_STAT 1 /* entire package */
#define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */
#define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */
#define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */
#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */
#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */
#define RAPL_IDX_PSYS_NRG_STAT 4 /* psys */
#define INTEL_RAPL_PSYS 0x5 /* pseudo-encoding */
#define NR_RAPL_DOMAINS 0x5
static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
"pp0-core",
"package",
"dram",
"pp1-gpu",
"psys",
};
/* Clients have PP0, PKG */
#define RAPL_IDX_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\
1<<RAPL_IDX_PKG_NRG_STAT|\
1<<RAPL_IDX_PP1_NRG_STAT)
/* Servers have PP0, PKG, RAM */
#define RAPL_IDX_SRV (1<<RAPL_IDX_PP0_NRG_STAT|\
1<<RAPL_IDX_PKG_NRG_STAT|\
1<<RAPL_IDX_RAM_NRG_STAT)
/* Servers have PP0, PKG, RAM, PP1 */
#define RAPL_IDX_HSW (1<<RAPL_IDX_PP0_NRG_STAT|\
1<<RAPL_IDX_PKG_NRG_STAT|\
1<<RAPL_IDX_RAM_NRG_STAT|\
1<<RAPL_IDX_PP1_NRG_STAT)
/* SKL clients have PP0, PKG, RAM, PP1, PSYS */
#define RAPL_IDX_SKL_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\
1<<RAPL_IDX_PKG_NRG_STAT|\
1<<RAPL_IDX_RAM_NRG_STAT|\
1<<RAPL_IDX_PP1_NRG_STAT|\
1<<RAPL_IDX_PSYS_NRG_STAT)
/* Knights Landing has PKG, RAM */
#define RAPL_IDX_KNL (1<<RAPL_IDX_PKG_NRG_STAT|\
1<<RAPL_IDX_RAM_NRG_STAT)
/*
* event code: LSB 8 bits, passed in attr->config
* any other bit is reserved
*/
#define RAPL_EVENT_MASK 0xFFULL
#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format) \
static ssize_t __rapl_##_var##_show(struct kobject *kobj, \
struct kobj_attribute *attr, \
char *page) \
{ \
BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
return sprintf(page, _format "\n"); \
} \
static struct kobj_attribute format_attr_##_var = \
__ATTR(_name, 0444, __rapl_##_var##_show, NULL)
#define RAPL_CNTR_WIDTH 32
#define RAPL_EVENT_ATTR_STR(_name, v, str) \
static struct perf_pmu_events_attr event_attr_##v = { \
.attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \
.id = 0, \
.event_str = str, \
};
struct rapl_pmu {
raw_spinlock_t lock;
int n_active;
int cpu;
struct list_head active_list;
struct pmu *pmu;
ktime_t timer_interval;
struct hrtimer hrtimer;
};
struct rapl_pmus {
struct pmu pmu;
unsigned int maxpkg;
struct rapl_pmu *pmus[];
};
/* 1/2^hw_unit Joule */
static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
static struct rapl_pmus *rapl_pmus;
static cpumask_t rapl_cpu_mask;
static unsigned int rapl_cntr_mask;
static u64 rapl_timer_ms;
static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
{
unsigned int pkgid = topology_logical_package_id(cpu);
/*
* The unsigned check also catches the '-1' return value for non
* existent mappings in the topology map.
*/
return pkgid < rapl_pmus->maxpkg ? rapl_pmus->pmus[pkgid] : NULL; /*covered*/
}
static inline u64 rapl_read_counter(struct perf_event *event)
{
u64 raw;
rdmsrl(event->hw.event_base, raw); /*covered*/
return raw;
}
static inline u64 rapl_scale(u64 v, int cfg)
{
if (cfg > NR_RAPL_DOMAINS) {
pr_warn("Invalid domain %d, failed to scale data\n", cfg);
return v;
}
/*
* scale delta to smallest unit (1/2^32)
* users must then scale back: count * 1/(1e9*2^32) to get Joules
* or use ldexp(count, -32).
* Watts = Joules/Time delta
*/
return v << (32 - rapl_hw_unit[cfg - 1]); /*covered*/
}
static u64 rapl_event_update(struct perf_event *event)
{ /*covered*/
struct hw_perf_event *hwc = &event->hw;
u64 prev_raw_count, new_raw_count;
s64 delta, sdelta;
int shift = RAPL_CNTR_WIDTH;
again:
prev_raw_count = local64_read(&hwc->prev_count); /*covered*/
rdmsrl(event->hw.event_base, new_raw_count); /*covered*/
if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
new_raw_count) != prev_raw_count) {
cpu_relax();
goto again;
}
/*
* Now we have the new raw value and have updated the prev
* timestamp already. We can now calculate the elapsed delta
* (event-)time and add that to the generic event.
*
* Careful, not all hw sign-extends above the physical width
* of the count.
*/
delta = (new_raw_count << shift) - (prev_raw_count << shift); /*covered*/
delta >>= shift;
sdelta = rapl_scale(delta, event->hw.config); /*covered*/
local64_add(sdelta, &event->count);
return new_raw_count;
}
static void rapl_start_hrtimer(struct rapl_pmu *pmu)
{
hrtimer_start(&pmu->hrtimer, pmu->timer_interval, /*covered*/
HRTIMER_MODE_REL_PINNED);
}
static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{
struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
struct perf_event *event;
unsigned long flags;
if (!pmu->n_active)
return HRTIMER_NORESTART;
raw_spin_lock_irqsave(&pmu->lock, flags);
list_for_each_entry(event, &pmu->active_list, active_entry)
rapl_event_update(event);
raw_spin_unlock_irqrestore(&pmu->lock, flags);
hrtimer_forward_now(hrtimer, pmu->timer_interval);
return HRTIMER_RESTART;
}
static void rapl_hrtimer_init(struct rapl_pmu *pmu)
{
struct hrtimer *hr = &pmu->hrtimer;
hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hr->function = rapl_hrtimer_handle;
}
static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
struct perf_event *event)
{
if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) /*covered*/
return;
event->hw.state = 0; /*covered*/
list_add_tail(&event->active_entry, &pmu->active_list); /*covered*/
local64_set(&event->hw.prev_count, rapl_read_counter(event)); /*covered*/
pmu->n_active++;
if (pmu->n_active == 1) /*covered*/
rapl_start_hrtimer(pmu); /*covered*/
}
static void rapl_pmu_event_start(struct perf_event *event, int mode)
{
struct rapl_pmu *pmu = event->pmu_private;
unsigned long flags;
raw_spin_lock_irqsave(&pmu->lock, flags);
__rapl_pmu_event_start(pmu, event);
raw_spin_unlock_irqrestore(&pmu->lock, flags);
}
static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{
struct rapl_pmu *pmu = event->pmu_private; /*covered*/
struct hw_perf_event *hwc = &event->hw;
unsigned long flags;
raw_spin_lock_irqsave(&pmu->lock, flags);
/* mark event as deactivated and stopped */
if (!(hwc->state & PERF_HES_STOPPED)) {
WARN_ON_ONCE(pmu->n_active <= 0); /*covered*/
pmu->n_active--; /*covered*/
if (pmu->n_active == 0)
hrtimer_cancel(&pmu->hrtimer); /*covered*/
list_del(&event->active_entry); /*covered*/
WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
hwc->state |= PERF_HES_STOPPED; /*covered*/
}
/* check if update of sw counter is necessary */
if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { /*covered*/
/*
* Drain the remaining delta count out of a event
* that we are disabling:
*/
rapl_event_update(event); /*covered*/
hwc->state |= PERF_HES_UPTODATE;
}
raw_spin_unlock_irqrestore(&pmu->lock, flags); /*covered*/
}
static int rapl_pmu_event_add(struct perf_event *event, int mode)
{
struct rapl_pmu *pmu = event->pmu_private; /*covered*/
struct hw_perf_event *hwc = &event->hw;
unsigned long flags;
raw_spin_lock_irqsave(&pmu->lock, flags);
hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
if (mode & PERF_EF_START)
__rapl_pmu_event_start(pmu, event); /*covered*/
raw_spin_unlock_irqrestore(&pmu->lock, flags); /*covered*/
return 0;
}
static void rapl_pmu_event_del(struct perf_event *event, int flags)
{
rapl_pmu_event_stop(event, PERF_EF_UPDATE); /*covered*/
}
static int rapl_pmu_event_init(struct perf_event *event)
{
u64 cfg = event->attr.config & RAPL_EVENT_MASK; /*covered*/
int bit, msr, ret = 0;
struct rapl_pmu *pmu;
/* only look at RAPL events */
if (event->attr.type != rapl_pmus->pmu.type)
return -ENOENT;
/* check only supported bits are set */
if (event->attr.config & ~RAPL_EVENT_MASK) /*covered*/
return -EINVAL;
if (event->cpu < 0) /*covered*/
return -EINVAL;
event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; /*covered*/
/*
* check event is known (determines counter)
*/
switch (cfg) { /*covered*/
case INTEL_RAPL_PP0:
bit = RAPL_IDX_PP0_NRG_STAT;
msr = MSR_PP0_ENERGY_STATUS;
break;
case INTEL_RAPL_PKG:
bit = RAPL_IDX_PKG_NRG_STAT;
msr = MSR_PKG_ENERGY_STATUS;
break;
case INTEL_RAPL_RAM:
bit = RAPL_IDX_RAM_NRG_STAT;
msr = MSR_DRAM_ENERGY_STATUS;
break;
case INTEL_RAPL_PP1:
bit = RAPL_IDX_PP1_NRG_STAT;
msr = MSR_PP1_ENERGY_STATUS;
break;
case INTEL_RAPL_PSYS:
bit = RAPL_IDX_PSYS_NRG_STAT;
msr = MSR_PLATFORM_ENERGY_STATUS;
break;
default:
return -EINVAL;
}
/* check event supported */
if (!(rapl_cntr_mask & (1 << bit)))
return -EINVAL;
/* unsupported modes and filters */
if (event->attr.exclude_user ||
event->attr.exclude_kernel ||
event->attr.exclude_hv ||
event->attr.exclude_idle ||
event->attr.exclude_host || /*covered*/
event->attr.exclude_guest ||
event->attr.sample_period) /* no sampling */ /*covered*/
return -EINVAL;
/* must be done before validate_group */
pmu = cpu_to_rapl_pmu(event->cpu); /*covered*/
if (!pmu) /*covered*/
return -EINVAL;
event->cpu = pmu->cpu;
event->pmu_private = pmu;
event->hw.event_base = msr;
event->hw.config = cfg;
event->hw.idx = bit;
return ret; /*covered*/
}
static void rapl_pmu_event_read(struct perf_event *event)
{
rapl_event_update(event); /*covered*/
}
static ssize_t rapl_get_attr_cpumask(struct device *dev,
struct device_attribute *attr, char *buf)
{
return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
}
static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
static struct attribute *rapl_pmu_attrs[] = {
&dev_attr_cpumask.attr,
NULL,
};
static struct attribute_group rapl_pmu_attr_group = {
.attrs = rapl_pmu_attrs,
};
RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04");
RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05");
RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules");
/*
* we compute in 0.23 nJ increments regardless of MSR
*/
RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10");
static struct attribute *rapl_events_srv_attr[] = {
EVENT_PTR(rapl_cores),
EVENT_PTR(rapl_pkg),
EVENT_PTR(rapl_ram),
EVENT_PTR(rapl_cores_unit),
EVENT_PTR(rapl_pkg_unit),
EVENT_PTR(rapl_ram_unit),
EVENT_PTR(rapl_cores_scale),
EVENT_PTR(rapl_pkg_scale),
EVENT_PTR(rapl_ram_scale),
NULL,
};
static struct attribute *rapl_events_cln_attr[] = {
EVENT_PTR(rapl_cores),
EVENT_PTR(rapl_pkg),
EVENT_PTR(rapl_gpu),
EVENT_PTR(rapl_cores_unit),
EVENT_PTR(rapl_pkg_unit),
EVENT_PTR(rapl_gpu_unit),
EVENT_PTR(rapl_cores_scale),
EVENT_PTR(rapl_pkg_scale),
EVENT_PTR(rapl_gpu_scale),
NULL,
};
static struct attribute *rapl_events_hsw_attr[] = {
EVENT_PTR(rapl_cores),
EVENT_PTR(rapl_pkg),
EVENT_PTR(rapl_gpu),
EVENT_PTR(rapl_ram),
EVENT_PTR(rapl_cores_unit),
EVENT_PTR(rapl_pkg_unit),
EVENT_PTR(rapl_gpu_unit),
EVENT_PTR(rapl_ram_unit),
EVENT_PTR(rapl_cores_scale),
EVENT_PTR(rapl_pkg_scale),
EVENT_PTR(rapl_gpu_scale),
EVENT_PTR(rapl_ram_scale),
NULL,
};
static struct attribute *rapl_events_skl_attr[] = {
EVENT_PTR(rapl_cores),
EVENT_PTR(rapl_pkg),
EVENT_PTR(rapl_gpu),
EVENT_PTR(rapl_ram),
EVENT_PTR(rapl_psys),
EVENT_PTR(rapl_cores_unit),
EVENT_PTR(rapl_pkg_unit),
EVENT_PTR(rapl_gpu_unit),
EVENT_PTR(rapl_ram_unit),
EVENT_PTR(rapl_psys_unit),
EVENT_PTR(rapl_cores_scale),
EVENT_PTR(rapl_pkg_scale),
EVENT_PTR(rapl_gpu_scale),
EVENT_PTR(rapl_ram_scale),
EVENT_PTR(rapl_psys_scale),
NULL,
};
static struct attribute *rapl_events_knl_attr[] = {
EVENT_PTR(rapl_pkg),
EVENT_PTR(rapl_ram),
EVENT_PTR(rapl_pkg_unit),
EVENT_PTR(rapl_ram_unit),
EVENT_PTR(rapl_pkg_scale),
EVENT_PTR(rapl_ram_scale),
NULL,
};
static struct attribute_group rapl_pmu_events_group = {
.name = "events",
.attrs = NULL, /* patched at runtime */
};
DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
static struct attribute *rapl_formats_attr[] = {
&format_attr_event.attr,
NULL,
};
static struct attribute_group rapl_pmu_format_group = {
.name = "format",
.attrs = rapl_formats_attr,
};
static const struct attribute_group *rapl_attr_groups[] = {
&rapl_pmu_attr_group,
&rapl_pmu_format_group,
&rapl_pmu_events_group,
NULL,
};
static int rapl_cpu_offline(unsigned int cpu)
{
struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
int target;
/* Check if exiting cpu is used for collecting rapl events */
if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
return 0;
pmu->cpu = -1;
/* Find a new cpu to collect rapl events */
target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
/* Migrate rapl events to the new target */
if (target < nr_cpu_ids) {
cpumask_set_cpu(target, &rapl_cpu_mask);
pmu->cpu = target;
perf_pmu_migrate_context(pmu->pmu, cpu, target);
}
return 0;
}
static int rapl_cpu_online(unsigned int cpu)
{
struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
int target;
if (!pmu) {
pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
if (!pmu)
return -ENOMEM;
raw_spin_lock_init(&pmu->lock);
INIT_LIST_HEAD(&pmu->active_list);
pmu->pmu = &rapl_pmus->pmu;
pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
rapl_hrtimer_init(pmu);
rapl_pmus->pmus[topology_logical_package_id(cpu)] = pmu;
}
/*
* Check if there is an online cpu in the package which collects rapl
* events already.
*/
target = cpumask_any_and(&rapl_cpu_mask, topology_core_cpumask(cpu));
if (target < nr_cpu_ids)
return 0;
cpumask_set_cpu(cpu, &rapl_cpu_mask);
pmu->cpu = cpu;
return 0;
}
static int rapl_check_hw_unit(bool apply_quirk)
{
u64 msr_rapl_power_unit_bits;
int i;
/* protect rdmsrl() to handle virtualization */
if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
return -1;
for (i = 0; i < NR_RAPL_DOMAINS; i++)
rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
/*
* DRAM domain on HSW server and KNL has fixed energy unit which can be
* different than the unit from power unit MSR. See
* "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2
* of 2. Datasheet, September 2014, Reference Number: 330784-001 "
*/
if (apply_quirk)
rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16;
/*
* Calculate the timer rate:
* Use reference of 200W for scaling the timeout to avoid counter
* overflows. 200W = 200 Joules/sec
* Divide interval by 2 to avoid lockstep (2 * 100)
* if hw unit is 32, then we use 2 ms 1/200/2
*/
rapl_timer_ms = 2;
if (rapl_hw_unit[0] < 32) {
rapl_timer_ms = (1000 / (2 * 100));
rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
}
return 0;
}
static void __init rapl_advertise(void)
{
int i;
pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
hweight32(rapl_cntr_mask), rapl_timer_ms);
for (i = 0; i < NR_RAPL_DOMAINS; i++) {
if (rapl_cntr_mask & (1 << i)) {
pr_info("hw unit of domain %s 2^-%d Joules\n",
rapl_domain_names[i], rapl_hw_unit[i]);
}
}
}
static void cleanup_rapl_pmus(void)
{
int i;
for (i = 0; i < rapl_pmus->maxpkg; i++)
kfree(rapl_pmus->pmus[i]);
kfree(rapl_pmus);
}
static int __init init_rapl_pmus(void)
{
int maxpkg = topology_max_packages();
size_t size;
size = sizeof(*rapl_pmus) + maxpkg * sizeof(struct rapl_pmu *);
rapl_pmus = kzalloc(size, GFP_KERNEL);
if (!rapl_pmus)
return -ENOMEM;
rapl_pmus->maxpkg = maxpkg;
rapl_pmus->pmu.attr_groups = rapl_attr_groups;
rapl_pmus->pmu.task_ctx_nr = perf_invalid_context;
rapl_pmus->pmu.event_init = rapl_pmu_event_init;
rapl_pmus->pmu.add = rapl_pmu_event_add;
rapl_pmus->pmu.del = rapl_pmu_event_del;
rapl_pmus->pmu.start = rapl_pmu_event_start;
rapl_pmus->pmu.stop = rapl_pmu_event_stop;
rapl_pmus->pmu.read = rapl_pmu_event_read;
rapl_pmus->pmu.module = THIS_MODULE;
return 0;
}
#define X86_RAPL_MODEL_MATCH(model, init) \
{ X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&init }
struct intel_rapl_init_fun {
bool apply_quirk;
int cntr_mask;
struct attribute **attrs;
};
static const struct intel_rapl_init_fun snb_rapl_init __initconst = {
.apply_quirk = false,
.cntr_mask = RAPL_IDX_CLN,
.attrs = rapl_events_cln_attr,
};
static const struct intel_rapl_init_fun hsx_rapl_init __initconst = {
.apply_quirk = true,
.cntr_mask = RAPL_IDX_SRV,
.attrs = rapl_events_srv_attr,
};
static const struct intel_rapl_init_fun hsw_rapl_init __initconst = {
.apply_quirk = false,
.cntr_mask = RAPL_IDX_HSW,
.attrs = rapl_events_hsw_attr,
};
static const struct intel_rapl_init_fun snbep_rapl_init __initconst = {
.apply_quirk = false,
.cntr_mask = RAPL_IDX_SRV,
.attrs = rapl_events_srv_attr,
};
static const struct intel_rapl_init_fun knl_rapl_init __initconst = {
.apply_quirk = true,
.cntr_mask = RAPL_IDX_KNL,
.attrs = rapl_events_knl_attr,
};
static const struct intel_rapl_init_fun skl_rapl_init __initconst = {
.apply_quirk = false,
.cntr_mask = RAPL_IDX_SKL_CLN,
.attrs = rapl_events_skl_attr,
};
static const struct x86_cpu_id rapl_cpu_match[] __initconst = {
X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE, snb_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE_X, snbep_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE, snb_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X, snbep_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_CORE, hsw_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_X, hsx_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_ULT, hsw_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_GT3E, hsw_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_CORE, hsw_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_GT3E, hsw_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, hsx_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, hsx_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNM, knl_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE, skl_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP, skl_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, hsx_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_MOBILE, skl_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_CANNONLAKE_MOBILE, skl_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, hsw_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_X, hsw_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS, hsw_rapl_init),
{},
};
MODULE_DEVICE_TABLE(x86cpu, rapl_cpu_match);
static int __init rapl_pmu_init(void)
{
const struct x86_cpu_id *id;
struct intel_rapl_init_fun *rapl_init;
bool apply_quirk;
int ret;
id = x86_match_cpu(rapl_cpu_match);
if (!id)
return -ENODEV;
rapl_init = (struct intel_rapl_init_fun *)id->driver_data;
apply_quirk = rapl_init->apply_quirk;
rapl_cntr_mask = rapl_init->cntr_mask;
rapl_pmu_events_group.attrs = rapl_init->attrs;
ret = rapl_check_hw_unit(apply_quirk);
if (ret)
return ret;
ret = init_rapl_pmus();
if (ret)
return ret;
/*
* Install callbacks. Core will call them for each online cpu.
*/
ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE,
"perf/x86/rapl:online",
rapl_cpu_online, rapl_cpu_offline);
if (ret)
goto out;
ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
if (ret)
goto out1;
rapl_advertise();
return 0;
out1:
cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE);
out:
pr_warn("Initialization failed (%d), disabled\n", ret);
cleanup_rapl_pmus();
return ret;
}
module_init(rapl_pmu_init);
static void __exit intel_rapl_exit(void)
{
cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE);
perf_pmu_unregister(&rapl_pmus->pmu);
cleanup_rapl_pmus();
}
module_exit(intel_rapl_exit);
// SPDX-License-Identifier: GPL-2.0
#include <linux/perf_event.h>
#include <linux/nospec.h>
#include <asm/intel-family.h>
enum perf_msr_id {
PERF_MSR_TSC = 0,
PERF_MSR_APERF = 1,
PERF_MSR_MPERF = 2,
PERF_MSR_PPERF = 3,
PERF_MSR_SMI = 4,
PERF_MSR_PTSC = 5,
PERF_MSR_IRPERF = 6,
PERF_MSR_THERM = 7,
PERF_MSR_THERM_SNAP = 8,
PERF_MSR_THERM_UNIT = 9,
PERF_MSR_EVENT_MAX,
};
static bool test_aperfmperf(int idx)
{
return boot_cpu_has(X86_FEATURE_APERFMPERF);
}
static bool test_ptsc(int idx)
{
return boot_cpu_has(X86_FEATURE_PTSC);
}
static bool test_irperf(int idx)
{
return boot_cpu_has(X86_FEATURE_IRPERF);
}
static bool test_therm_status(int idx)
{
return boot_cpu_has(X86_FEATURE_DTHERM);
}
static bool test_intel(int idx)
{
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
boot_cpu_data.x86 != 6)
return false;
switch (boot_cpu_data.x86_model) {
case INTEL_FAM6_NEHALEM:
case INTEL_FAM6_NEHALEM_G:
case INTEL_FAM6_NEHALEM_EP:
case INTEL_FAM6_NEHALEM_EX:
case INTEL_FAM6_WESTMERE:
case INTEL_FAM6_WESTMERE_EP:
case INTEL_FAM6_WESTMERE_EX:
case INTEL_FAM6_SANDYBRIDGE:
case INTEL_FAM6_SANDYBRIDGE_X:
case INTEL_FAM6_IVYBRIDGE:
case INTEL_FAM6_IVYBRIDGE_X:
case INTEL_FAM6_HASWELL_CORE:
case INTEL_FAM6_HASWELL_X:
case INTEL_FAM6_HASWELL_ULT:
case INTEL_FAM6_HASWELL_GT3E:
case INTEL_FAM6_BROADWELL_CORE:
case INTEL_FAM6_BROADWELL_XEON_D:
case INTEL_FAM6_BROADWELL_GT3E:
case INTEL_FAM6_BROADWELL_X:
case INTEL_FAM6_ATOM_SILVERMONT:
case INTEL_FAM6_ATOM_SILVERMONT_X:
case INTEL_FAM6_ATOM_AIRMONT:
case INTEL_FAM6_ATOM_GOLDMONT:
case INTEL_FAM6_ATOM_GOLDMONT_X:
case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
case INTEL_FAM6_XEON_PHI_KNL:
case INTEL_FAM6_XEON_PHI_KNM:
if (idx == PERF_MSR_SMI)
return true;
break;
case INTEL_FAM6_SKYLAKE_MOBILE:
case INTEL_FAM6_SKYLAKE_DESKTOP:
case INTEL_FAM6_SKYLAKE_X:
case INTEL_FAM6_KABYLAKE_MOBILE:
case INTEL_FAM6_KABYLAKE_DESKTOP:
if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
return true;
break;
}
return false;
}
struct perf_msr {
u64 msr;
struct perf_pmu_events_attr *attr;
bool (*test)(int idx);
};
PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00" );
PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01" );
PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02" );
PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03" );
PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04" );
PMU_EVENT_ATTR_STRING(ptsc, evattr_ptsc, "event=0x05" );
PMU_EVENT_ATTR_STRING(irperf, evattr_irperf, "event=0x06" );
PMU_EVENT_ATTR_STRING(cpu_thermal_margin, evattr_therm, "event=0x07" );
PMU_EVENT_ATTR_STRING(cpu_thermal_margin.snapshot, evattr_therm_snap, "1" );
PMU_EVENT_ATTR_STRING(cpu_thermal_margin.unit, evattr_therm_unit, "C" );
static struct perf_msr msr[] = {
[PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, },
[PERF_MSR_APERF] = { MSR_IA32_APERF, &evattr_aperf, test_aperfmperf, },
[PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, },
[PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, },
[PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, },
[PERF_MSR_PTSC] = { MSR_F15H_PTSC, &evattr_ptsc, test_ptsc, },
[PERF_MSR_IRPERF] = { MSR_F17H_IRPERF, &evattr_irperf, test_irperf, },
[PERF_MSR_THERM] = { MSR_IA32_THERM_STATUS, &evattr_therm, test_therm_status, },
[PERF_MSR_THERM_SNAP] = { MSR_IA32_THERM_STATUS, &evattr_therm_snap, test_therm_status, },
[PERF_MSR_THERM_UNIT] = { MSR_IA32_THERM_STATUS, &evattr_therm_unit, test_therm_status, },
};
static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = {
NULL,
};
static struct attribute_group events_attr_group = {
.name = "events",
.attrs = events_attrs,
};
PMU_FORMAT_ATTR(event, "config:0-63");
static struct attribute *format_attrs[] = {
&format_attr_event.attr,
NULL,
};
static struct attribute_group format_attr_group = {
.name = "format",
.attrs = format_attrs,
};
static const struct attribute_group *attr_groups[] = {
&events_attr_group,
&format_attr_group,
NULL,
};
static int msr_event_init(struct perf_event *event)
{
u64 cfg = event->attr.config; /*covered*/
if (event->attr.type != event->pmu->type)
return -ENOENT;
/* unsupported modes and filters */
if (event->attr.exclude_user ||
event->attr.exclude_kernel ||
event->attr.exclude_hv ||
event->attr.exclude_idle ||
event->attr.exclude_host || /*covered*/
event->attr.exclude_guest ||
event->attr.sample_period) /* no sampling */ /*covered*/
return -EINVAL;
if (cfg >= PERF_MSR_EVENT_MAX) /*covered*/
return -EINVAL;
cfg = array_index_nospec((unsigned long)cfg, PERF_MSR_EVENT_MAX); /*covered*/
if (!msr[cfg].attr)
return -EINVAL;
event->hw.idx = -1; /*covered*/
event->hw.event_base = msr[cfg].msr;
event->hw.config = cfg;
return 0; /*covered*/
}
static inline u64 msr_read_counter(struct perf_event *event) /*covered*/
{
u64 now;
if (event->hw.event_base)
rdmsrl(event->hw.event_base, now);
else
now = rdtsc_ordered(); /*covered*/
return now;
}
static void msr_event_update(struct perf_event *event)
{
u64 prev, now;
s64 delta;
/* Careful, an NMI might modify the previous event value: */
again:
prev = local64_read(&event->hw.prev_count); /*covered*/
now = msr_read_counter(event); /*covered*/
if (local64_cmpxchg(&event->hw.prev_count, prev, now) != prev) /*covered*/
goto again;
delta = now - prev; /*covered*/
if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) {
delta = sign_extend64(delta, 31);
local64_add(delta, &event->count);
} else if (unlikely(event->hw.event_base == MSR_IA32_THERM_STATUS)) { /*covered*/
/* If valid, extract digital readout, otherwise set to -1: */
now = now & (1ULL << 31) ? (now >> 16) & 0x3f : -1;
local64_set(&event->count, now);
} else {
local64_add(delta, &event->count); /*covered*/
}
} /*covered*/
static void msr_event_start(struct perf_event *event, int flags)
{
u64 now = msr_read_counter(event); /*covered*/
local64_set(&event->hw.prev_count, now); /*covered*/
}
static void msr_event_stop(struct perf_event *event, int flags)
{
msr_event_update(event);
} /*covered*/
static void msr_event_del(struct perf_event *event, int flags)
{
msr_event_stop(event, PERF_EF_UPDATE);
}
static int msr_event_add(struct perf_event *event, int flags)
{
if (flags & PERF_EF_START) /*covered*/
msr_event_start(event, flags); /*covered*/
return 0; /*covered*/
}
static struct pmu pmu_msr = {
.task_ctx_nr = perf_sw_context,
.attr_groups = attr_groups,
.event_init = msr_event_init,
.add = msr_event_add,
.del = msr_event_del,
.start = msr_event_start,
.stop = msr_event_stop,
.read = msr_event_update,
.capabilities = PERF_PMU_CAP_NO_INTERRUPT,
};
static int __init msr_init(void)
{
int i, j = 0;
if (!boot_cpu_has(X86_FEATURE_TSC)) {
pr_cont("no MSR PMU driver.\n");
return 0;
}
/* Probe the MSRs. */
for (i = PERF_MSR_TSC + 1; i < PERF_MSR_EVENT_MAX; i++) {
u64 val;
/* Virt sucks; you cannot tell if a R/O MSR is present :/ */
if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val))
msr[i].attr = NULL;
}
/* List remaining MSRs in the sysfs attrs. */
for (i = 0; i < PERF_MSR_EVENT_MAX; i++) {
if (msr[i].attr)
events_attrs[j++] = &msr[i].attr->attr.attr;
}
events_attrs[j] = NULL;
perf_pmu_register(&pmu_msr, "msr", -1);
return 0;
}
device_initcall(msr_init);
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_HWEIGHT_H
#define _ASM_X86_HWEIGHT_H
#include <asm/cpufeatures.h>
#ifdef CONFIG_64BIT
#define REG_IN "D"
#define REG_OUT "a"
#else
#define REG_IN "a"
#define REG_OUT "a"
#endif
#define __HAVE_ARCH_SW_HWEIGHT
static __always_inline unsigned int __arch_hweight32(unsigned int w)
{
unsigned int res;
asm (ALTERNATIVE("call __sw_hweight32", "popcntl %1, %0", X86_FEATURE_POPCNT)
: "="REG_OUT (res)
: REG_IN (w));
return res;
}
static inline unsigned int __arch_hweight16(unsigned int w)
{
return __arch_hweight32(w & 0xffff);
}
static inline unsigned int __arch_hweight8(unsigned int w)
{
return __arch_hweight32(w & 0xff);
}
#ifdef CONFIG_X86_32
static inline unsigned long __arch_hweight64(__u64 w)
{
return __arch_hweight32((u32)w) +
__arch_hweight32((u32)(w >> 32));
}
#else
static __always_inline unsigned long __arch_hweight64(__u64 w)
{
unsigned long res;
asm (ALTERNATIVE("call __sw_hweight64", "popcntq %1, %0", X86_FEATURE_POPCNT) /*covered*/
: "="REG_OUT (res)
: REG_IN (w));
return res;
}
#endif /* CONFIG_X86_32 */
#endif
/*
* This file is part of the Linux kernel.
*
* Copyright (c) 2011-2014, Intel Corporation
* Authors: Fenghua Yu <fenghua.yu@intel.com>,
* H. Peter Anvin <hpa@linux.intel.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
*
*/
#ifndef ASM_X86_ARCHRANDOM_H
#define ASM_X86_ARCHRANDOM_H
#include <asm/processor.h>
#include <asm/cpufeature.h>
#define RDRAND_RETRY_LOOPS 10
#define RDRAND_INT ".byte 0x0f,0xc7,0xf0"
#define RDSEED_INT ".byte 0x0f,0xc7,0xf8"
#ifdef CONFIG_X86_64
# define RDRAND_LONG ".byte 0x48,0x0f,0xc7,0xf0"
# define RDSEED_LONG ".byte 0x48,0x0f,0xc7,0xf8"
#else
# define RDRAND_LONG RDRAND_INT
# define RDSEED_LONG RDSEED_INT
#endif
/* Unconditional execution of RDRAND and RDSEED */
static inline bool rdrand_long(unsigned long *v)
{
bool ok;
unsigned int retry = RDRAND_RETRY_LOOPS;
do {
asm volatile(RDRAND_LONG /*covered*/
CC_SET(c)
: CC_OUT(c) (ok), "=a" (*v));
if (ok)
return true;
} while (--retry);
return false;
}
static inline bool rdrand_int(unsigned int *v)
{
bool ok;
unsigned int retry = RDRAND_RETRY_LOOPS;
do {
asm volatile(RDRAND_INT /*covered*/
CC_SET(c)
: CC_OUT(c) (ok), "=a" (*v));
if (ok)
return true;
} while (--retry);
return false;
}
static inline bool rdseed_long(unsigned long *v)
{
bool ok;
asm volatile(RDSEED_LONG
CC_SET(c)
: CC_OUT(c) (ok), "=a" (*v));
return ok;
}
static inline bool rdseed_int(unsigned int *v)
{
bool ok;
asm volatile(RDSEED_INT
CC_SET(c)
: CC_OUT(c) (ok), "=a" (*v));
return ok;
}
/* Conditional execution based on CPU type */
#define arch_has_random() static_cpu_has(X86_FEATURE_RDRAND)
#define arch_has_random_seed() static_cpu_has(X86_FEATURE_RDSEED)
/*
* These are the generic interfaces; they must not be declared if the
* stubs in <linux/random.h> are to be invoked,
* i.e. CONFIG_ARCH_RANDOM is not defined.
*/
#ifdef CONFIG_ARCH_RANDOM
static inline bool arch_get_random_long(unsigned long *v) /*covered*/
{
return arch_has_random() ? rdrand_long(v) : false; /*covered*/
}
static inline bool arch_get_random_int(unsigned int *v) /*covered*/
{
return arch_has_random() ? rdrand_int(v) : false; /*covered*/
}
static inline bool arch_get_random_seed_long(unsigned long *v)
{
return arch_has_random_seed() ? rdseed_long(v) : false;
}
static inline bool arch_get_random_seed_int(unsigned int *v)
{
return arch_has_random_seed() ? rdseed_int(v) : false;
}
extern void x86_init_rdrand(struct cpuinfo_x86 *c);
#else /* !CONFIG_ARCH_RANDOM */
static inline void x86_init_rdrand(struct cpuinfo_x86 *c) { }
#endif /* !CONFIG_ARCH_RANDOM */
#endif /* ASM_X86_ARCHRANDOM_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_ATOMIC_H
#define _ASM_X86_ATOMIC_H
#include <linux/compiler.h>
#include <linux/types.h>
#include <asm/alternative.h>
#include <asm/cmpxchg.h>
#include <asm/rmwcc.h>
#include <asm/barrier.h>
/*
* Atomic operations that C can't guarantee us. Useful for
* resource counting etc..
*/
#define ATOMIC_INIT(i) { (i) }
/**
* arch_atomic_read - read atomic variable
* @v: pointer of type atomic_t
*
* Atomically reads the value of @v.
*/
static __always_inline int arch_atomic_read(const atomic_t *v)
{
/*
* Note for KASAN: we deliberately don't use READ_ONCE_NOCHECK() here,
* it's non-inlined function that increases binary size and stack usage.
*/
return READ_ONCE((v)->counter); /*covered*/
}
/**
* arch_atomic_set - set atomic variable
* @v: pointer of type atomic_t
* @i: required value
*
* Atomically sets the value of @v to @i.
*/
static __always_inline void arch_atomic_set(atomic_t *v, int i)
{
WRITE_ONCE(v->counter, i); /*covered*/
}
/**
* arch_atomic_add - add integer to atomic variable
* @i: integer value to add
* @v: pointer of type atomic_t
*
* Atomically adds @i to @v.
*/
static __always_inline void arch_atomic_add(int i, atomic_t *v)
{
asm volatile(LOCK_PREFIX "addl %1,%0"
: "+m" (v->counter)
: "ir" (i));
}
/**
* arch_atomic_sub - subtract integer from atomic variable
* @i: integer value to subtract
* @v: pointer of type atomic_t
*
* Atomically subtracts @i from @v.
*/
static __always_inline void arch_atomic_sub(int i, atomic_t *v)
{
asm volatile(LOCK_PREFIX "subl %1,%0"
: "+m" (v->counter)
: "ir" (i));
}
/**
* arch_atomic_sub_and_test - subtract value from variable and test result
* @i: integer value to subtract
* @v: pointer of type atomic_t
*
* Atomically subtracts @i from @v and returns
* true if the result is zero, or false for all
* other cases.
*/
static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v)
{
return GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, e, "er", i);
}
#define arch_atomic_sub_and_test arch_atomic_sub_and_test
/**
* arch_atomic_inc - increment atomic variable
* @v: pointer of type atomic_t
*
* Atomically increments @v by 1.
*/
static __always_inline void arch_atomic_inc(atomic_t *v)
{
asm volatile(LOCK_PREFIX "incl %0"
: "+m" (v->counter));
}
#define arch_atomic_inc arch_atomic_inc
/**
* arch_atomic_dec - decrement atomic variable
* @v: pointer of type atomic_t
*
* Atomically decrements @v by 1.
*/
static __always_inline void arch_atomic_dec(atomic_t *v)
{
asm volatile(LOCK_PREFIX "decl %0" /*covered*/
: "+m" (v->counter));
}
#define arch_atomic_dec arch_atomic_dec
/**
* arch_atomic_dec_and_test - decrement and test
* @v: pointer of type atomic_t
*
* Atomically decrements @v by 1 and
* returns true if the result is 0, or false for all other
* cases.
*/
static __always_inline bool arch_atomic_dec_and_test(atomic_t *v)
{
return GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, e);
}
#define arch_atomic_dec_and_test arch_atomic_dec_and_test
/**
* arch_atomic_inc_and_test - increment and test
* @v: pointer of type atomic_t
*
* Atomically increments @v by 1
* and returns true if the result is zero, or false for all
* other cases.
*/
static __always_inline bool arch_atomic_inc_and_test(atomic_t *v)
{
return GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, e);
}
#define arch_atomic_inc_and_test arch_atomic_inc_and_test
/**
* arch_atomic_add_negative - add and test if negative
* @i: integer value to add
* @v: pointer of type atomic_t
*
* Atomically adds @i to @v and returns true
* if the result is negative, or false when
* result is greater than or equal to zero.
*/
static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v)
{
return GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, s, "er", i);
}
#define arch_atomic_add_negative arch_atomic_add_negative
/**
* arch_atomic_add_return - add integer and return
* @i: integer value to add
* @v: pointer of type atomic_t
*
* Atomically adds @i to @v and returns @i + @v
*/
static __always_inline int arch_atomic_add_return(int i, atomic_t *v)
{
return i + xadd(&v->counter, i);
}
/**
* arch_atomic_sub_return - subtract integer and return
* @v: pointer of type atomic_t
* @i: integer value to subtract
*
* Atomically subtracts @i from @v and returns @v - @i
*/
static __always_inline int arch_atomic_sub_return(int i, atomic_t *v)
{
return arch_atomic_add_return(-i, v);
}
static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v)
{
return xadd(&v->counter, i);
}
static __always_inline int arch_atomic_fetch_sub(int i, atomic_t *v)
{
return xadd(&v->counter, -i);
}
static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new)
{
return arch_cmpxchg(&v->counter, old, new);
}
#define arch_atomic_try_cmpxchg arch_atomic_try_cmpxchg
static __always_inline bool arch_atomic_try_cmpxchg(atomic_t *v, int *old, int new)
{
return try_cmpxchg(&v->counter, old, new); /*covered*/
}
static inline int arch_atomic_xchg(atomic_t *v, int new)
{
return arch_xchg(&v->counter, new);
}
static inline void arch_atomic_and(int i, atomic_t *v)
{
asm volatile(LOCK_PREFIX "andl %1,%0"
: "+m" (v->counter)
: "ir" (i)
: "memory");
}
static inline int arch_atomic_fetch_and(int i, atomic_t *v)
{
int val = arch_atomic_read(v);
do { } while (!arch_atomic_try_cmpxchg(v, &val, val & i));
return val;
}
static inline void arch_atomic_or(int i, atomic_t *v)
{
asm volatile(LOCK_PREFIX "orl %1,%0"
: "+m" (v->counter)
: "ir" (i)
: "memory");
}
static inline int arch_atomic_fetch_or(int i, atomic_t *v)
{
int val = arch_atomic_read(v);
do { } while (!arch_atomic_try_cmpxchg(v, &val, val | i));
return val;
}
static inline void arch_atomic_xor(int i, atomic_t *v)
{
asm volatile(LOCK_PREFIX "xorl %1,%0"
: "+m" (v->counter)
: "ir" (i)
: "memory");
}
static inline int arch_atomic_fetch_xor(int i, atomic_t *v)
{
int val = arch_atomic_read(v);
do { } while (!arch_atomic_try_cmpxchg(v, &val, val ^ i));
return val;
}
#ifdef CONFIG_X86_32
# include <asm/atomic64_32.h>
#else
# include <asm/atomic64_64.h>
#endif
#include <asm-generic/atomic-instrumented.h>
#endif /* _ASM_X86_ATOMIC_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_ATOMIC64_64_H
#define _ASM_X86_ATOMIC64_64_H
#include <linux/types.h>
#include <asm/alternative.h>
#include <asm/cmpxchg.h>
/* The 64-bit atomic type */
#define ATOMIC64_INIT(i) { (i) }
/**
* arch_atomic64_read - read atomic64 variable
* @v: pointer of type atomic64_t
*
* Atomically reads the value of @v.
* Doesn't imply a read memory barrier.
*/
static inline long arch_atomic64_read(const atomic64_t *v)
{
return READ_ONCE((v)->counter); /*covered*/
}
/**
* arch_atomic64_set - set atomic64 variable
* @v: pointer to type atomic64_t
* @i: required value
*
* Atomically sets the value of @v to @i.
*/
static inline void arch_atomic64_set(atomic64_t *v, long i)
{
WRITE_ONCE(v->counter, i);
}
/**
* arch_atomic64_add - add integer to atomic64 variable
* @i: integer value to add
* @v: pointer to type atomic64_t
*
* Atomically adds @i to @v.
*/
static __always_inline void arch_atomic64_add(long i, atomic64_t *v)
{
asm volatile(LOCK_PREFIX "addq %1,%0" /*covered*/
: "=m" (v->counter)
: "er" (i), "m" (v->counter));
}
/**
* arch_atomic64_sub - subtract the atomic64 variable
* @i: integer value to subtract
* @v: pointer to type atomic64_t
*
* Atomically subtracts @i from @v.
*/
static inline void arch_atomic64_sub(long i, atomic64_t *v)
{
asm volatile(LOCK_PREFIX "subq %1,%0"
: "=m" (v->counter)
: "er" (i), "m" (v->counter));
}
/**
* arch_atomic64_sub_and_test - subtract value from variable and test result
* @i: integer value to subtract
* @v: pointer to type atomic64_t
*
* Atomically subtracts @i from @v and returns
* true if the result is zero, or false for all
* other cases.
*/
static inline bool arch_atomic64_sub_and_test(long i, atomic64_t *v)
{
return GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, e, "er", i);
}
#define arch_atomic64_sub_and_test arch_atomic64_sub_and_test
/**
* arch_atomic64_inc - increment atomic64 variable
* @v: pointer to type atomic64_t
*
* Atomically increments @v by 1.
*/
static __always_inline void arch_atomic64_inc(atomic64_t *v)
{
asm volatile(LOCK_PREFIX "incq %0"
: "=m" (v->counter)
: "m" (v->counter));
}
#define arch_atomic64_inc arch_atomic64_inc
/**
* arch_atomic64_dec - decrement atomic64 variable
* @v: pointer to type atomic64_t
*
* Atomically decrements @v by 1.
*/
static __always_inline void arch_atomic64_dec(atomic64_t *v)
{
asm volatile(LOCK_PREFIX "decq %0"
: "=m" (v->counter)
: "m" (v->counter));
}
#define arch_atomic64_dec arch_atomic64_dec
/**
* arch_atomic64_dec_and_test - decrement and test
* @v: pointer to type atomic64_t
*
* Atomically decrements @v by 1 and
* returns true if the result is 0, or false for all other
* cases.
*/
static inline bool arch_atomic64_dec_and_test(atomic64_t *v)
{
return GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, e);
}
#define arch_atomic64_dec_and_test arch_atomic64_dec_and_test
/**
* arch_atomic64_inc_and_test - increment and test
* @v: pointer to type atomic64_t
*
* Atomically increments @v by 1
* and returns true if the result is zero, or false for all
* other cases.
*/
static inline bool arch_atomic64_inc_and_test(atomic64_t *v)
{
return GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, e);
}
#define arch_atomic64_inc_and_test arch_atomic64_inc_and_test
/**
* arch_atomic64_add_negative - add and test if negative
* @i: integer value to add
* @v: pointer to type atomic64_t
*
* Atomically adds @i to @v and returns true
* if the result is negative, or false when
* result is greater than or equal to zero.
*/
static inline bool arch_atomic64_add_negative(long i, atomic64_t *v)
{
return GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, s, "er", i);
}
#define arch_atomic64_add_negative arch_atomic64_add_negative
/**
* arch_atomic64_add_return - add and return
* @i: integer value to add
* @v: pointer to type atomic64_t
*
* Atomically adds @i to @v and returns @i + @v
*/
static __always_inline long arch_atomic64_add_return(long i, atomic64_t *v)
{
return i + xadd(&v->counter, i);
}
static inline long arch_atomic64_sub_return(long i, atomic64_t *v)
{
return arch_atomic64_add_return(-i, v);
}
static inline long arch_atomic64_fetch_add(long i, atomic64_t *v)
{
return xadd(&v->counter, i);
}
static inline long arch_atomic64_fetch_sub(long i, atomic64_t *v)
{
return xadd(&v->counter, -i);
}
static inline long arch_atomic64_cmpxchg(atomic64_t *v, long old, long new)
{
return arch_cmpxchg(&v->counter, old, new);
}
#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg
static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, long new)
{
return try_cmpxchg(&v->counter, old, new);
}
static inline long arch_atomic64_xchg(atomic64_t *v, long new)
{
return arch_xchg(&v->counter, new);
}
static inline void arch_atomic64_and(long i, atomic64_t *v)
{
asm volatile(LOCK_PREFIX "andq %1,%0"
: "+m" (v->counter)
: "er" (i)
: "memory");
}
static inline long arch_atomic64_fetch_and(long i, atomic64_t *v)
{
s64 val = arch_atomic64_read(v);
do {
} while (!arch_atomic64_try_cmpxchg(v, &val, val & i));
return val;
}
static inline void arch_atomic64_or(long i, atomic64_t *v)
{
asm volatile(LOCK_PREFIX "orq %1,%0"
: "+m" (v->counter)
: "er" (i)
: "memory");
}
static inline long arch_atomic64_fetch_or(long i, atomic64_t *v)
{
s64 val = arch_atomic64_read(v);
do {
} while (!arch_atomic64_try_cmpxchg(v, &val, val | i));
return val;
}
static inline void arch_atomic64_xor(long i, atomic64_t *v)
{
asm volatile(LOCK_PREFIX "xorq %1,%0"
: "+m" (v->counter)
: "er" (i)
: "memory");
}
static inline long arch_atomic64_fetch_xor(long i, atomic64_t *v)
{
s64 val = arch_atomic64_read(v);
do {
} while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i));
return val;
}
#endif /* _ASM_X86_ATOMIC64_64_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_BARRIER_H
#define _ASM_X86_BARRIER_H
#include <asm/alternative.h>
#include <asm/nops.h>
/*
* Force strict CPU ordering.
* And yes, this might be required on UP too when we're talking
* to devices.
*/
#ifdef CONFIG_X86_32
#define mb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "mfence", \
X86_FEATURE_XMM2) ::: "memory", "cc")
#define rmb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "lfence", \
X86_FEATURE_XMM2) ::: "memory", "cc")
#define wmb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "sfence", \
X86_FEATURE_XMM2) ::: "memory", "cc")
#else
#define mb() asm volatile("mfence":::"memory")
#define rmb() asm volatile("lfence":::"memory")
#define wmb() asm volatile("sfence" ::: "memory")
#endif
/**
* array_index_mask_nospec() - generate a mask that is ~0UL when the
* bounds check succeeds and 0 otherwise
* @index: array element index
* @size: number of elements in array
*
* Returns:
* 0 - (index < size)
*/
static inline unsigned long array_index_mask_nospec(unsigned long index,
unsigned long size)
{
unsigned long mask;
asm volatile ("cmp %1,%2; sbb %0,%0;" /*covered*/
:"=r" (mask)
:"g"(size),"r" (index)
:"cc");
return mask;
}
/* Override the default implementation from linux/nospec.h. */
#define array_index_mask_nospec array_index_mask_nospec
/* Prevent speculative execution past this barrier. */
#define barrier_nospec() alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, \
"lfence", X86_FEATURE_LFENCE_RDTSC)
#define dma_rmb() barrier()
#define dma_wmb() barrier()
#ifdef CONFIG_X86_32
#define __smp_mb() asm volatile("lock; addl $0,-4(%%esp)" ::: "memory", "cc")
#else
#define __smp_mb() asm volatile("lock; addl $0,-4(%%rsp)" ::: "memory", "cc")
#endif
#define __smp_rmb() dma_rmb()
#define __smp_wmb() barrier()
#define __smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0)
#define __smp_store_release(p, v) \
do { \
compiletime_assert_atomic_type(*p); \
barrier(); \
WRITE_ONCE(*p, v); \
} while (0)
#define __smp_load_acquire(p) \
({ \
typeof(*p) ___p1 = READ_ONCE(*p); \
compiletime_assert_atomic_type(*p); \
barrier(); \
___p1; \
})
/* Atomic operations are already serializing on x86 */
#define __smp_mb__before_atomic() barrier()
#define __smp_mb__after_atomic() barrier()
#include <asm-generic/barrier.h>
#endif /* _ASM_X86_BARRIER_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_BITOPS_H
#define _ASM_X86_BITOPS_H
/*
* Copyright 1992, Linus Torvalds.
*
* Note: inlines with more than a single statement should be marked
* __always_inline to avoid problems with older gcc's inlining heuristics.
*/
#ifndef _LINUX_BITOPS_H
#error only <linux/bitops.h> can be included directly
#endif
#include <linux/compiler.h>
#include <asm/alternative.h>
#include <asm/rmwcc.h>
#include <asm/barrier.h>
#if BITS_PER_LONG == 32
# define _BITOPS_LONG_SHIFT 5
#elif BITS_PER_LONG == 64
# define _BITOPS_LONG_SHIFT 6
#else
# error "Unexpected BITS_PER_LONG"
#endif
#define BIT_64(n) (U64_C(1) << (n))
/*
* These have to be done with inline assembly: that way the bit-setting
* is guaranteed to be atomic. All bit operations return 0 if the bit
* was cleared before the operation and != 0 if it was not.
*
* bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
*/
#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1)
/* Technically wrong, but this avoids compilation errors on some gcc
versions. */
#define BITOP_ADDR(x) "=m" (*(volatile long *) (x))
#else
#define BITOP_ADDR(x) "+m" (*(volatile long *) (x))
#endif
#define ADDR BITOP_ADDR(addr)
/*
* We do the locked ops that don't return the old value as
* a mask operation on a byte.
*/
#define IS_IMMEDIATE(nr) (__builtin_constant_p(nr))
#define CONST_MASK_ADDR(nr, addr) BITOP_ADDR((void *)(addr) + ((nr)>>3))
#define CONST_MASK(nr) (1 << ((nr) & 7))
/**
* set_bit - Atomically set a bit in memory
* @nr: the bit to set
* @addr: the address to start counting from
*
* This function is atomic and may not be reordered. See __set_bit()
* if you do not require the atomic guarantees.
*
* Note: there are no guarantees that this function will not be reordered
* on non x86 architectures, so if you are writing portable code,
* make sure not to rely on its reordering guarantees.
*
* Note that @nr may be almost arbitrarily large; this function is not
* restricted to acting on a single-word quantity.
*/
static __always_inline void
set_bit(long nr, volatile unsigned long *addr)
{
if (IS_IMMEDIATE(nr)) {
asm volatile(LOCK_PREFIX "orb %1,%0" /*covered*/
: CONST_MASK_ADDR(nr, addr) /*covered*/
: "iq" ((u8)CONST_MASK(nr))
: "memory");
} else {
asm volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0" /*covered*/
: BITOP_ADDR(addr) : "Ir" (nr) : "memory");
}
}
/**
* __set_bit - Set a bit in memory
* @nr: the bit to set
* @addr: the address to start counting from
*
* Unlike set_bit(), this function is non-atomic and may be reordered.
* If it's called on the same region of memory simultaneously, the effect
* may be that only one operation succeeds.
*/
static __always_inline void __set_bit(long nr, volatile unsigned long *addr)
{
asm volatile(__ASM_SIZE(bts) " %1,%0" : ADDR : "Ir" (nr) : "memory"); /*covered*/
}
/**
* clear_bit - Clears a bit in memory
* @nr: Bit to clear
* @addr: Address to start counting from
*
* clear_bit() is atomic and may not be reordered. However, it does
* not contain a memory barrier, so if it is used for locking purposes,
* you should call smp_mb__before_atomic() and/or smp_mb__after_atomic()
* in order to ensure changes are visible on other processors.
*/
static __always_inline void
clear_bit(long nr, volatile unsigned long *addr)
{
if (IS_IMMEDIATE(nr)) {
asm volatile(LOCK_PREFIX "andb %1,%0" /*covered*/
: CONST_MASK_ADDR(nr, addr) /*covered*/
: "iq" ((u8)~CONST_MASK(nr)));
} else {
asm volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0" /*covered*/
: BITOP_ADDR(addr)
: "Ir" (nr));
}
}
/*
* clear_bit_unlock - Clears a bit in memory
* @nr: Bit to clear
* @addr: Address to start counting from
*
* clear_bit() is atomic and implies release semantics before the memory
* operation. It can be used for an unlock.
*/
static __always_inline void clear_bit_unlock(long nr, volatile unsigned long *addr)
{
barrier();
clear_bit(nr, addr);
}
static __always_inline void __clear_bit(long nr, volatile unsigned long *addr)
{
asm volatile(__ASM_SIZE(btr) " %1,%0" : ADDR : "Ir" (nr)); /*covered*/
}
static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr)
{
bool negative;
asm volatile(LOCK_PREFIX "andb %2,%1"
CC_SET(s)
: CC_OUT(s) (negative), ADDR
: "ir" ((char) ~(1 << nr)) : "memory");
return negative;
}
// Let everybody know we have it
#define clear_bit_unlock_is_negative_byte clear_bit_unlock_is_negative_byte
/*
* __clear_bit_unlock - Clears a bit in memory
* @nr: Bit to clear
* @addr: Address to start counting from
*
* __clear_bit() is non-atomic and implies release semantics before the memory
* operation. It can be used for an unlock if no other CPUs can concurrently
* modify other bits in the word.
*
* No memory barrier is required here, because x86 cannot reorder stores past
* older loads. Same principle as spin_unlock.
*/
static __always_inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
{
barrier(); /*covered*/
__clear_bit(nr, addr);
}
/**
* __change_bit - Toggle a bit in memory
* @nr: the bit to change
* @addr: the address to start counting from
*
* Unlike change_bit(), this function is non-atomic and may be reordered.
* If it's called on the same region of memory simultaneously, the effect
* may be that only one operation succeeds.
*/
static __always_inline void __change_bit(long nr, volatile unsigned long *addr)
{
asm volatile(__ASM_SIZE(btc) " %1,%0" : ADDR : "Ir" (nr));
}
/**
* change_bit - Toggle a bit in memory
* @nr: Bit to change
* @addr: Address to start counting from
*
* change_bit() is atomic and may not be reordered.
* Note that @nr may be almost arbitrarily large; this function is not
* restricted to acting on a single-word quantity.
*/
static __always_inline void change_bit(long nr, volatile unsigned long *addr)
{
if (IS_IMMEDIATE(nr)) {
asm volatile(LOCK_PREFIX "xorb %1,%0"
: CONST_MASK_ADDR(nr, addr)
: "iq" ((u8)CONST_MASK(nr)));
} else {
asm volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0"
: BITOP_ADDR(addr)
: "Ir" (nr));
}
}
/**
* test_and_set_bit - Set a bit and return its old value
* @nr: Bit to set
* @addr: Address to count from
*
* This operation is atomic and cannot be reordered.
* It also implies a memory barrier.
*/
static __always_inline bool test_and_set_bit(long nr, volatile unsigned long *addr)
{
return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(bts), *addr, c, "Ir", nr); /*covered*/
}
/**
* test_and_set_bit_lock - Set a bit and return its old value for lock
* @nr: Bit to set
* @addr: Address to count from
*
* This is the same as test_and_set_bit on x86.
*/
static __always_inline bool
test_and_set_bit_lock(long nr, volatile unsigned long *addr)
{
return test_and_set_bit(nr, addr); /*covered*/
}
/**
* __test_and_set_bit - Set a bit and return its old value
* @nr: Bit to set
* @addr: Address to count from
*
* This operation is non-atomic and can be reordered.
* If two examples of this operation race, one can appear to succeed
* but actually fail. You must protect multiple accesses with a lock.
*/
static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long *addr)
{
bool oldbit;
asm(__ASM_SIZE(bts) " %2,%1"
CC_SET(c)
: CC_OUT(c) (oldbit), ADDR
: "Ir" (nr));
return oldbit;
}
/**
* test_and_clear_bit - Clear a bit and return its old value
* @nr: Bit to clear
* @addr: Address to count from
*
* This operation is atomic and cannot be reordered.
* It also implies a memory barrier.
*/
static __always_inline bool test_and_clear_bit(long nr, volatile unsigned long *addr)
{
return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btr), *addr, c, "Ir", nr); /*covered*/
}
/**
* __test_and_clear_bit - Clear a bit and return its old value
* @nr: Bit to clear
* @addr: Address to count from
*
* This operation is non-atomic and can be reordered.
* If two examples of this operation race, one can appear to succeed
* but actually fail. You must protect multiple accesses with a lock.
*
* Note: the operation is performed atomically with respect to
* the local CPU, but not other CPUs. Portable code should not
* rely on this behaviour.
* KVM relies on this behaviour on x86 for modifying memory that is also
* accessed from a hypervisor on the same CPU if running in a VM: don't change
* this without also updating arch/x86/kernel/kvm.c
*/
static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long *addr)
{
bool oldbit;
asm volatile(__ASM_SIZE(btr) " %2,%1"
CC_SET(c)
: CC_OUT(c) (oldbit), ADDR
: "Ir" (nr));
return oldbit;
}
/* WARNING: non atomic and it can be reordered! */
static __always_inline bool __test_and_change_bit(long nr, volatile unsigned long *addr)
{
bool oldbit;
asm volatile(__ASM_SIZE(btc) " %2,%1"
CC_SET(c)
: CC_OUT(c) (oldbit), ADDR
: "Ir" (nr) : "memory");
return oldbit;
}
/**
* test_and_change_bit - Change a bit and return its old value
* @nr: Bit to change
* @addr: Address to count from
*
* This operation is atomic and cannot be reordered.
* It also implies a memory barrier.
*/
static __always_inline bool test_and_change_bit(long nr, volatile unsigned long *addr)
{
return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btc), *addr, c, "Ir", nr);
}
static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr)
{
return ((1UL << (nr & (BITS_PER_LONG-1))) &
(addr[nr >> _BITOPS_LONG_SHIFT])) != 0; /*covered*/
}
static __always_inline bool variable_test_bit(long nr, volatile const unsigned long *addr)
{
bool oldbit;
asm volatile(__ASM_SIZE(bt) " %2,%1" /*covered*/
CC_SET(c)
: CC_OUT(c) (oldbit)
: "m" (*(unsigned long *)addr), "Ir" (nr));
return oldbit; /*covered*/
}
#if 0 /* Fool kernel-doc since it doesn't do macros yet */
/**
* test_bit - Determine whether a bit is set
* @nr: bit number to test
* @addr: Address to start counting from
*/
static bool test_bit(int nr, const volatile unsigned long *addr);
#endif
#define test_bit(nr, addr) \
(__builtin_constant_p((nr)) \
? constant_test_bit((nr), (addr)) \
: variable_test_bit((nr), (addr)))
/**
* __ffs - find first set bit in word
* @word: The word to search
*
* Undefined if no bit exists, so code should check against 0 first.
*/
static __always_inline unsigned long __ffs(unsigned long word)
{
asm("rep; bsf %1,%0" /*covered*/
: "=r" (word)
: "rm" (word));
return word;
}
/**
* ffz - find first zero bit in word
* @word: The word to search
*
* Undefined if no zero exists, so code should check against ~0UL first.
*/
static __always_inline unsigned long ffz(unsigned long word)
{
asm("rep; bsf %1,%0"
: "=r" (word)
: "r" (~word)); /*covered*/
return word;
}
/*
* __fls: find last set bit in word
* @word: The word to search
*
* Undefined if no set bit exists, so code should check against 0 first.
*/
static __always_inline unsigned long __fls(unsigned long word)
{
asm("bsr %1,%0"
: "=r" (word)
: "rm" (word));
return word;
}
#undef ADDR
#ifdef __KERNEL__
/**
* ffs - find first set bit in word
* @x: the word to search
*
* This is defined the same way as the libc and compiler builtin ffs
* routines, therefore differs in spirit from the other bitops.
*
* ffs(value) returns 0 if value is 0 or the position of the first
* set bit if value is nonzero. The first (least significant) bit
* is at position 1.
*/
static __always_inline int ffs(int x)
{
int r;
#ifdef CONFIG_X86_64
/*
* AMD64 says BSFL won't clobber the dest reg if x==0; Intel64 says the
* dest reg is undefined if x==0, but their CPU architect says its
* value is written to set it to the same as before, except that the
* top 32 bits will be cleared.
*
* We cannot do this on 32 bits because at the very least some
* 486 CPUs did not behave this way.
*/
asm("bsfl %1,%0"
: "=r" (r)
: "rm" (x), "0" (-1));
#elif defined(CONFIG_X86_CMOV)
asm("bsfl %1,%0\n\t"
"cmovzl %2,%0"
: "=&r" (r) : "rm" (x), "r" (-1));
#else
asm("bsfl %1,%0\n\t"
"jnz 1f\n\t"
"movl $-1,%0\n"
"1:" : "=r" (r) : "rm" (x));
#endif
return r + 1;
}
/**
* fls - find last set bit in word
* @x: the word to search
*
* This is defined in a similar way as the libc and compiler builtin
* ffs, but returns the position of the most significant set bit.
*
* fls(value) returns 0 if value is 0 or the position of the last
* set bit if value is nonzero. The last (most significant) bit is
* at position 32.
*/
static __always_inline int fls(unsigned int x)
{
int r;
#ifdef CONFIG_X86_64
/*
* AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the
* dest reg is undefined if x==0, but their CPU architect says its
* value is written to set it to the same as before, except that the
* top 32 bits will be cleared.
*
* We cannot do this on 32 bits because at the very least some
* 486 CPUs did not behave this way.
*/
asm("bsrl %1,%0" /*covered*/
: "=r" (r)
: "rm" (x), "0" (-1));
#elif defined(CONFIG_X86_CMOV)
asm("bsrl %1,%0\n\t"
"cmovzl %2,%0"
: "=&r" (r) : "rm" (x), "rm" (-1));
#else
asm("bsrl %1,%0\n\t"
"jnz 1f\n\t"
"movl $-1,%0\n"
"1:" : "=r" (r) : "rm" (x));
#endif
return r + 1;
}
/**
* fls64 - find last set bit in a 64-bit word
* @x: the word to search
*
* This is defined in a similar way as the libc and compiler builtin
* ffsll, but returns the position of the most significant set bit.
*
* fls64(value) returns 0 if value is 0 or the position of the last
* set bit if value is nonzero. The last (most significant) bit is
* at position 64.
*/
#ifdef CONFIG_X86_64
static __always_inline int fls64(__u64 x)
{
int bitpos = -1;
/*
* AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
* dest reg is undefined if x==0, but their CPU architect says its
* value is written to set it to the same as before.
*/
asm("bsrq %1,%q0"
: "+r" (bitpos)
: "rm" (x));
return bitpos + 1;
}
#else
#include <asm-generic/bitops/fls64.h>
#endif
#include <asm-generic/bitops/find.h>
#include <asm-generic/bitops/sched.h>
#include <asm/arch_hweight.h>
#include <asm-generic/bitops/const_hweight.h>
#include <asm-generic/bitops/le.h>
#include <asm-generic/bitops/ext2-atomic-setbit.h>
#endif /* __KERNEL__ */
#endif /* _ASM_X86_BITOPS_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_CHECKSUM_64_H
#define _ASM_X86_CHECKSUM_64_H
/*
* Checksums for x86-64
* Copyright 2002 by Andi Kleen, SuSE Labs
* with some code from asm-x86/checksum.h
*/
#include <linux/compiler.h>
#include <linux/uaccess.h>
#include <asm/byteorder.h>
/**
* csum_fold - Fold and invert a 32bit checksum.
* sum: 32bit unfolded sum
*
* Fold a 32bit running checksum to 16bit and invert it. This is usually
* the last step before putting a checksum into a packet.
* Make sure not to mix with 64bit checksums.
*/
static inline __sum16 csum_fold(__wsum sum)
{
asm(" addl %1,%0\n"
" adcl $0xffff,%0"
: "=r" (sum)
: "r" ((__force u32)sum << 16), /*covered*/
"0" ((__force u32)sum & 0xffff0000));
return (__force __sum16)(~(__force u32)sum >> 16); /*covered*/
}
/*
* This is a version of ip_compute_csum() optimized for IP headers,
* which always checksum on 4 octet boundaries.
*
* By Jorge Cwik <jorge@laser.satlink.net>, adapted for linux by
* Arnt Gulbrandsen.
*/
/**
* ip_fast_csum - Compute the IPv4 header checksum efficiently.
* iph: ipv4 header
* ihl: length of header / 4
*/
static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
{
unsigned int sum;
asm(" movl (%1), %0\n"
" subl $4, %2\n"
" jbe 2f\n"
" addl 4(%1), %0\n"
" adcl 8(%1), %0\n"
" adcl 12(%1), %0\n"
"1: adcl 16(%1), %0\n"
" lea 4(%1), %1\n"
" decl %2\n"
" jne 1b\n"
" adcl $0, %0\n"
" movl %0, %2\n"
" shrl $16, %0\n"
" addw %w2, %w0\n"
" adcl $0, %0\n"
" notl %0\n"
"2:"
/* Since the input registers which are loaded with iph and ihl
are modified, we must also specify them as outputs, or gcc
will assume they contain their original values. */
: "=r" (sum), "=r" (iph), "=r" (ihl)
: "1" (iph), "2" (ihl)
: "memory");
return (__force __sum16)sum;
}
/**
* csum_tcpup_nofold - Compute an IPv4 pseudo header checksum.
* @saddr: source address
* @daddr: destination address
* @len: length of packet
* @proto: ip protocol of packet
* @sum: initial sum to be added in (32bit unfolded)
*
* Returns the pseudo header checksum the input data. Result is
* 32bit unfolded.
*/
static inline __wsum
csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
__u8 proto, __wsum sum)
{
asm(" addl %1, %0\n"
" adcl %2, %0\n"
" adcl %3, %0\n"
" adcl $0, %0\n"
: "=r" (sum)
: "g" (daddr), "g" (saddr),
"g" ((len + proto)<<8), "0" (sum));
return sum; /*covered*/
}
/**
* csum_tcpup_magic - Compute an IPv4 pseudo header checksum.
* @saddr: source address
* @daddr: destination address
* @len: length of packet
* @proto: ip protocol of packet
* @sum: initial sum to be added in (32bit unfolded)
*
* Returns the 16bit pseudo header checksum the input data already
* complemented and ready to be filled in.
*/
static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
__u32 len, __u8 proto,
__wsum sum)
{
return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); /*covered*/
}
/**
* csum_partial - Compute an internet checksum.
* @buff: buffer to be checksummed
* @len: length of buffer.
* @sum: initial sum to be added in (32bit unfolded)
*
* Returns the 32bit unfolded internet checksum of the buffer.
* Before filling it in it needs to be csum_fold()'ed.
* buff should be aligned to a 64bit boundary if possible.
*/
extern __wsum csum_partial(const void *buff, int len, __wsum sum);
#define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER 1
#define HAVE_CSUM_COPY_USER 1
/* Do not call this directly. Use the wrappers below */
extern __visible __wsum csum_partial_copy_generic(const void *src, const void *dst,
int len, __wsum sum,
int *src_err_ptr, int *dst_err_ptr);
extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst,
int len, __wsum isum, int *errp);
extern __wsum csum_partial_copy_to_user(const void *src, void __user *dst,
int len, __wsum isum, int *errp);
extern __wsum csum_partial_copy_nocheck(const void *src, void *dst,
int len, __wsum sum);
/* Old names. To be removed. */
#define csum_and_copy_to_user csum_partial_copy_to_user
#define csum_and_copy_from_user csum_partial_copy_from_user
/**
* ip_compute_csum - Compute an 16bit IP checksum.
* @buff: buffer address.
* @len: length of buffer.
*
* Returns the 16bit folded/inverted checksum of the passed buffer.
* Ready to fill in.
*/
extern __sum16 ip_compute_csum(const void *buff, int len);
/**
* csum_ipv6_magic - Compute checksum of an IPv6 pseudo header.
* @saddr: source address
* @daddr: destination address
* @len: length of packet
* @proto: protocol of packet
* @sum: initial sum (32bit unfolded) to be added in
*
* Computes an IPv6 pseudo header checksum. This sum is added the checksum
* into UDP/TCP packets and contains some link layer information.
* Returns the unfolded 32bit checksum.
*/
struct in6_addr;
#define _HAVE_ARCH_IPV6_CSUM 1
extern __sum16
csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr,
__u32 len, __u8 proto, __wsum sum);
static inline unsigned add32_with_carry(unsigned a, unsigned b)
{
asm("addl %2,%0\n\t" /*covered*/
"adcl $0,%0"
: "=r" (a)
: "0" (a), "rm" (b));
return a;
}
#define HAVE_ARCH_CSUM_ADD
static inline __wsum csum_add(__wsum csum, __wsum addend)
{
return (__force __wsum)add32_with_carry((__force unsigned)csum, /*covered*/
(__force unsigned)addend);
}
#endif /* _ASM_X86_CHECKSUM_64_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_COMPAT_H
#define _ASM_X86_COMPAT_H
/*
* Architecture specific compatibility types
*/
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <asm/processor.h>
#include <asm/user32.h>
#include <asm/unistd.h>
#include <asm-generic/compat.h>
#define COMPAT_USER_HZ 100
#define COMPAT_UTS_MACHINE "i686\0\0"
typedef u16 __compat_uid_t;
typedef u16 __compat_gid_t;
typedef u32 __compat_uid32_t;
typedef u32 __compat_gid32_t;
typedef u16 compat_mode_t;
typedef u16 compat_dev_t;
typedef u16 compat_nlink_t;
typedef u16 compat_ipc_pid_t;
typedef u32 compat_caddr_t;
typedef __kernel_fsid_t compat_fsid_t;
typedef s64 __attribute__((aligned(4))) compat_s64;
typedef u64 __attribute__((aligned(4))) compat_u64;
struct compat_stat {
compat_dev_t st_dev;
u16 __pad1;
compat_ino_t st_ino;
compat_mode_t st_mode;
compat_nlink_t st_nlink;
__compat_uid_t st_uid;
__compat_gid_t st_gid;
compat_dev_t st_rdev;
u16 __pad2;
u32 st_size;
u32 st_blksize;
u32 st_blocks;
u32 st_atime;
u32 st_atime_nsec;
u32 st_mtime;
u32 st_mtime_nsec;
u32 st_ctime;
u32 st_ctime_nsec;
u32 __unused4;
u32 __unused5;
};
struct compat_flock {
short l_type;
short l_whence;
compat_off_t l_start;
compat_off_t l_len;
compat_pid_t l_pid;
};
#define F_GETLK64 12 /* using 'struct flock64' */
#define F_SETLK64 13
#define F_SETLKW64 14
/*
* IA32 uses 4 byte alignment for 64 bit quantities,
* so we need to pack this structure.
*/
struct compat_flock64 {
short l_type;
short l_whence;
compat_loff_t l_start;
compat_loff_t l_len;
compat_pid_t l_pid;
} __attribute__((packed));
struct compat_statfs {
int f_type;
int f_bsize;
int f_blocks;
int f_bfree;
int f_bavail;
int f_files;
int f_ffree;
compat_fsid_t f_fsid;
int f_namelen; /* SunOS ignores this field. */
int f_frsize;
int f_flags;
int f_spare[4];
};
#define COMPAT_RLIM_INFINITY 0xffffffff
typedef u32 compat_old_sigset_t; /* at least 32 bits */
#define _COMPAT_NSIG 64
#define _COMPAT_NSIG_BPW 32
typedef u32 compat_sigset_word;
#define COMPAT_OFF_T_MAX 0x7fffffff
struct compat_ipc64_perm {
compat_key_t key;
__compat_uid32_t uid;
__compat_gid32_t gid;
__compat_uid32_t cuid;
__compat_gid32_t cgid;
unsigned short mode;
unsigned short __pad1;
unsigned short seq;
unsigned short __pad2;
compat_ulong_t unused1;
compat_ulong_t unused2;
};
struct compat_semid64_ds {
struct compat_ipc64_perm sem_perm;
compat_ulong_t sem_otime;
compat_ulong_t sem_otime_high;
compat_ulong_t sem_ctime;
compat_ulong_t sem_ctime_high;
compat_ulong_t sem_nsems;
compat_ulong_t __unused3;
compat_ulong_t __unused4;
};
struct compat_msqid64_ds {
struct compat_ipc64_perm msg_perm;
compat_ulong_t msg_stime;
compat_ulong_t msg_stime_high;
compat_ulong_t msg_rtime;
compat_ulong_t msg_rtime_high;
compat_ulong_t msg_ctime;
compat_ulong_t msg_ctime_high;
compat_ulong_t msg_cbytes;
compat_ulong_t msg_qnum;
compat_ulong_t msg_qbytes;
compat_pid_t msg_lspid;
compat_pid_t msg_lrpid;
compat_ulong_t __unused4;
compat_ulong_t __unused5;
};
struct compat_shmid64_ds {
struct compat_ipc64_perm shm_perm;
compat_size_t shm_segsz;
compat_ulong_t shm_atime;
compat_ulong_t shm_atime_high;
compat_ulong_t shm_dtime;
compat_ulong_t shm_dtime_high;
compat_ulong_t shm_ctime;
compat_ulong_t shm_ctime_high;
compat_pid_t shm_cpid;
compat_pid_t shm_lpid;
compat_ulong_t shm_nattch;
compat_ulong_t __unused4;
compat_ulong_t __unused5;
};
/*
* The type of struct elf_prstatus.pr_reg in compatible core dumps.
*/
typedef struct user_regs_struct compat_elf_gregset_t;
/* Full regset -- prstatus on x32, otherwise on ia32 */
#define PRSTATUS_SIZE(S, R) (R != sizeof(S.pr_reg) ? 144 : 296)
#define SET_PR_FPVALID(S, V, R) \
do { *(int *) (((void *) &((S)->pr_reg)) + R) = (V); } \
while (0)
#ifdef CONFIG_X86_X32_ABI
#define COMPAT_USE_64BIT_TIME \
(!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT))
#endif
/*
* A pointer passed in from user mode. This should not
* be used for syscall parameters, just declare them
* as pointers because the syscall entry code will have
* appropriately converted them already.
*/
static inline void __user *compat_ptr(compat_uptr_t uptr)
{
return (void __user *)(unsigned long)uptr;
}
static inline compat_uptr_t ptr_to_compat(void __user *uptr)
{
return (u32)(unsigned long)uptr;
}
static inline void __user *arch_compat_alloc_user_space(long len)
{
compat_uptr_t sp;
if (test_thread_flag(TIF_IA32)) {
sp = task_pt_regs(current)->sp;
} else {
/* -128 for the x32 ABI redzone */
sp = task_pt_regs(current)->sp - 128;
}
return (void __user *)round_down(sp - len, 16);
}
static inline bool in_x32_syscall(void)
{
#ifdef CONFIG_X86_X32_ABI
if (task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT) /*covered*/
return true;
#endif
return false;
}
static inline bool in_32bit_syscall(void)
{
return in_ia32_syscall() || in_x32_syscall(); /*covered*/
}
#ifdef CONFIG_COMPAT
static inline bool in_compat_syscall(void)
{
return in_32bit_syscall(); /*covered*/
}
#define in_compat_syscall in_compat_syscall /* override the generic impl */
#endif
struct compat_siginfo;
int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
const kernel_siginfo_t *from, bool x32_ABI);
#endif /* _ASM_X86_COMPAT_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_CPUFEATURE_H
#define _ASM_X86_CPUFEATURE_H
#include <asm/processor.h>
#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
#include <asm/asm.h>
#include <linux/bitops.h>
enum cpuid_leafs
{
CPUID_1_EDX = 0,
CPUID_8000_0001_EDX,
CPUID_8086_0001_EDX,
CPUID_LNX_1,
CPUID_1_ECX,
CPUID_C000_0001_EDX,
CPUID_8000_0001_ECX,
CPUID_LNX_2,
CPUID_LNX_3,
CPUID_7_0_EBX,
CPUID_D_1_EAX,
CPUID_F_0_EDX,
CPUID_F_1_EDX,
CPUID_8000_0008_EBX,
CPUID_6_EAX,
CPUID_8000_000A_EDX,
CPUID_7_ECX,
CPUID_8000_0007_EBX,
CPUID_7_EDX,
};
#ifdef CONFIG_X86_FEATURE_NAMES
extern const char * const x86_cap_flags[NCAPINTS*32];
extern const char * const x86_power_flags[32];
#define X86_CAP_FMT "%s"
#define x86_cap_flag(flag) x86_cap_flags[flag]
#else
#define X86_CAP_FMT "%d:%d"
#define x86_cap_flag(flag) ((flag) >> 5), ((flag) & 31)
#endif
/*
* In order to save room, we index into this array by doing
* X86_BUG_<name> - NCAPINTS*32.
*/
extern const char * const x86_bug_flags[NBUGINTS*32];
#define test_cpu_cap(c, bit) \
test_bit(bit, (unsigned long *)((c)->x86_capability))
/*
* There are 32 bits/features in each mask word. The high bits
* (selected with (bit>>5) give us the word number and the low 5
* bits give us the bit/feature number inside the word.
* (1UL<<((bit)&31) gives us a mask for the feature_bit so we can
* see if it is set in the mask word.
*/
#define CHECK_BIT_IN_MASK_WORD(maskname, word, bit) \
(((bit)>>5)==(word) && (1UL<<((bit)&31) & maskname##word ))
#define REQUIRED_MASK_BIT_SET(feature_bit) \
( CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 0, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 1, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 2, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 3, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 4, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 5, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 6, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 7, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 8, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 9, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 10, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 11, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 12, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 13, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 14, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 15, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \
REQUIRED_MASK_CHECK || \
BUILD_BUG_ON_ZERO(NCAPINTS != 19))
#define DISABLED_MASK_BIT_SET(feature_bit) \
( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 1, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 2, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 3, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 4, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 5, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 6, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 7, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 8, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 9, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 10, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 11, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 12, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 13, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 14, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 15, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \
DISABLED_MASK_CHECK || \
BUILD_BUG_ON_ZERO(NCAPINTS != 19))
#define cpu_has(c, bit) \
(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
test_cpu_cap(c, bit))
#define this_cpu_has(bit) \
(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
x86_this_cpu_test_bit(bit, (unsigned long *)&cpu_info.x86_capability))
/*
* This macro is for detection of features which need kernel
* infrastructure to be used. It may *not* directly test the CPU
* itself. Use the cpu_has() family if you want true runtime
* testing of CPU features, like in hypervisor code where you are
* supporting a possible guest feature where host support for it
* is not relevant.
*/
#define cpu_feature_enabled(bit) \
(__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : static_cpu_has(bit))
#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit)
#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability))
extern void setup_clear_cpu_cap(unsigned int bit);
extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
#define setup_force_cpu_cap(bit) do { \
set_cpu_cap(&boot_cpu_data, bit); \
set_bit(bit, (unsigned long *)cpu_caps_set); \
} while (0)
#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)
#if defined(__clang__) && !defined(CONFIG_CC_HAS_ASM_GOTO)
/*
* Workaround for the sake of BPF compilation which utilizes kernel
* headers, but clang does not support ASM GOTO and fails the build.
*/
#ifndef __BPF_TRACING__
#warning "Compiler lacks ASM_GOTO support. Add -D __BPF_TRACING__ to your compiler arguments"
#endif
#define static_cpu_has(bit) boot_cpu_has(bit)
#else
/*
* Static testing of CPU features. Used the same as boot_cpu_has().
* These will statically patch the target code for additional
* performance.
*/
static __always_inline __pure bool _static_cpu_has(u16 bit)
{
asm_volatile_goto("1: jmp 6f\n" /*covered*/
"2:\n"
".skip -(((5f-4f) - (2b-1b)) > 0) * "
"((5f-4f) - (2b-1b)),0x90\n"
"3:\n"
".section .altinstructions,\"a\"\n"
" .long 1b - .\n" /* src offset */
" .long 4f - .\n" /* repl offset */
" .word %P[always]\n" /* always replace */
" .byte 3b - 1b\n" /* src len */
" .byte 5f - 4f\n" /* repl len */
" .byte 3b - 2b\n" /* pad len */
".previous\n"
".section .altinstr_replacement,\"ax\"\n"
"4: jmp %l[t_no]\n"
"5:\n"
".previous\n"
".section .altinstructions,\"a\"\n"
" .long 1b - .\n" /* src offset */
" .long 0\n" /* no replacement */
" .word %P[feature]\n" /* feature bit */
" .byte 3b - 1b\n" /* src len */
" .byte 0\n" /* repl len */
" .byte 0\n" /* pad len */
".previous\n"
".section .altinstr_aux,\"ax\"\n"
"6:\n"
" testb %[bitnum],%[cap_byte]\n"
" jnz %l[t_yes]\n"
" jmp %l[t_no]\n"
".previous\n"
: : [feature] "i" (bit),
[always] "i" (X86_FEATURE_ALWAYS),
[bitnum] "i" (1 << (bit & 7)),
[cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3])
: : t_yes, t_no);
t_yes:
return true;
t_no:
return false;
}
#define static_cpu_has(bit) \
( \
__builtin_constant_p(boot_cpu_has(bit)) ? \
boot_cpu_has(bit) : \
_static_cpu_has(bit) \
)
#endif
#define cpu_has_bug(c, bit) cpu_has(c, (bit))
#define set_cpu_bug(c, bit) set_cpu_cap(c, (bit))
#define clear_cpu_bug(c, bit) clear_cpu_cap(c, (bit))
#define static_cpu_has_bug(bit) static_cpu_has((bit))
#define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit))
#define boot_cpu_set_bug(bit) set_cpu_cap(&boot_cpu_data, (bit))
#define MAX_CPU_FEATURES (NCAPINTS * 32)
#define cpu_have_feature boot_cpu_has
#define CPU_FEATURE_TYPEFMT "x86,ven%04Xfam%04Xmod%04X"
#define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \
boot_cpu_data.x86_model
#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
#endif /* _ASM_X86_CPUFEATURE_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_CURRENT_H
#define _ASM_X86_CURRENT_H
#include <linux/compiler.h>
#include <asm/percpu.h>
#ifndef __ASSEMBLY__
struct task_struct;
DECLARE_PER_CPU(struct task_struct *, current_task);
static __always_inline struct task_struct *get_current(void)
{
return this_cpu_read_stable(current_task); /*covered*/
}
#define current get_current()
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_CURRENT_H */
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 1994 Linus Torvalds
*
* Pentium III FXSR, SSE support
* General FPU state handling cleanups
* Gareth Hughes <gareth@valinux.com>, May 2000
* x86-64 work by Andi Kleen 2002
*/
#ifndef _ASM_X86_FPU_INTERNAL_H
#define _ASM_X86_FPU_INTERNAL_H
#include <linux/compat.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <asm/user.h>
#include <asm/fpu/api.h>
#include <asm/fpu/xstate.h>
#include <asm/cpufeature.h>
#include <asm/trace/fpu.h>
/*
* High level FPU state handling functions:
*/
extern void fpu__initialize(struct fpu *fpu);
extern void fpu__prepare_read(struct fpu *fpu);
extern void fpu__prepare_write(struct fpu *fpu);
extern void fpu__save(struct fpu *fpu);
extern void fpu__restore(struct fpu *fpu);
extern int fpu__restore_sig(void __user *buf, int ia32_frame);
extern void fpu__drop(struct fpu *fpu);
extern int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu);
extern void fpu__clear(struct fpu *fpu);
extern int fpu__exception_code(struct fpu *fpu, int trap_nr);
extern int dump_fpu(struct pt_regs *ptregs, struct user_i387_struct *fpstate);
/*
* Boot time FPU initialization functions:
*/
extern void fpu__init_cpu(void);
extern void fpu__init_system_xstate(void);
extern void fpu__init_cpu_xstate(void);
extern void fpu__init_system(struct cpuinfo_x86 *c);
extern void fpu__init_check_bugs(void);
extern void fpu__resume_cpu(void);
extern u64 fpu__get_supported_xfeatures_mask(void);
/*
* Debugging facility:
*/
#ifdef CONFIG_X86_DEBUG_FPU
# define WARN_ON_FPU(x) WARN_ON_ONCE(x)
#else
# define WARN_ON_FPU(x) ({ (void)(x); 0; })
#endif
/*
* FPU related CPU feature flag helper routines:
*/
static __always_inline __pure bool use_xsaveopt(void)
{
return static_cpu_has(X86_FEATURE_XSAVEOPT);
}
static __always_inline __pure bool use_xsave(void)
{
return static_cpu_has(X86_FEATURE_XSAVE); /*covered*/
}
static __always_inline __pure bool use_fxsr(void)
{
return static_cpu_has(X86_FEATURE_FXSR);
}
/*
* fpstate handling functions:
*/
extern union fpregs_state init_fpstate;
extern void fpstate_init(union fpregs_state *state);
#ifdef CONFIG_MATH_EMULATION
extern void fpstate_init_soft(struct swregs_state *soft);
#else
static inline void fpstate_init_soft(struct swregs_state *soft) {}
#endif
static inline void fpstate_init_xstate(struct xregs_state *xsave)
{
/*
* XRSTORS requires these bits set in xcomp_bv, or it will
* trigger #GP:
*/
xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask;
}
static inline void fpstate_init_fxstate(struct fxregs_state *fx)
{
fx->cwd = 0x37f;
fx->mxcsr = MXCSR_DEFAULT;
}
extern void fpstate_sanitize_xstate(struct fpu *fpu);
#define user_insn(insn, output, input...) \
({ \
int err; \
\
might_fault(); \
\
asm volatile(ASM_STAC "\n" \
"1:" #insn "\n\t" \
"2: " ASM_CLAC "\n" \
".section .fixup,\"ax\"\n" \
"3: movl $-1,%[err]\n" \
" jmp 2b\n" \
".previous\n" \
_ASM_EXTABLE(1b, 3b) \
: [err] "=r" (err), output \
: "0"(0), input); \
err; \
})
#define kernel_insn(insn, output, input...) \
asm volatile("1:" #insn "\n\t" \
"2:\n" \
_ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_fprestore) \
: output : input)
static inline int copy_fregs_to_user(struct fregs_state __user *fx)
{
return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx));
}
static inline int copy_fxregs_to_user(struct fxregs_state __user *fx)
{
if (IS_ENABLED(CONFIG_X86_32))
return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx));
else if (IS_ENABLED(CONFIG_AS_FXSAVEQ))
return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx));
/* See comment in copy_fxregs_to_kernel() below. */
return user_insn(rex64/fxsave (%[fx]), "=m" (*fx), [fx] "R" (fx));
}
static inline void copy_kernel_to_fxregs(struct fxregs_state *fx)
{
if (IS_ENABLED(CONFIG_X86_32)) {
kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
} else {
if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) {
kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
} else {
/* See comment in copy_fxregs_to_kernel() below. */
kernel_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), "m" (*fx));
}
}
}
static inline int copy_user_to_fxregs(struct fxregs_state __user *fx)
{
if (IS_ENABLED(CONFIG_X86_32))
return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
else if (IS_ENABLED(CONFIG_AS_FXSAVEQ))
return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
/* See comment in copy_fxregs_to_kernel() below. */
return user_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx),
"m" (*fx));
}
static inline void copy_kernel_to_fregs(struct fregs_state *fx)
{
kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}
static inline int copy_user_to_fregs(struct fregs_state __user *fx)
{
return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}
static inline void copy_fxregs_to_kernel(struct fpu *fpu)
{
if (IS_ENABLED(CONFIG_X86_32))
asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave));
else if (IS_ENABLED(CONFIG_AS_FXSAVEQ))
asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave));
else {
/* Using "rex64; fxsave %0" is broken because, if the memory
* operand uses any extended registers for addressing, a second
* REX prefix will be generated (to the assembler, rex64
* followed by semicolon is a separate instruction), and hence
* the 64-bitness is lost.
*
* Using "fxsaveq %0" would be the ideal choice, but is only
* supported starting with gas 2.16.
*
* Using, as a workaround, the properly prefixed form below
* isn't accepted by any binutils version so far released,
* complaining that the same type of prefix is used twice if
* an extended register is needed for addressing (fix submitted
* to mainline 2005-11-21).
*
* asm volatile("rex64/fxsave %0" : "=m" (fpu->state.fxsave));
*
* This, however, we can work around by forcing the compiler to
* select an addressing mode that doesn't require extended
* registers.
*/
asm volatile( "rex64/fxsave (%[fx])"
: "=m" (fpu->state.fxsave)
: [fx] "R" (&fpu->state.fxsave));
}
}
/* These macros all use (%edi)/(%rdi) as the single memory argument. */
#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27"
#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37"
#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f"
#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f"
#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f"
#define XSTATE_OP(op, st, lmask, hmask, err) \
asm volatile("1:" op "\n\t" \
"xor %[err], %[err]\n" \
"2:\n\t" \
".pushsection .fixup,\"ax\"\n\t" \
"3: movl $-2,%[err]\n\t" \
"jmp 2b\n\t" \
".popsection\n\t" \
_ASM_EXTABLE(1b, 3b) \
: [err] "=r" (err) \
: "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
: "memory")
/*
* If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact
* format and supervisor states in addition to modified optimization in
* XSAVEOPT.
*
* Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT
* supports modified optimization which is not supported by XSAVE.
*
* We use XSAVE as a fallback.
*
* The 661 label is defined in the ALTERNATIVE* macros as the address of the
* original instruction which gets replaced. We need to use it here as the
* address of the instruction where we might get an exception at.
*/
#define XSTATE_XSAVE(st, lmask, hmask, err) \
asm volatile(ALTERNATIVE_2(XSAVE, \
XSAVEOPT, X86_FEATURE_XSAVEOPT, \
XSAVES, X86_FEATURE_XSAVES) \
"\n" \
"xor %[err], %[err]\n" \
"3:\n" \
".pushsection .fixup,\"ax\"\n" \
"4: movl $-2, %[err]\n" \
"jmp 3b\n" \
".popsection\n" \
_ASM_EXTABLE(661b, 4b) \
: [err] "=r" (err) \
: "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
: "memory")
/*
* Use XRSTORS to restore context if it is enabled. XRSTORS supports compact
* XSAVE area format.
*/
#define XSTATE_XRESTORE(st, lmask, hmask) \
asm volatile(ALTERNATIVE(XRSTOR, \
XRSTORS, X86_FEATURE_XSAVES) \
"\n" \
"3:\n" \
_ASM_EXTABLE_HANDLE(661b, 3b, ex_handler_fprestore)\
: \
: "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
: "memory")
/*
* This function is called only during boot time when x86 caps are not set
* up and alternative can not be used yet.
*/
static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate)
{
u64 mask = -1;
u32 lmask = mask;
u32 hmask = mask >> 32;
int err;
WARN_ON(system_state != SYSTEM_BOOTING);
if (static_cpu_has(X86_FEATURE_XSAVES))
XSTATE_OP(XSAVES, xstate, lmask, hmask, err);
else
XSTATE_OP(XSAVE, xstate, lmask, hmask, err);
/* We should never fault when copying to a kernel buffer: */
WARN_ON_FPU(err);
}
/*
* This function is called only during boot time when x86 caps are not set
* up and alternative can not be used yet.
*/
static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate)
{
u64 mask = -1;
u32 lmask = mask;
u32 hmask = mask >> 32;
int err;
WARN_ON(system_state != SYSTEM_BOOTING);
if (static_cpu_has(X86_FEATURE_XSAVES))
XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
else
XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
/*
* We should never fault when copying from a kernel buffer, and the FPU
* state we set at boot time should be valid.
*/
WARN_ON_FPU(err);
}
/*
* Save processor xstate to xsave area.
*/
static inline void copy_xregs_to_kernel(struct xregs_state *xstate)
{
u64 mask = -1;
u32 lmask = mask;
u32 hmask = mask >> 32;
int err;
WARN_ON_FPU(!alternatives_patched);
XSTATE_XSAVE(xstate, lmask, hmask, err); /*covered*/
/* We should never fault when copying to a kernel buffer: */
WARN_ON_FPU(err);
}
/*
* Restore processor xstate from xsave area.
*/
static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask)
{
u32 lmask = mask;
u32 hmask = mask >> 32;
XSTATE_XRESTORE(xstate, lmask, hmask); /*covered*/
}
/*
* Save xstate to user space xsave area.
*
* We don't use modified optimization because xrstor/xrstors might track
* a different application.
*
* We don't use compacted format xsave area for
* backward compatibility for old applications which don't understand
* compacted format of xsave area.
*/
static inline int copy_xregs_to_user(struct xregs_state __user *buf)
{
int err;
/*
* Clear the xsave header first, so that reserved fields are
* initialized to zero.
*/
err = __clear_user(&buf->header, sizeof(buf->header));
if (unlikely(err))
return -EFAULT;
stac();
XSTATE_OP(XSAVE, buf, -1, -1, err);
clac();
return err;
}
/*
* Restore xstate from user space xsave area.
*/
static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask)
{
struct xregs_state *xstate = ((__force struct xregs_state *)buf);
u32 lmask = mask;
u32 hmask = mask >> 32;
int err;
stac();
XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
clac();
return err;
}
/*
* These must be called with preempt disabled. Returns
* 'true' if the FPU state is still intact and we can
* keep registers active.
*
* The legacy FNSAVE instruction cleared all FPU state
* unconditionally, so registers are essentially destroyed.
* Modern FPU state can be kept in registers, if there are
* no pending FP exceptions.
*/
static inline int copy_fpregs_to_fpstate(struct fpu *fpu)
{
if (likely(use_xsave())) { /*covered*/
copy_xregs_to_kernel(&fpu->state.xsave); /*covered*/
return 1;
}
if (likely(use_fxsr())) {
copy_fxregs_to_kernel(fpu);
return 1;
}
/*
* Legacy FPU register saving, FNSAVE always clears FPU registers,
* so we have to mark them inactive:
*/
asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave));
return 0;
}
static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask)
{
if (use_xsave()) { /*covered*/
copy_kernel_to_xregs(&fpstate->xsave, mask); /*covered*/
} else {
if (use_fxsr())
copy_kernel_to_fxregs(&fpstate->fxsave);
else
copy_kernel_to_fregs(&fpstate->fsave);
}
}
static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate)
{
/*
* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is
* pending. Clear the x87 state here by setting it to fixed values.
* "m" is a random variable that should be in L1.
*/
if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) {
asm volatile(
"fnclex\n\t"
"emms\n\t"
"fildl %P[addr]" /* set F?P to defined value */
: : [addr] "m" (fpstate));
}
__copy_kernel_to_fpregs(fpstate, -1); /*covered*/
}
extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size);
/*
* FPU context switch related helper methods:
*/
DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
/*
* The in-register FPU state for an FPU context on a CPU is assumed to be
* valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx
* matches the FPU.
*
* If the FPU register state is valid, the kernel can skip restoring the
* FPU state from memory.
*
* Any code that clobbers the FPU registers or updates the in-memory
* FPU state for a task MUST let the rest of the kernel know that the
* FPU registers are no longer valid for this task.
*
* Either one of these invalidation functions is enough. Invalidate
* a resource you control: CPU if using the CPU for something else
* (with preemption disabled), FPU for the current task, or a task that
* is prevented from running by the current task.
*/
static inline void __cpu_invalidate_fpregs_state(void)
{
__this_cpu_write(fpu_fpregs_owner_ctx, NULL);
}
static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu)
{
fpu->last_cpu = -1;
}
static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
{
return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu;
}
/*
* These generally need preemption protection to work,
* do try to avoid using these on their own:
*/
static inline void fpregs_deactivate(struct fpu *fpu)
{
this_cpu_write(fpu_fpregs_owner_ctx, NULL);
trace_x86_fpu_regs_deactivated(fpu);
}
static inline void fpregs_activate(struct fpu *fpu)
{
this_cpu_write(fpu_fpregs_owner_ctx, fpu);
trace_x86_fpu_regs_activated(fpu);
}
/*
* FPU state switching for scheduling.
*
* This is a two-stage process:
*
* - switch_fpu_prepare() saves the old state.
* This is done within the context of the old process.
*
* - switch_fpu_finish() restores the new state as
* necessary.
*/
static inline void
switch_fpu_prepare(struct fpu *old_fpu, int cpu)
{
if (static_cpu_has(X86_FEATURE_FPU) && old_fpu->initialized) {
if (!copy_fpregs_to_fpstate(old_fpu))
old_fpu->last_cpu = -1;
else
old_fpu->last_cpu = cpu;
/* But leave fpu_fpregs_owner_ctx! */
trace_x86_fpu_regs_deactivated(old_fpu);
} else
old_fpu->last_cpu = -1;
}
/*
* Misc helper functions:
*/
/*
* Set up the userspace FPU context for the new task, if the task
* has used the FPU.
*/
static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu)
{
bool preload = static_cpu_has(X86_FEATURE_FPU) &&
new_fpu->initialized;
if (preload) {
if (!fpregs_state_valid(new_fpu, cpu))
copy_kernel_to_fpregs(&new_fpu->state);
fpregs_activate(new_fpu);
}
}
/*
* Needs to be preemption-safe.
*
* NOTE! user_fpu_begin() must be used only immediately before restoring
* the save state. It does not do any saving/restoring on its own. In
* lazy FPU mode, it is just an optimization to avoid a #NM exception,
* the task can lose the FPU right after preempt_enable().
*/
static inline void user_fpu_begin(void)
{
struct fpu *fpu = ¤t->thread.fpu;
preempt_disable();
fpregs_activate(fpu);
preempt_enable();
}
/*
* MXCSR and XCR definitions:
*/
extern unsigned int mxcsr_feature_mask;
#define XCR_XFEATURE_ENABLED_MASK 0x00000000
static inline u64 xgetbv(u32 index)
{
u32 eax, edx;
asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */
: "=a" (eax), "=d" (edx)
: "c" (index));
return eax + ((u64)edx << 32);
}
static inline void xsetbv(u32 index, u64 value)
{
u32 eax = value;
u32 edx = value >> 32;
asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */
: : "a" (eax), "d" (edx), "c" (index));
}
#endif /* _ASM_X86_FPU_INTERNAL_H */
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Per-cpu current frame pointer - the location of the last exception frame on
* the stack, stored in the per-cpu area.
*
* Jeremy Fitzhardinge <jeremy@goop.org>
*/
#ifndef _ASM_X86_IRQ_REGS_H
#define _ASM_X86_IRQ_REGS_H
#include <asm/percpu.h>
#define ARCH_HAS_OWN_IRQ_REGS
DECLARE_PER_CPU(struct pt_regs *, irq_regs);
static inline struct pt_regs *get_irq_regs(void)
{
return this_cpu_read(irq_regs); /*covered*/
}
static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
{
struct pt_regs *old_regs;
old_regs = get_irq_regs();
this_cpu_write(irq_regs, new_regs);
return old_regs;
}
#endif /* _ASM_X86_IRQ_REGS_32_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _X86_IRQFLAGS_H_
#define _X86_IRQFLAGS_H_
#include <asm/processor-flags.h>
#ifndef __ASSEMBLY__
/* Provide __cpuidle; we can't safely include <linux/cpu.h> */
#define __cpuidle __attribute__((__section__(".cpuidle.text")))
/*
* Interrupt control:
*/
/* Declaration required for gcc < 4.9 to prevent -Werror=missing-prototypes */
extern inline unsigned long native_save_fl(void);
extern inline unsigned long native_save_fl(void)
{
unsigned long flags;
/*
* "=rm" is safe here, because "pop" adjusts the stack before
* it evaluates its effective address -- this is part of the
* documented behavior of the "pop" instruction.
*/
asm volatile("# __raw_save_flags\n\t"
"pushf ; pop %0"
: "=rm" (flags)
: /* no input */
: "memory");
return flags;
}
extern inline void native_restore_fl(unsigned long flags);
extern inline void native_restore_fl(unsigned long flags)
{
asm volatile("push %0 ; popf"
: /* no output */
:"g" (flags)
:"memory", "cc");
}
static inline void native_irq_disable(void)
{
asm volatile("cli": : :"memory");
}
static inline void native_irq_enable(void)
{
asm volatile("sti": : :"memory");
}
static inline __cpuidle void native_safe_halt(void)
{
asm volatile("sti; hlt": : :"memory");
}
static inline __cpuidle void native_halt(void)
{
asm volatile("hlt": : :"memory");
}
#endif
#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
#ifndef __ASSEMBLY__
#include <linux/types.h>
static inline notrace unsigned long arch_local_save_flags(void)
{
return native_save_fl();
}
static inline notrace void arch_local_irq_restore(unsigned long flags)
{
native_restore_fl(flags);
}
static inline notrace void arch_local_irq_disable(void)
{
native_irq_disable();
}
static inline notrace void arch_local_irq_enable(void)
{
native_irq_enable();
}
/*
* Used in the idle loop; sti takes one instruction cycle
* to complete:
*/
static inline __cpuidle void arch_safe_halt(void)
{
native_safe_halt();
}
/*
* Used when interrupts are already enabled or to
* shutdown the processor:
*/
static inline __cpuidle void halt(void)
{
native_halt();
}
/*
* For spinlocks, etc:
*/
static inline notrace unsigned long arch_local_irq_save(void)
{
unsigned long flags = arch_local_save_flags();
arch_local_irq_disable();
return flags;
}
#else
#define ENABLE_INTERRUPTS(x) sti
#define DISABLE_INTERRUPTS(x) cli
#ifdef CONFIG_X86_64
#ifdef CONFIG_DEBUG_ENTRY
#define SAVE_FLAGS(x) pushfq; popq %rax
#endif
#define SWAPGS swapgs
/*
* Currently paravirt can't handle swapgs nicely when we
* don't have a stack we can rely on (such as a user space
* stack). So we either find a way around these or just fault
* and emulate if a guest tries to call swapgs directly.
*
* Either way, this is a good way to document that we don't
* have a reliable stack. x86_64 only.
*/
#define SWAPGS_UNSAFE_STACK swapgs
#define INTERRUPT_RETURN jmp native_iret
#define USERGS_SYSRET64 \
swapgs; \
sysretq;
#define USERGS_SYSRET32 \
swapgs; \
sysretl
#else
#define INTERRUPT_RETURN iret
#endif
#endif /* __ASSEMBLY__ */
#endif /* CONFIG_PARAVIRT_XXL */
#ifndef __ASSEMBLY__
static inline int arch_irqs_disabled_flags(unsigned long flags)
{
return !(flags & X86_EFLAGS_IF); /*covered*/
}
static inline int arch_irqs_disabled(void)
{
unsigned long flags = arch_local_save_flags();
return arch_irqs_disabled_flags(flags);
}
#endif /* !__ASSEMBLY__ */
#ifdef __ASSEMBLY__
#ifdef CONFIG_TRACE_IRQFLAGS
# define TRACE_IRQS_ON call trace_hardirqs_on_thunk;
# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk;
#else
# define TRACE_IRQS_ON
# define TRACE_IRQS_OFF
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# ifdef CONFIG_X86_64
# define LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
# define LOCKDEP_SYS_EXIT_IRQ \
TRACE_IRQS_ON; \
sti; \
call lockdep_sys_exit_thunk; \
cli; \
TRACE_IRQS_OFF;
# else
# define LOCKDEP_SYS_EXIT \
pushl %eax; \
pushl %ecx; \
pushl %edx; \
call lockdep_sys_exit; \
popl %edx; \
popl %ecx; \
popl %eax;
# define LOCKDEP_SYS_EXIT_IRQ
# endif
#else
# define LOCKDEP_SYS_EXIT
# define LOCKDEP_SYS_EXIT_IRQ
#endif
#endif /* __ASSEMBLY__ */
#endif
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_JUMP_LABEL_H
#define _ASM_X86_JUMP_LABEL_H
#define JUMP_LABEL_NOP_SIZE 5
#ifdef CONFIG_X86_64
# define STATIC_KEY_INIT_NOP P6_NOP5_ATOMIC
#else
# define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC
#endif
#include <asm/asm.h>
#include <asm/nops.h>
#ifndef __ASSEMBLY__
#include <linux/stringify.h>
#include <linux/types.h>
static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
{
asm_volatile_goto("1:" /*covered*/
".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t"
".pushsection __jump_table, \"aw\" \n\t"
_ASM_ALIGN "\n\t"
".long 1b - ., %l[l_yes] - . \n\t"
_ASM_PTR "%c0 + %c1 - .\n\t"
".popsection \n\t"
: : "i" (key), "i" (branch) : : l_yes);
return false;
l_yes:
return true;
}
static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch)
{
asm_volatile_goto("1:" /*covered*/
".byte 0xe9\n\t .long %l[l_yes] - 2f\n\t"
"2:\n\t"
".pushsection __jump_table, \"aw\" \n\t"
_ASM_ALIGN "\n\t"
".long 1b - ., %l[l_yes] - . \n\t"
_ASM_PTR "%c0 + %c1 - .\n\t"
".popsection \n\t"
: : "i" (key), "i" (branch) : : l_yes);
return false;
l_yes:
return true;
}
#else /* __ASSEMBLY__ */
.macro STATIC_JUMP_IF_TRUE target, key, def
.Lstatic_jump_\@:
.if \def
/* Equivalent to "jmp.d32 \target" */
.byte 0xe9
.long \target - .Lstatic_jump_after_\@
.Lstatic_jump_after_\@:
.else
.byte STATIC_KEY_INIT_NOP
.endif
.pushsection __jump_table, "aw"
_ASM_ALIGN
.long .Lstatic_jump_\@ - ., \target - .
_ASM_PTR \key - .
.popsection
.endm
.macro STATIC_JUMP_IF_FALSE target, key, def
.Lstatic_jump_\@:
.if \def
.byte STATIC_KEY_INIT_NOP
.else
/* Equivalent to "jmp.d32 \target" */
.byte 0xe9
.long \target - .Lstatic_jump_after_\@
.Lstatic_jump_after_\@:
.endif
.pushsection __jump_table, "aw"
_ASM_ALIGN
.long .Lstatic_jump_\@ - ., \target - .
_ASM_PTR \key + 1 - .
.popsection
.endm
#endif /* __ASSEMBLY__ */
#endif
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_LOCAL_H
#define _ASM_X86_LOCAL_H
#include <linux/percpu.h>
#include <linux/atomic.h>
#include <asm/asm.h>
typedef struct {
atomic_long_t a;
} local_t;
#define LOCAL_INIT(i) { ATOMIC_LONG_INIT(i) }
#define local_read(l) atomic_long_read(&(l)->a)
#define local_set(l, i) atomic_long_set(&(l)->a, (i))
static inline void local_inc(local_t *l)
{
asm volatile(_ASM_INC "%0"
: "+m" (l->a.counter));
}
static inline void local_dec(local_t *l)
{
asm volatile(_ASM_DEC "%0"
: "+m" (l->a.counter));
}
static inline void local_add(long i, local_t *l)
{
asm volatile(_ASM_ADD "%1,%0" /*covered*/
: "+m" (l->a.counter)
: "ir" (i));
}
static inline void local_sub(long i, local_t *l)
{
asm volatile(_ASM_SUB "%1,%0"
: "+m" (l->a.counter)
: "ir" (i));
}
/**
* local_sub_and_test - subtract value from variable and test result
* @i: integer value to subtract
* @l: pointer to type local_t
*
* Atomically subtracts @i from @l and returns
* true if the result is zero, or false for all
* other cases.
*/
static inline bool local_sub_and_test(long i, local_t *l)
{
return GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, e, "er", i);
}
/**
* local_dec_and_test - decrement and test
* @l: pointer to type local_t
*
* Atomically decrements @l by 1 and
* returns true if the result is 0, or false for all other
* cases.
*/
static inline bool local_dec_and_test(local_t *l)
{
return GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, e);
}
/**
* local_inc_and_test - increment and test
* @l: pointer to type local_t
*
* Atomically increments @l by 1
* and returns true if the result is zero, or false for all
* other cases.
*/
static inline bool local_inc_and_test(local_t *l)
{
return GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, e);
}
/**
* local_add_negative - add and test if negative
* @i: integer value to add
* @l: pointer to type local_t
*
* Atomically adds @i to @l and returns true
* if the result is negative, or false when
* result is greater than or equal to zero.
*/
static inline bool local_add_negative(long i, local_t *l)
{
return GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, s, "er", i);
}
/**
* local_add_return - add and return
* @i: integer value to add
* @l: pointer to type local_t
*
* Atomically adds @i to @l and returns @i + @l
*/
static inline long local_add_return(long i, local_t *l)
{
long __i = i;
asm volatile(_ASM_XADD "%0, %1;"
: "+r" (i), "+m" (l->a.counter)
: : "memory");
return i + __i;
}
static inline long local_sub_return(long i, local_t *l)
{
return local_add_return(-i, l);
}
#define local_inc_return(l) (local_add_return(1, l))
#define local_dec_return(l) (local_sub_return(1, l))
#define local_cmpxchg(l, o, n) \
(cmpxchg_local(&((l)->a.counter), (o), (n)))
/* Always has a lock prefix */
#define local_xchg(l, n) (xchg(&((l)->a.counter), (n)))
/**
* local_add_unless - add unless the number is a given value
* @l: pointer of type local_t
* @a: the amount to add to l...
* @u: ...unless l is equal to u.
*
* Atomically adds @a to @l, so long as it was not @u.
* Returns non-zero if @l was not @u, and zero otherwise.
*/
#define local_add_unless(l, a, u) \
({ \
long c, old; \
c = local_read((l)); \
for (;;) { \
if (unlikely(c == (u))) \
break; \
old = local_cmpxchg((l), c, c + (a)); \
if (likely(old == c)) \
break; \
c = old; \
} \
c != (u); \
})
#define local_inc_not_zero(l) local_add_unless((l), 1, 0)
/* On x86_32, these are no better than the atomic variants.
* On x86-64 these are better than the atomic variants on SMP kernels
* because they dont use a lock prefix.
*/
#define __local_inc(l) local_inc(l)
#define __local_dec(l) local_dec(l)
#define __local_add(i, l) local_add((i), (l))
#define __local_sub(i, l) local_sub((i), (l))
#endif /* _ASM_X86_LOCAL_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_MMU_CONTEXT_H
#define _ASM_X86_MMU_CONTEXT_H
#include <asm/desc.h>
#include <linux/atomic.h>
#include <linux/mm_types.h>
#include <linux/pkeys.h>
#include <trace/events/tlb.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/paravirt.h>
#include <asm/mpx.h>
extern atomic64_t last_mm_ctx_id;
#ifndef CONFIG_PARAVIRT_XXL
static inline void paravirt_activate_mm(struct mm_struct *prev,
struct mm_struct *next)
{
}
#endif /* !CONFIG_PARAVIRT_XXL */
#ifdef CONFIG_PERF_EVENTS
DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key);
static inline void load_mm_cr4(struct mm_struct *mm)
{
if (static_branch_unlikely(&rdpmc_always_available_key) ||
atomic_read(&mm->context.perf_rdpmc_allowed))
cr4_set_bits(X86_CR4_PCE);
else
cr4_clear_bits(X86_CR4_PCE);
}
#else
static inline void load_mm_cr4(struct mm_struct *mm) {}
#endif
#ifdef CONFIG_MODIFY_LDT_SYSCALL
/*
* ldt_structs can be allocated, used, and freed, but they are never
* modified while live.
*/
struct ldt_struct {
/*
* Xen requires page-aligned LDTs with special permissions. This is
* needed to prevent us from installing evil descriptors such as
* call gates. On native, we could merge the ldt_struct and LDT
* allocations, but it's not worth trying to optimize.
*/
struct desc_struct *entries;
unsigned int nr_entries;
/*
* If PTI is in use, then the entries array is not mapped while we're
* in user mode. The whole array will be aliased at the addressed
* given by ldt_slot_va(slot). We use two slots so that we can allocate
* and map, and enable a new LDT without invalidating the mapping
* of an older, still-in-use LDT.
*
* slot will be -1 if this LDT doesn't have an alias mapping.
*/
int slot;
};
/* This is a multiple of PAGE_SIZE. */
#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
static inline void *ldt_slot_va(int slot)
{
return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
}
/*
* Used for LDT copy/destruction.
*/
static inline void init_new_context_ldt(struct mm_struct *mm)
{
mm->context.ldt = NULL;
init_rwsem(&mm->context.ldt_usr_sem);
}
int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
void destroy_context_ldt(struct mm_struct *mm);
void ldt_arch_exit_mmap(struct mm_struct *mm);
#else /* CONFIG_MODIFY_LDT_SYSCALL */
static inline void init_new_context_ldt(struct mm_struct *mm) { }
static inline int ldt_dup_context(struct mm_struct *oldmm,
struct mm_struct *mm)
{
return 0;
}
static inline void destroy_context_ldt(struct mm_struct *mm) { }
static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
#endif
static inline void load_mm_ldt(struct mm_struct *mm)
{
#ifdef CONFIG_MODIFY_LDT_SYSCALL
struct ldt_struct *ldt;
/* READ_ONCE synchronizes with smp_store_release */
ldt = READ_ONCE(mm->context.ldt);
/*
* Any change to mm->context.ldt is followed by an IPI to all
* CPUs with the mm active. The LDT will not be freed until
* after the IPI is handled by all such CPUs. This means that,
* if the ldt_struct changes before we return, the values we see
* will be safe, and the new values will be loaded before we run
* any user code.
*
* NB: don't try to convert this to use RCU without extreme care.
* We would still need IRQs off, because we don't want to change
* the local LDT after an IPI loaded a newer value than the one
* that we can see.
*/
if (unlikely(ldt)) {
if (static_cpu_has(X86_FEATURE_PTI)) {
if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
/*
* Whoops -- either the new LDT isn't mapped
* (if slot == -1) or is mapped into a bogus
* slot (if slot > 1).
*/
clear_LDT();
return;
}
/*
* If page table isolation is enabled, ldt->entries
* will not be mapped in the userspace pagetables.
* Tell the CPU to access the LDT through the alias
* at ldt_slot_va(ldt->slot).
*/
set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
} else {
set_ldt(ldt->entries, ldt->nr_entries);
}
} else {
clear_LDT();
}
#else
clear_LDT();
#endif
}
static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
{
#ifdef CONFIG_MODIFY_LDT_SYSCALL
/*
* Load the LDT if either the old or new mm had an LDT.
*
* An mm will never go from having an LDT to not having an LDT. Two
* mms never share an LDT, so we don't gain anything by checking to
* see whether the LDT changed. There's also no guarantee that
* prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL,
* then prev->context.ldt will also be non-NULL.
*
* If we really cared, we could optimize the case where prev == next
* and we're exiting lazy mode. Most of the time, if this happens,
* we don't actually need to reload LDTR, but modify_ldt() is mostly
* used by legacy code and emulators where we don't need this level of
* performance.
*
* This uses | instead of || because it generates better code.
*/
if (unlikely((unsigned long)prev->context.ldt |
(unsigned long)next->context.ldt))
load_mm_ldt(next);
#endif
DEBUG_LOCKS_WARN_ON(preemptible());
}
void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
/*
* Init a new mm. Used on mm copies, like at fork()
* and on mm's that are brand-new, like at execve().
*/
static inline int init_new_context(struct task_struct *tsk,
struct mm_struct *mm)
{
mutex_init(&mm->context.lock);
mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
atomic64_set(&mm->context.tlb_gen, 0);
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
/* pkey 0 is the default and allocated implicitly */
mm->context.pkey_allocation_map = 0x1;
/* -1 means unallocated or invalid */
mm->context.execute_only_pkey = -1;
}
#endif
init_new_context_ldt(mm);
return 0;
}
static inline void destroy_context(struct mm_struct *mm)
{
destroy_context_ldt(mm);
}
extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk);
extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk);
#define switch_mm_irqs_off switch_mm_irqs_off
#define activate_mm(prev, next) \
do { \
paravirt_activate_mm((prev), (next)); \
switch_mm((prev), (next), NULL); \
} while (0);
#ifdef CONFIG_X86_32
#define deactivate_mm(tsk, mm) \
do { \
lazy_load_gs(0); \
} while (0)
#else
#define deactivate_mm(tsk, mm) \
do { \
load_gs_index(0); \
loadsegment(fs, 0); \
} while (0)
#endif
static inline void arch_dup_pkeys(struct mm_struct *oldmm,
struct mm_struct *mm)
{
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
return;
/* Duplicate the oldmm pkey state in mm: */
mm->context.pkey_allocation_map = oldmm->context.pkey_allocation_map;
mm->context.execute_only_pkey = oldmm->context.execute_only_pkey;
#endif
}
static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
arch_dup_pkeys(oldmm, mm);
paravirt_arch_dup_mmap(oldmm, mm);
return ldt_dup_context(oldmm, mm);
}
static inline void arch_exit_mmap(struct mm_struct *mm)
{
paravirt_arch_exit_mmap(mm);
ldt_arch_exit_mmap(mm);
}
#ifdef CONFIG_X86_64
static inline bool is_64bit_mm(struct mm_struct *mm)
{
return !IS_ENABLED(CONFIG_IA32_EMULATION) ||
!(mm->context.ia32_compat == TIF_IA32);
}
#else
static inline bool is_64bit_mm(struct mm_struct *mm)
{
return false;
}
#endif
static inline void arch_bprm_mm_init(struct mm_struct *mm,
struct vm_area_struct *vma)
{
mpx_mm_init(mm);
}
static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
/*
* mpx_notify_unmap() goes and reads a rarely-hot
* cacheline in the mm_struct. That can be expensive
* enough to be seen in profiles.
*
* The mpx_notify_unmap() call and its contents have been
* observed to affect munmap() performance on hardware
* where MPX is not present.
*
* The unlikely() optimizes for the fast case: no MPX
* in the CPU, or no MPX use in the process. Even if
* we get this wrong (in the unlikely event that MPX
* is widely enabled on some system) the overhead of
* MPX itself (reading bounds tables) is expected to
* overwhelm the overhead of getting this unlikely()
* consistently wrong.
*/
if (unlikely(cpu_feature_enabled(X86_FEATURE_MPX)))
mpx_notify_unmap(mm, vma, start, end);
}
/*
* We only want to enforce protection keys on the current process
* because we effectively have no access to PKRU for other
* processes or any way to tell *which * PKRU in a threaded
* process we could use.
*
* So do not enforce things if the VMA is not from the current
* mm, or if we are in a kernel thread.
*/
static inline bool vma_is_foreign(struct vm_area_struct *vma)
{
if (!current->mm) /*covered*/
return true;
/*
* Should PKRU be enforced on the access to this VMA? If
* the VMA is from another process, then PKRU has no
* relevance and should not be enforced.
*/
if (current->mm != vma->vm_mm) /*covered*/
return true;
return false;
}
static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, /*covered*/
bool write, bool execute, bool foreign)
{
/* pkeys never affect instruction fetches */
if (execute) /*covered*/
return true;
/* allow access if the VMA is not one from this process */
if (foreign || vma_is_foreign(vma)) /*covered*/
return true;
return __pkru_allows_pkey(vma_pkey(vma), write); /*covered*/
}
/*
* This can be used from process context to figure out what the value of
* CR3 is without needing to do a (slow) __read_cr3().
*
* It's intended to be used for code like KVM that sneakily changes CR3
* and needs to restore it. It needs to be used very carefully.
*/
static inline unsigned long __get_current_cr3_fast(void)
{
unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
this_cpu_read(cpu_tlbstate.loaded_mm_asid));
/* For now, be very restrictive about when this can be called. */
VM_WARN_ON(in_nmi() || preemptible());
VM_BUG_ON(cr3 != __read_cr3());
return cr3;
}
#endif /* _ASM_X86_MMU_CONTEXT_H */
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM msr
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE msr-trace
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH asm/
#if !defined(_TRACE_MSR_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_MSR_H
#include <linux/tracepoint.h>
/*
* Tracing for x86 model specific registers. Directly maps to the
* RDMSR/WRMSR instructions.
*/
DECLARE_EVENT_CLASS(msr_trace_class,
TP_PROTO(unsigned msr, u64 val, int failed),
TP_ARGS(msr, val, failed),
TP_STRUCT__entry(
__field( unsigned, msr )
__field( u64, val )
__field( int, failed )
),
TP_fast_assign(
__entry->msr = msr;
__entry->val = val;
__entry->failed = failed;
),
TP_printk("%x, value %llx%s",
__entry->msr,
__entry->val,
__entry->failed ? " #GP" : "")
);
DEFINE_EVENT(msr_trace_class, read_msr,
TP_PROTO(unsigned msr, u64 val, int failed),
TP_ARGS(msr, val, failed)
);
DEFINE_EVENT(msr_trace_class, write_msr, /*covered*/
TP_PROTO(unsigned msr, u64 val, int failed),
TP_ARGS(msr, val, failed)
);
DEFINE_EVENT(msr_trace_class, rdpmc,
TP_PROTO(unsigned msr, u64 val, int failed),
TP_ARGS(msr, val, failed)
);
#endif /* _TRACE_MSR_H */
/* This part must be outside protection */
#include <trace/define_trace.h>
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM msr
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE msr-trace
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH asm/
#if !defined(_TRACE_MSR_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_MSR_H
#include <linux/tracepoint.h>
/*
* Tracing for x86 model specific registers. Directly maps to the
* RDMSR/WRMSR instructions.
*/
DECLARE_EVENT_CLASS(msr_trace_class, /*covered*/
TP_PROTO(unsigned msr, u64 val, int failed),
TP_ARGS(msr, val, failed),
TP_STRUCT__entry(
__field( unsigned, msr )
__field( u64, val )
__field( int, failed )
),
TP_fast_assign(
__entry->msr = msr;
__entry->val = val;
__entry->failed = failed;
),
TP_printk("%x, value %llx%s",
__entry->msr,
__entry->val,
__entry->failed ? " #GP" : "")
);
DEFINE_EVENT(msr_trace_class, read_msr,
TP_PROTO(unsigned msr, u64 val, int failed),
TP_ARGS(msr, val, failed)
);
DEFINE_EVENT(msr_trace_class, write_msr,
TP_PROTO(unsigned msr, u64 val, int failed),
TP_ARGS(msr, val, failed)
);
DEFINE_EVENT(msr_trace_class, rdpmc,
TP_PROTO(unsigned msr, u64 val, int failed),
TP_ARGS(msr, val, failed)
);
#endif /* _TRACE_MSR_H */
/* This part must be outside protection */
#include <trace/define_trace.h>
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_MSR_H
#define _ASM_X86_MSR_H
#include "msr-index.h"
#ifndef __ASSEMBLY__
#include <asm/asm.h>
#include <asm/errno.h>
#include <asm/cpumask.h>
#include <uapi/asm/msr.h>
struct msr {
union {
struct {
u32 l;
u32 h;
};
u64 q;
};
};
struct msr_info {
u32 msr_no;
struct msr reg;
struct msr *msrs;
int err;
};
struct msr_regs_info {
u32 *regs;
int err;
};
struct saved_msr {
bool valid;
struct msr_info info;
};
struct saved_msrs {
unsigned int num;
struct saved_msr *array;
};
/*
* both i386 and x86_64 returns 64-bit value in edx:eax, but gcc's "A"
* constraint has different meanings. For i386, "A" means exactly
* edx:eax, while for x86_64 it doesn't mean rdx:rax or edx:eax. Instead,
* it means rax *or* rdx.
*/
#ifdef CONFIG_X86_64
/* Using 64-bit values saves one instruction clearing the high half of low */
#define DECLARE_ARGS(val, low, high) unsigned long low, high
#define EAX_EDX_VAL(val, low, high) ((low) | (high) << 32)
#define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high)
#else
#define DECLARE_ARGS(val, low, high) unsigned long long val
#define EAX_EDX_VAL(val, low, high) (val)
#define EAX_EDX_RET(val, low, high) "=A" (val)
#endif
#ifdef CONFIG_TRACEPOINTS
/*
* Be very careful with includes. This header is prone to include loops.
*/
#include <asm/atomic.h>
#include <linux/tracepoint-defs.h>
extern struct tracepoint __tracepoint_read_msr;
extern struct tracepoint __tracepoint_write_msr;
extern struct tracepoint __tracepoint_rdpmc;
#define msr_tracepoint_active(t) static_key_false(&(t).key)
extern void do_trace_write_msr(unsigned int msr, u64 val, int failed);
extern void do_trace_read_msr(unsigned int msr, u64 val, int failed);
extern void do_trace_rdpmc(unsigned int msr, u64 val, int failed);
#else
#define msr_tracepoint_active(t) false
static inline void do_trace_write_msr(unsigned int msr, u64 val, int failed) {}
static inline void do_trace_read_msr(unsigned int msr, u64 val, int failed) {}
static inline void do_trace_rdpmc(unsigned int msr, u64 val, int failed) {}
#endif
/*
* __rdmsr() and __wrmsr() are the two primitives which are the bare minimum MSR
* accessors and should not have any tracing or other functionality piggybacking
* on them - those are *purely* for accessing MSRs and nothing more. So don't even
* think of extending them - you will be slapped with a stinking trout or a frozen
* shark will reach you, wherever you are! You've been warned.
*/
static inline unsigned long long notrace __rdmsr(unsigned int msr)
{
DECLARE_ARGS(val, low, high);
asm volatile("1: rdmsr\n"
"2:\n"
_ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_unsafe)
: EAX_EDX_RET(val, low, high) : "c" (msr));
return EAX_EDX_VAL(val, low, high);
}
static inline void notrace __wrmsr(unsigned int msr, u32 low, u32 high)
{
asm volatile("1: wrmsr\n"
"2:\n"
_ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_unsafe)
: : "c" (msr), "a"(low), "d" (high) : "memory");
}
#define native_rdmsr(msr, val1, val2) \
do { \
u64 __val = __rdmsr((msr)); \
(void)((val1) = (u32)__val); \
(void)((val2) = (u32)(__val >> 32)); \
} while (0)
#define native_wrmsr(msr, low, high) \
__wrmsr(msr, low, high)
#define native_wrmsrl(msr, val) \
__wrmsr((msr), (u32)((u64)(val)), \
(u32)((u64)(val) >> 32))
static inline unsigned long long native_read_msr(unsigned int msr)
{
unsigned long long val;
val = __rdmsr(msr);
if (msr_tracepoint_active(__tracepoint_read_msr))
do_trace_read_msr(msr, val, 0);
return val;
}
static inline unsigned long long native_read_msr_safe(unsigned int msr,
int *err)
{
DECLARE_ARGS(val, low, high);
asm volatile("2: rdmsr ; xor %[err],%[err]\n"
"1:\n\t"
".section .fixup,\"ax\"\n\t"
"3: mov %[fault],%[err]\n\t"
"xorl %%eax, %%eax\n\t"
"xorl %%edx, %%edx\n\t"
"jmp 1b\n\t"
".previous\n\t"
_ASM_EXTABLE(2b, 3b)
: [err] "=r" (*err), EAX_EDX_RET(val, low, high)
: "c" (msr), [fault] "i" (-EIO));
if (msr_tracepoint_active(__tracepoint_read_msr))
do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), *err);
return EAX_EDX_VAL(val, low, high);
}
/* Can be uninlined because referenced by paravirt */
static inline void notrace
native_write_msr(unsigned int msr, u32 low, u32 high)
{
__wrmsr(msr, low, high);
if (msr_tracepoint_active(__tracepoint_write_msr))
do_trace_write_msr(msr, ((u64)high << 32 | low), 0);
}
/* Can be uninlined because referenced by paravirt */
static inline int notrace
native_write_msr_safe(unsigned int msr, u32 low, u32 high)
{
int err;
asm volatile("2: wrmsr ; xor %[err],%[err]\n"
"1:\n\t"
".section .fixup,\"ax\"\n\t"
"3: mov %[fault],%[err] ; jmp 1b\n\t"
".previous\n\t"
_ASM_EXTABLE(2b, 3b)
: [err] "=a" (err)
: "c" (msr), "0" (low), "d" (high),
[fault] "i" (-EIO)
: "memory");
if (msr_tracepoint_active(__tracepoint_write_msr))
do_trace_write_msr(msr, ((u64)high << 32 | low), err);
return err;
}
extern int rdmsr_safe_regs(u32 regs[8]);
extern int wrmsr_safe_regs(u32 regs[8]);
/**
* rdtsc() - returns the current TSC without ordering constraints
*
* rdtsc() returns the result of RDTSC as a 64-bit integer. The
* only ordering constraint it supplies is the ordering implied by
* "asm volatile": it will put the RDTSC in the place you expect. The
* CPU can and will speculatively execute that RDTSC, though, so the
* results can be non-monotonic if compared on different CPUs.
*/
static __always_inline unsigned long long rdtsc(void)
{
DECLARE_ARGS(val, low, high);
asm volatile("rdtsc" : EAX_EDX_RET(val, low, high)); /*covered*/
return EAX_EDX_VAL(val, low, high);
}
/**
* rdtsc_ordered() - read the current TSC in program order
*
* rdtsc_ordered() returns the result of RDTSC as a 64-bit integer.
* It is ordered like a load to a global in-memory counter. It should
* be impossible to observe non-monotonic rdtsc_unordered() behavior
* across multiple CPUs as long as the TSC is synced.
*/
static __always_inline unsigned long long rdtsc_ordered(void)
{
/*
* The RDTSC instruction is not ordered relative to memory
* access. The Intel SDM and the AMD APM are both vague on this
* point, but empirically an RDTSC instruction can be
* speculatively executed before prior loads. An RDTSC
* immediately after an appropriate barrier appears to be
* ordered as a normal load, that is, it provides the same
* ordering guarantees as reading from a global memory location
* that some other imaginary CPU is updating continuously with a
* time stamp.
*/
barrier_nospec(); /*covered*/
return rdtsc();
}
static inline unsigned long long native_read_pmc(int counter)
{
DECLARE_ARGS(val, low, high);
asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter));
if (msr_tracepoint_active(__tracepoint_rdpmc))
do_trace_rdpmc(counter, EAX_EDX_VAL(val, low, high), 0);
return EAX_EDX_VAL(val, low, high);
}
#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
#include <linux/errno.h>
/*
* Access to machine-specific registers (available on 586 and better only)
* Note: the rd* operations modify the parameters directly (without using
* pointer indirection), this allows gcc to optimize better
*/
#define rdmsr(msr, low, high) \
do { \
u64 __val = native_read_msr((msr)); \
(void)((low) = (u32)__val); \
(void)((high) = (u32)(__val >> 32)); \
} while (0)
static inline void wrmsr(unsigned int msr, u32 low, u32 high)
{
native_write_msr(msr, low, high);
}
#define rdmsrl(msr, val) \
((val) = native_read_msr((msr)))
static inline void wrmsrl(unsigned int msr, u64 val)
{
native_write_msr(msr, (u32)(val & 0xffffffffULL), (u32)(val >> 32));
}
/* wrmsr with exception handling */
static inline int wrmsr_safe(unsigned int msr, u32 low, u32 high)
{
return native_write_msr_safe(msr, low, high);
}
/* rdmsr with exception handling */
#define rdmsr_safe(msr, low, high) \
({ \
int __err; \
u64 __val = native_read_msr_safe((msr), &__err); \
(*low) = (u32)__val; \
(*high) = (u32)(__val >> 32); \
__err; \
})
static inline int rdmsrl_safe(unsigned int msr, unsigned long long *p)
{
int err;
*p = native_read_msr_safe(msr, &err);
return err;
}
#define rdpmc(counter, low, high) \
do { \
u64 _l = native_read_pmc((counter)); \
(low) = (u32)_l; \
(high) = (u32)(_l >> 32); \
} while (0)
#define rdpmcl(counter, val) ((val) = native_read_pmc(counter))
#endif /* !CONFIG_PARAVIRT_XXL */
/*
* 64-bit version of wrmsr_safe():
*/
static inline int wrmsrl_safe(u32 msr, u64 val)
{
return wrmsr_safe(msr, (u32)val, (u32)(val >> 32));
}
#define write_tsc(low, high) wrmsr(MSR_IA32_TSC, (low), (high))
#define write_rdtscp_aux(val) wrmsr(MSR_TSC_AUX, (val), 0)
struct msr *msrs_alloc(void);
void msrs_free(struct msr *msrs);
int msr_set_bit(u32 msr, u8 bit);
int msr_clear_bit(u32 msr, u8 bit);
#ifdef CONFIG_SMP
int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q);
int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q);
void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs);
void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs);
int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q);
int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q);
int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]);
int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]);
#else /* CONFIG_SMP */
static inline int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
{
rdmsr(msr_no, *l, *h);
return 0;
}
static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
{
wrmsr(msr_no, l, h);
return 0;
}
static inline int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q)
{
rdmsrl(msr_no, *q);
return 0;
}
static inline int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q)
{
wrmsrl(msr_no, q);
return 0;
}
static inline void rdmsr_on_cpus(const struct cpumask *m, u32 msr_no,
struct msr *msrs)
{
rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h));
}
static inline void wrmsr_on_cpus(const struct cpumask *m, u32 msr_no,
struct msr *msrs)
{
wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h);
}
static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no,
u32 *l, u32 *h)
{
return rdmsr_safe(msr_no, l, h);
}
static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
{
return wrmsr_safe(msr_no, l, h);
}
static inline int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q)
{
return rdmsrl_safe(msr_no, q);
}
static inline int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q)
{
return wrmsrl_safe(msr_no, q);
}
static inline int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8])
{
return rdmsr_safe_regs(regs);
}
static inline int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8])
{
return wrmsr_safe_regs(regs);
}
#endif /* CONFIG_SMP */
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_MSR_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PARAVIRT_H
#define _ASM_X86_PARAVIRT_H
/* Various instructions on x86 need to be replaced for
* para-virtualization: those hooks are defined here. */
#ifdef CONFIG_PARAVIRT
#include <asm/pgtable_types.h>
#include <asm/asm.h>
#include <asm/nospec-branch.h>
#include <asm/paravirt_types.h>
#ifndef __ASSEMBLY__
#include <linux/bug.h>
#include <linux/types.h>
#include <linux/cpumask.h>
#include <asm/frame.h>
static inline unsigned long long paravirt_sched_clock(void)
{
return PVOP_CALL0(unsigned long long, time.sched_clock);
}
struct static_key;
extern struct static_key paravirt_steal_enabled;
extern struct static_key paravirt_steal_rq_enabled;
__visible void __native_queued_spin_unlock(struct qspinlock *lock);
bool pv_is_native_spin_unlock(void);
__visible bool __native_vcpu_is_preempted(long cpu);
bool pv_is_native_vcpu_is_preempted(void);
static inline u64 paravirt_steal_clock(int cpu)
{
return PVOP_CALL1(u64, time.steal_clock, cpu);
}
/* The paravirtualized I/O functions */
static inline void slow_down_io(void)
{
pv_ops.cpu.io_delay();
#ifdef REALLY_SLOW_IO
pv_ops.cpu.io_delay();
pv_ops.cpu.io_delay();
pv_ops.cpu.io_delay();
#endif
}
static inline void __flush_tlb(void)
{
PVOP_VCALL0(mmu.flush_tlb_user);
}
static inline void __flush_tlb_global(void)
{
PVOP_VCALL0(mmu.flush_tlb_kernel);
}
static inline void __flush_tlb_one_user(unsigned long addr)
{
PVOP_VCALL1(mmu.flush_tlb_one_user, addr); /*covered*/
}
static inline void flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
PVOP_VCALL2(mmu.flush_tlb_others, cpumask, info);
}
static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
{
PVOP_VCALL2(mmu.tlb_remove_table, tlb, table);
}
static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
{
PVOP_VCALL1(mmu.exit_mmap, mm);
}
#ifdef CONFIG_PARAVIRT_XXL
static inline void load_sp0(unsigned long sp0)
{
PVOP_VCALL1(cpu.load_sp0, sp0);
}
/* The paravirtualized CPUID instruction. */
static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
unsigned int *ecx, unsigned int *edx)
{
PVOP_VCALL4(cpu.cpuid, eax, ebx, ecx, edx);
}
/*
* These special macros can be used to get or set a debugging register
*/
static inline unsigned long paravirt_get_debugreg(int reg)
{
return PVOP_CALL1(unsigned long, cpu.get_debugreg, reg);
}
#define get_debugreg(var, reg) var = paravirt_get_debugreg(reg)
static inline void set_debugreg(unsigned long val, int reg)
{
PVOP_VCALL2(cpu.set_debugreg, reg, val);
}
static inline unsigned long read_cr0(void)
{
return PVOP_CALL0(unsigned long, cpu.read_cr0);
}
static inline void write_cr0(unsigned long x)
{
PVOP_VCALL1(cpu.write_cr0, x);
}
static inline unsigned long read_cr2(void)
{
return PVOP_CALL0(unsigned long, mmu.read_cr2); /*covered*/
}
static inline void write_cr2(unsigned long x)
{
PVOP_VCALL1(mmu.write_cr2, x);
}
static inline unsigned long __read_cr3(void)
{
return PVOP_CALL0(unsigned long, mmu.read_cr3); /*covered*/
}
static inline void write_cr3(unsigned long x)
{
PVOP_VCALL1(mmu.write_cr3, x);
}
static inline void __write_cr4(unsigned long x)
{
PVOP_VCALL1(cpu.write_cr4, x);
}
#ifdef CONFIG_X86_64
static inline unsigned long read_cr8(void)
{
return PVOP_CALL0(unsigned long, cpu.read_cr8);
}
static inline void write_cr8(unsigned long x)
{
PVOP_VCALL1(cpu.write_cr8, x);
}
#endif
static inline void arch_safe_halt(void)
{
PVOP_VCALL0(irq.safe_halt);
}
static inline void halt(void)
{
PVOP_VCALL0(irq.halt);
}
static inline void wbinvd(void)
{
PVOP_VCALL0(cpu.wbinvd);
}
#define get_kernel_rpl() (pv_info.kernel_rpl)
static inline u64 paravirt_read_msr(unsigned msr)
{
return PVOP_CALL1(u64, cpu.read_msr, msr); /*covered*/
}
static inline void paravirt_write_msr(unsigned msr,
unsigned low, unsigned high)
{
PVOP_VCALL3(cpu.write_msr, msr, low, high);
}
static inline u64 paravirt_read_msr_safe(unsigned msr, int *err)
{
return PVOP_CALL2(u64, cpu.read_msr_safe, msr, err);
}
static inline int paravirt_write_msr_safe(unsigned msr,
unsigned low, unsigned high)
{
return PVOP_CALL3(int, cpu.write_msr_safe, msr, low, high);
}
#define rdmsr(msr, val1, val2) \
do { \
u64 _l = paravirt_read_msr(msr); \
val1 = (u32)_l; \
val2 = _l >> 32; \
} while (0)
#define wrmsr(msr, val1, val2) \
do { \
paravirt_write_msr(msr, val1, val2); \
} while (0)
#define rdmsrl(msr, val) \
do { \
val = paravirt_read_msr(msr); \
} while (0)
static inline void wrmsrl(unsigned msr, u64 val)
{
wrmsr(msr, (u32)val, (u32)(val>>32));
}
#define wrmsr_safe(msr, a, b) paravirt_write_msr_safe(msr, a, b)
/* rdmsr with exception handling */
#define rdmsr_safe(msr, a, b) \
({ \
int _err; \
u64 _l = paravirt_read_msr_safe(msr, &_err); \
(*a) = (u32)_l; \
(*b) = _l >> 32; \
_err; \
})
static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
{
int err;
*p = paravirt_read_msr_safe(msr, &err);
return err;
}
static inline unsigned long long paravirt_read_pmc(int counter)
{
return PVOP_CALL1(u64, cpu.read_pmc, counter);
}
#define rdpmc(counter, low, high) \
do { \
u64 _l = paravirt_read_pmc(counter); \
low = (u32)_l; \
high = _l >> 32; \
} while (0)
#define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter))
static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
{
PVOP_VCALL2(cpu.alloc_ldt, ldt, entries);
}
static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
{
PVOP_VCALL2(cpu.free_ldt, ldt, entries);
}
static inline void load_TR_desc(void)
{
PVOP_VCALL0(cpu.load_tr_desc);
}
static inline void load_gdt(const struct desc_ptr *dtr)
{
PVOP_VCALL1(cpu.load_gdt, dtr);
}
static inline void load_idt(const struct desc_ptr *dtr)
{
PVOP_VCALL1(cpu.load_idt, dtr);
}
static inline void set_ldt(const void *addr, unsigned entries)
{
PVOP_VCALL2(cpu.set_ldt, addr, entries);
}
static inline unsigned long paravirt_store_tr(void)
{
return PVOP_CALL0(unsigned long, cpu.store_tr);
}
#define store_tr(tr) ((tr) = paravirt_store_tr())
static inline void load_TLS(struct thread_struct *t, unsigned cpu)
{
PVOP_VCALL2(cpu.load_tls, t, cpu);
}
#ifdef CONFIG_X86_64
static inline void load_gs_index(unsigned int gs)
{
PVOP_VCALL1(cpu.load_gs_index, gs);
}
#endif
static inline void write_ldt_entry(struct desc_struct *dt, int entry,
const void *desc)
{
PVOP_VCALL3(cpu.write_ldt_entry, dt, entry, desc);
}
static inline void write_gdt_entry(struct desc_struct *dt, int entry,
void *desc, int type)
{
PVOP_VCALL4(cpu.write_gdt_entry, dt, entry, desc, type);
}
static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
{
PVOP_VCALL3(cpu.write_idt_entry, dt, entry, g);
}
static inline void set_iopl_mask(unsigned mask)
{
PVOP_VCALL1(cpu.set_iopl_mask, mask);
}
static inline void paravirt_activate_mm(struct mm_struct *prev,
struct mm_struct *next)
{
PVOP_VCALL2(mmu.activate_mm, prev, next);
}
static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm,
struct mm_struct *mm)
{
PVOP_VCALL2(mmu.dup_mmap, oldmm, mm);
}
static inline int paravirt_pgd_alloc(struct mm_struct *mm)
{
return PVOP_CALL1(int, mmu.pgd_alloc, mm);
}
static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
PVOP_VCALL2(mmu.pgd_free, mm, pgd); /*covered*/
}
static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn)
{
PVOP_VCALL2(mmu.alloc_pte, mm, pfn); /*covered*/
}
static inline void paravirt_release_pte(unsigned long pfn)
{
PVOP_VCALL1(mmu.release_pte, pfn);
}
static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
{
PVOP_VCALL2(mmu.alloc_pmd, mm, pfn);
}
static inline void paravirt_release_pmd(unsigned long pfn)
{
PVOP_VCALL1(mmu.release_pmd, pfn);
}
static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn)
{
PVOP_VCALL2(mmu.alloc_pud, mm, pfn);
}
static inline void paravirt_release_pud(unsigned long pfn)
{
PVOP_VCALL1(mmu.release_pud, pfn);
}
static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn)
{
PVOP_VCALL2(mmu.alloc_p4d, mm, pfn);
}
static inline void paravirt_release_p4d(unsigned long pfn)
{
PVOP_VCALL1(mmu.release_p4d, pfn);
}
static inline pte_t __pte(pteval_t val)
{
pteval_t ret;
if (sizeof(pteval_t) > sizeof(long))
ret = PVOP_CALLEE2(pteval_t, mmu.make_pte, val, (u64)val >> 32);
else
ret = PVOP_CALLEE1(pteval_t, mmu.make_pte, val); /*covered*/
return (pte_t) { .pte = ret };
}
static inline pteval_t pte_val(pte_t pte)
{
pteval_t ret;
if (sizeof(pteval_t) > sizeof(long))
ret = PVOP_CALLEE2(pteval_t, mmu.pte_val,
pte.pte, (u64)pte.pte >> 32);
else
ret = PVOP_CALLEE1(pteval_t, mmu.pte_val, pte.pte); /*covered*/
return ret;
}
static inline pgd_t __pgd(pgdval_t val)
{
pgdval_t ret;
if (sizeof(pgdval_t) > sizeof(long))
ret = PVOP_CALLEE2(pgdval_t, mmu.make_pgd, val, (u64)val >> 32);
else
ret = PVOP_CALLEE1(pgdval_t, mmu.make_pgd, val);
return (pgd_t) { ret };
}
static inline pgdval_t pgd_val(pgd_t pgd)
{
pgdval_t ret;
if (sizeof(pgdval_t) > sizeof(long))
ret = PVOP_CALLEE2(pgdval_t, mmu.pgd_val,
pgd.pgd, (u64)pgd.pgd >> 32);
else
ret = PVOP_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd); /*covered*/
return ret;
}
#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
pteval_t ret;
ret = PVOP_CALL3(pteval_t, mmu.ptep_modify_prot_start, mm, addr, ptep);
return (pte_t) { .pte = ret };
}
static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
if (sizeof(pteval_t) > sizeof(long))
/* 5 arg words */
pv_ops.mmu.ptep_modify_prot_commit(mm, addr, ptep, pte);
else
PVOP_VCALL4(mmu.ptep_modify_prot_commit,
mm, addr, ptep, pte.pte);
}
static inline void set_pte(pte_t *ptep, pte_t pte)
{
if (sizeof(pteval_t) > sizeof(long))
PVOP_VCALL3(mmu.set_pte, ptep, pte.pte, (u64)pte.pte >> 32);
else
PVOP_VCALL2(mmu.set_pte, ptep, pte.pte); /*covered*/
}
static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
if (sizeof(pteval_t) > sizeof(long))
/* 5 arg words */
pv_ops.mmu.set_pte_at(mm, addr, ptep, pte);
else
PVOP_VCALL4(mmu.set_pte_at, mm, addr, ptep, pte.pte); /*covered*/
}
static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
{
pmdval_t val = native_pmd_val(pmd); /*covered*/
if (sizeof(pmdval_t) > sizeof(long))
PVOP_VCALL3(mmu.set_pmd, pmdp, val, (u64)val >> 32);
else
PVOP_VCALL2(mmu.set_pmd, pmdp, val); /*covered*/
}
#if CONFIG_PGTABLE_LEVELS >= 3
static inline pmd_t __pmd(pmdval_t val)
{
pmdval_t ret;
if (sizeof(pmdval_t) > sizeof(long))
ret = PVOP_CALLEE2(pmdval_t, mmu.make_pmd, val, (u64)val >> 32);
else
ret = PVOP_CALLEE1(pmdval_t, mmu.make_pmd, val); /*covered*/
return (pmd_t) { ret };
}
static inline pmdval_t pmd_val(pmd_t pmd)
{
pmdval_t ret;
if (sizeof(pmdval_t) > sizeof(long))
ret = PVOP_CALLEE2(pmdval_t, mmu.pmd_val,
pmd.pmd, (u64)pmd.pmd >> 32);
else
ret = PVOP_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd); /*covered*/
return ret;
}
static inline void set_pud(pud_t *pudp, pud_t pud)
{
pudval_t val = native_pud_val(pud);
if (sizeof(pudval_t) > sizeof(long))
PVOP_VCALL3(mmu.set_pud, pudp, val, (u64)val >> 32);
else
PVOP_VCALL2(mmu.set_pud, pudp, val);
}
#if CONFIG_PGTABLE_LEVELS >= 4
static inline pud_t __pud(pudval_t val)
{
pudval_t ret;
ret = PVOP_CALLEE1(pudval_t, mmu.make_pud, val);
return (pud_t) { ret };
}
static inline pudval_t pud_val(pud_t pud)
{
return PVOP_CALLEE1(pudval_t, mmu.pud_val, pud.pud); /*covered*/
}
static inline void pud_clear(pud_t *pudp)
{
set_pud(pudp, __pud(0));
}
static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
{
p4dval_t val = native_p4d_val(p4d);
PVOP_VCALL2(mmu.set_p4d, p4dp, val);
}
#if CONFIG_PGTABLE_LEVELS >= 5
static inline p4d_t __p4d(p4dval_t val)
{
p4dval_t ret = PVOP_CALLEE1(p4dval_t, mmu.make_p4d, val);
return (p4d_t) { ret };
}
static inline p4dval_t p4d_val(p4d_t p4d)
{
return PVOP_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d);
}
static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd)
{
PVOP_VCALL2(mmu.set_pgd, pgdp, native_pgd_val(pgd));
}
#define set_pgd(pgdp, pgdval) do { \
if (pgtable_l5_enabled()) \
__set_pgd(pgdp, pgdval); \
else \
set_p4d((p4d_t *)(pgdp), (p4d_t) { (pgdval).pgd }); \
} while (0)
#define pgd_clear(pgdp) do { \
if (pgtable_l5_enabled()) \
set_pgd(pgdp, __pgd(0)); \
} while (0)
#endif /* CONFIG_PGTABLE_LEVELS == 5 */
static inline void p4d_clear(p4d_t *p4dp)
{
set_p4d(p4dp, __p4d(0));
}
#endif /* CONFIG_PGTABLE_LEVELS == 4 */
#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
#ifdef CONFIG_X86_PAE
/* Special-case pte-setting operations for PAE, which can't update a
64-bit pte atomically */
static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
{
PVOP_VCALL3(mmu.set_pte_atomic, ptep, pte.pte, pte.pte >> 32);
}
static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
PVOP_VCALL3(mmu.pte_clear, mm, addr, ptep);
}
static inline void pmd_clear(pmd_t *pmdp)
{
PVOP_VCALL1(mmu.pmd_clear, pmdp);
}
#else /* !CONFIG_X86_PAE */
static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
{
set_pte(ptep, pte); /*covered*/
}
static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
set_pte_at(mm, addr, ptep, __pte(0));
}
static inline void pmd_clear(pmd_t *pmdp)
{
set_pmd(pmdp, __pmd(0));
}
#endif /* CONFIG_X86_PAE */
#define __HAVE_ARCH_START_CONTEXT_SWITCH
static inline void arch_start_context_switch(struct task_struct *prev)
{
PVOP_VCALL1(cpu.start_context_switch, prev);
}
static inline void arch_end_context_switch(struct task_struct *next)
{
PVOP_VCALL1(cpu.end_context_switch, next);
}
#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
static inline void arch_enter_lazy_mmu_mode(void)
{
PVOP_VCALL0(mmu.lazy_mode.enter);
}
static inline void arch_leave_lazy_mmu_mode(void)
{
PVOP_VCALL0(mmu.lazy_mode.leave);
}
static inline void arch_flush_lazy_mmu_mode(void)
{
PVOP_VCALL0(mmu.lazy_mode.flush);
}
static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
phys_addr_t phys, pgprot_t flags)
{
pv_ops.mmu.set_fixmap(idx, phys, flags);
}
#endif
#if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)
static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock,
u32 val)
{
PVOP_VCALL2(lock.queued_spin_lock_slowpath, lock, val);
}
static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock)
{
PVOP_VCALLEE1(lock.queued_spin_unlock, lock);
}
static __always_inline void pv_wait(u8 *ptr, u8 val)
{
PVOP_VCALL2(lock.wait, ptr, val);
}
static __always_inline void pv_kick(int cpu)
{
PVOP_VCALL1(lock.kick, cpu);
}
static __always_inline bool pv_vcpu_is_preempted(long cpu)
{
return PVOP_CALLEE1(bool, lock.vcpu_is_preempted, cpu);
}
void __raw_callee_save___native_queued_spin_unlock(struct qspinlock *lock);
bool __raw_callee_save___native_vcpu_is_preempted(long cpu);
#endif /* SMP && PARAVIRT_SPINLOCKS */
#ifdef CONFIG_X86_32
#define PV_SAVE_REGS "pushl %ecx; pushl %edx;"
#define PV_RESTORE_REGS "popl %edx; popl %ecx;"
/* save and restore all caller-save registers, except return value */
#define PV_SAVE_ALL_CALLER_REGS "pushl %ecx;"
#define PV_RESTORE_ALL_CALLER_REGS "popl %ecx;"
#define PV_FLAGS_ARG "0"
#define PV_EXTRA_CLOBBERS
#define PV_VEXTRA_CLOBBERS
#else
/* save and restore all caller-save registers, except return value */
#define PV_SAVE_ALL_CALLER_REGS \
"push %rcx;" \
"push %rdx;" \
"push %rsi;" \
"push %rdi;" \
"push %r8;" \
"push %r9;" \
"push %r10;" \
"push %r11;"
#define PV_RESTORE_ALL_CALLER_REGS \
"pop %r11;" \
"pop %r10;" \
"pop %r9;" \
"pop %r8;" \
"pop %rdi;" \
"pop %rsi;" \
"pop %rdx;" \
"pop %rcx;"
/* We save some registers, but all of them, that's too much. We clobber all
* caller saved registers but the argument parameter */
#define PV_SAVE_REGS "pushq %%rdi;"
#define PV_RESTORE_REGS "popq %%rdi;"
#define PV_EXTRA_CLOBBERS EXTRA_CLOBBERS, "rcx" , "rdx", "rsi"
#define PV_VEXTRA_CLOBBERS EXTRA_CLOBBERS, "rdi", "rcx" , "rdx", "rsi"
#define PV_FLAGS_ARG "D"
#endif
/*
* Generate a thunk around a function which saves all caller-save
* registers except for the return value. This allows C functions to
* be called from assembler code where fewer than normal registers are
* available. It may also help code generation around calls from C
* code if the common case doesn't use many registers.
*
* When a callee is wrapped in a thunk, the caller can assume that all
* arg regs and all scratch registers are preserved across the
* call. The return value in rax/eax will not be saved, even for void
* functions.
*/
#define PV_THUNK_NAME(func) "__raw_callee_save_" #func
#define PV_CALLEE_SAVE_REGS_THUNK(func) \
extern typeof(func) __raw_callee_save_##func; \
\
asm(".pushsection .text;" \
".globl " PV_THUNK_NAME(func) ";" \
".type " PV_THUNK_NAME(func) ", @function;" \
PV_THUNK_NAME(func) ":" \
FRAME_BEGIN \
PV_SAVE_ALL_CALLER_REGS \
"call " #func ";" \
PV_RESTORE_ALL_CALLER_REGS \
FRAME_END \
"ret;" \
".popsection")
/* Get a reference to a callee-save function */
#define PV_CALLEE_SAVE(func) \
((struct paravirt_callee_save) { __raw_callee_save_##func })
/* Promise that "func" already uses the right calling convention */
#define __PV_IS_CALLEE_SAVE(func) \
((struct paravirt_callee_save) { func })
#ifdef CONFIG_PARAVIRT_XXL
static inline notrace unsigned long arch_local_save_flags(void)
{
return PVOP_CALLEE0(unsigned long, irq.save_fl); /*covered*/
}
static inline notrace void arch_local_irq_restore(unsigned long f)
{
PVOP_VCALLEE1(irq.restore_fl, f); /*covered*/
} /*covered*/
static inline notrace void arch_local_irq_disable(void)
{
PVOP_VCALLEE0(irq.irq_disable); /*covered*/
}
static inline notrace void arch_local_irq_enable(void)
{
PVOP_VCALLEE0(irq.irq_enable); /*covered*/
}
static inline notrace unsigned long arch_local_irq_save(void)
{
unsigned long f;
f = arch_local_save_flags(); /*covered*/
arch_local_irq_disable(); /*covered*/
return f;
}
#endif
/* Make sure as little as possible of this mess escapes. */
#undef PARAVIRT_CALL
#undef __PVOP_CALL
#undef __PVOP_VCALL
#undef PVOP_VCALL0
#undef PVOP_CALL0
#undef PVOP_VCALL1
#undef PVOP_CALL1
#undef PVOP_VCALL2
#undef PVOP_CALL2
#undef PVOP_VCALL3
#undef PVOP_CALL3
#undef PVOP_VCALL4
#undef PVOP_CALL4
extern void default_banner(void);
#else /* __ASSEMBLY__ */
#define _PVSITE(ptype, ops, word, algn) \
771:; \
ops; \
772:; \
.pushsection .parainstructions,"a"; \
.align algn; \
word 771b; \
.byte ptype; \
.byte 772b-771b; \
.popsection
#define COND_PUSH(set, mask, reg) \
.if ((~(set)) & mask); push %reg; .endif
#define COND_POP(set, mask, reg) \
.if ((~(set)) & mask); pop %reg; .endif
#ifdef CONFIG_X86_64
#define PV_SAVE_REGS(set) \
COND_PUSH(set, CLBR_RAX, rax); \
COND_PUSH(set, CLBR_RCX, rcx); \
COND_PUSH(set, CLBR_RDX, rdx); \
COND_PUSH(set, CLBR_RSI, rsi); \
COND_PUSH(set, CLBR_RDI, rdi); \
COND_PUSH(set, CLBR_R8, r8); \
COND_PUSH(set, CLBR_R9, r9); \
COND_PUSH(set, CLBR_R10, r10); \
COND_PUSH(set, CLBR_R11, r11)
#define PV_RESTORE_REGS(set) \
COND_POP(set, CLBR_R11, r11); \
COND_POP(set, CLBR_R10, r10); \
COND_POP(set, CLBR_R9, r9); \
COND_POP(set, CLBR_R8, r8); \
COND_POP(set, CLBR_RDI, rdi); \
COND_POP(set, CLBR_RSI, rsi); \
COND_POP(set, CLBR_RDX, rdx); \
COND_POP(set, CLBR_RCX, rcx); \
COND_POP(set, CLBR_RAX, rax)
#define PARA_PATCH(off) ((off) / 8)
#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .quad, 8)
#define PARA_INDIRECT(addr) *addr(%rip)
#else
#define PV_SAVE_REGS(set) \
COND_PUSH(set, CLBR_EAX, eax); \
COND_PUSH(set, CLBR_EDI, edi); \
COND_PUSH(set, CLBR_ECX, ecx); \
COND_PUSH(set, CLBR_EDX, edx)
#define PV_RESTORE_REGS(set) \
COND_POP(set, CLBR_EDX, edx); \
COND_POP(set, CLBR_ECX, ecx); \
COND_POP(set, CLBR_EDI, edi); \
COND_POP(set, CLBR_EAX, eax)
#define PARA_PATCH(off) ((off) / 4)
#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .long, 4)
#define PARA_INDIRECT(addr) *%cs:addr
#endif
#ifdef CONFIG_PARAVIRT_XXL
#define INTERRUPT_RETURN \
PARA_SITE(PARA_PATCH(PV_CPU_iret), \
ANNOTATE_RETPOLINE_SAFE; \
jmp PARA_INDIRECT(pv_ops+PV_CPU_iret);)
#define DISABLE_INTERRUPTS(clobbers) \
PARA_SITE(PARA_PATCH(PV_IRQ_irq_disable), \
PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
ANNOTATE_RETPOLINE_SAFE; \
call PARA_INDIRECT(pv_ops+PV_IRQ_irq_disable); \
PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
#define ENABLE_INTERRUPTS(clobbers) \
PARA_SITE(PARA_PATCH(PV_IRQ_irq_enable), \
PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
ANNOTATE_RETPOLINE_SAFE; \
call PARA_INDIRECT(pv_ops+PV_IRQ_irq_enable); \
PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
#endif
#ifdef CONFIG_X86_64
#ifdef CONFIG_PARAVIRT_XXL
/*
* If swapgs is used while the userspace stack is still current,
* there's no way to call a pvop. The PV replacement *must* be
* inlined, or the swapgs instruction must be trapped and emulated.
*/
#define SWAPGS_UNSAFE_STACK \
PARA_SITE(PARA_PATCH(PV_CPU_swapgs), swapgs)
/*
* Note: swapgs is very special, and in practise is either going to be
* implemented with a single "swapgs" instruction or something very
* special. Either way, we don't need to save any registers for
* it.
*/
#define SWAPGS \
PARA_SITE(PARA_PATCH(PV_CPU_swapgs), \
ANNOTATE_RETPOLINE_SAFE; \
call PARA_INDIRECT(pv_ops+PV_CPU_swapgs); \
)
#endif
#define GET_CR2_INTO_RAX \
ANNOTATE_RETPOLINE_SAFE; \
call PARA_INDIRECT(pv_ops+PV_MMU_read_cr2);
#ifdef CONFIG_PARAVIRT_XXL
#define USERGS_SYSRET64 \
PARA_SITE(PARA_PATCH(PV_CPU_usergs_sysret64), \
ANNOTATE_RETPOLINE_SAFE; \
jmp PARA_INDIRECT(pv_ops+PV_CPU_usergs_sysret64);)
#ifdef CONFIG_DEBUG_ENTRY
#define SAVE_FLAGS(clobbers) \
PARA_SITE(PARA_PATCH(PV_IRQ_save_fl), \
PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
ANNOTATE_RETPOLINE_SAFE; \
call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl); \
PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
#endif
#endif
#endif /* CONFIG_X86_32 */
#endif /* __ASSEMBLY__ */
#else /* CONFIG_PARAVIRT */
# define default_banner x86_init_noop
#endif /* !CONFIG_PARAVIRT */
#ifndef __ASSEMBLY__
#ifndef CONFIG_PARAVIRT_XXL
static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm,
struct mm_struct *mm)
{
}
#endif
#ifndef CONFIG_PARAVIRT
static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
{
}
#endif
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_PARAVIRT_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGALLOC_H
#define _ASM_X86_PGALLOC_H
#include <linux/threads.h>
#include <linux/mm.h> /* for struct page */
#include <linux/pagemap.h>
static inline int __paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
#define paravirt_pgd_alloc(mm) __paravirt_pgd_alloc(mm)
static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {}
static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {}
static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {}
static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
unsigned long start, unsigned long count) {}
static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {}
static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn) {}
static inline void paravirt_release_pte(unsigned long pfn) {}
static inline void paravirt_release_pmd(unsigned long pfn) {}
static inline void paravirt_release_pud(unsigned long pfn) {}
static inline void paravirt_release_p4d(unsigned long pfn) {}
#endif
/*
* Flags to use when allocating a user page table page.
*/
extern gfp_t __userpte_alloc_gfp;
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
* Instead of one PGD, we acquire two PGDs. Being order-1, it is
* both 8k in size and 8k-aligned. That lets us just flip bit 12
* in a pointer to swap between the two 4k halves.
*/
#define PGD_ALLOCATION_ORDER 1
#else
#define PGD_ALLOCATION_ORDER 0
#endif
/*
* Allocate and free page tables.
*/
extern pgd_t *pgd_alloc(struct mm_struct *);
extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
extern pte_t *pte_alloc_one_kernel(struct mm_struct *);
extern pgtable_t pte_alloc_one(struct mm_struct *);
/* Should really implement gc for free page table pages. This could be
done with a reference count in struct page. */
static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
{
BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
free_page((unsigned long)pte);
}
static inline void pte_free(struct mm_struct *mm, struct page *pte)
{
pgtable_page_dtor(pte); /*covered*/
__free_page(pte);
}
extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
unsigned long address)
{
___pte_free_tlb(tlb, pte);
}
static inline void pmd_populate_kernel(struct mm_struct *mm,
pmd_t *pmd, pte_t *pte)
{
paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT); /*covered*/
set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); /*covered*/
}
static inline void pmd_populate_kernel_safe(struct mm_struct *mm,
pmd_t *pmd, pte_t *pte)
{
paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT);
set_pmd_safe(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
}
static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
struct page *pte)
{
unsigned long pfn = page_to_pfn(pte);
paravirt_alloc_pte(mm, pfn);
set_pmd(pmd, __pmd(((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
}
#define pmd_pgtable(pmd) pmd_page(pmd)
#if CONFIG_PGTABLE_LEVELS > 2
static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
{
struct page *page;
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO;
if (mm == &init_mm)
gfp &= ~__GFP_ACCOUNT;
page = alloc_pages(gfp, 0);
if (!page)
return NULL;
if (!pgtable_pmd_page_ctor(page)) {
__free_pages(page, 0);
return NULL;
}
return (pmd_t *)page_address(page);
}
static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
{
BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
pgtable_pmd_page_dtor(virt_to_page(pmd));
free_page((unsigned long)pmd);
}
extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
unsigned long address)
{
___pmd_free_tlb(tlb, pmd);
}
#ifdef CONFIG_X86_PAE
extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd);
#else /* !CONFIG_X86_PAE */
static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
{
paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
}
static inline void pud_populate_safe(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
{
paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
set_pud_safe(pud, __pud(_PAGE_TABLE | __pa(pmd)));
}
#endif /* CONFIG_X86_PAE */
#if CONFIG_PGTABLE_LEVELS > 3
static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
{
paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
set_p4d(p4d, __p4d(_PAGE_TABLE | __pa(pud)));
}
static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
{
paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
set_p4d_safe(p4d, __p4d(_PAGE_TABLE | __pa(pud)));
}
static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
{
gfp_t gfp = GFP_KERNEL_ACCOUNT;
if (mm == &init_mm)
gfp &= ~__GFP_ACCOUNT;
return (pud_t *)get_zeroed_page(gfp);
}
static inline void pud_free(struct mm_struct *mm, pud_t *pud)
{
BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
free_page((unsigned long)pud);
}
extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
unsigned long address)
{
___pud_free_tlb(tlb, pud);
}
#if CONFIG_PGTABLE_LEVELS > 4
static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
{
if (!pgtable_l5_enabled())
return;
paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d)));
}
static inline void pgd_populate_safe(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
{
if (!pgtable_l5_enabled())
return;
paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT);
set_pgd_safe(pgd, __pgd(_PAGE_TABLE | __pa(p4d)));
}
static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr)
{
gfp_t gfp = GFP_KERNEL_ACCOUNT;
if (mm == &init_mm)
gfp &= ~__GFP_ACCOUNT;
return (p4d_t *)get_zeroed_page(gfp);
}
static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d)
{
if (!pgtable_l5_enabled())
return;
BUG_ON((unsigned long)p4d & (PAGE_SIZE-1));
free_page((unsigned long)p4d);
}
extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d);
static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
unsigned long address)
{
if (pgtable_l5_enabled())
___p4d_free_tlb(tlb, p4d);
}
#endif /* CONFIG_PGTABLE_LEVELS > 4 */
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
#endif /* CONFIG_PGTABLE_LEVELS > 2 */
#endif /* _ASM_X86_PGALLOC_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_PGTABLE_INVERT_H
#define _ASM_PGTABLE_INVERT_H 1
#ifndef __ASSEMBLY__
/*
* A clear pte value is special, and doesn't get inverted.
*
* Note that even users that only pass a pgprot_t (rather
* than a full pte) won't trigger the special zero case,
* because even PAGE_NONE has _PAGE_PROTNONE | _PAGE_ACCESSED
* set. So the all zero case really is limited to just the
* cleared page table entry case.
*/
static inline bool __pte_needs_invert(u64 val)
{
return val && !(val & _PAGE_PRESENT); /*covered*/
}
/* Get a mask to xor with the page table entry to get the correct pfn. */
static inline u64 protnone_mask(u64 val)
{
return __pte_needs_invert(val) ? ~0ull : 0; /*covered*/
}
static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask)
{
/*
* When a PTE transitions from NONE to !NONE or vice-versa
* invert the PFN part to stop speculation.
* pte_pfn undoes this when needed.
*/
if (__pte_needs_invert(oldval) != __pte_needs_invert(val))
val = (val & ~mask) | (~val & mask);
return val;
}
#endif /* __ASSEMBLY__ */
#endif
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGTABLE_H
#define _ASM_X86_PGTABLE_H
#include <linux/mem_encrypt.h>
#include <asm/page.h>
#include <asm/pgtable_types.h>
/*
* Macro to mark a page protection value as UC-
*/
#define pgprot_noncached(prot) \
((boot_cpu_data.x86 > 3) \
? (__pgprot(pgprot_val(prot) | \
cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS))) \
: (prot))
/*
* Macros to add or remove encryption attribute
*/
#define pgprot_encrypted(prot) __pgprot(__sme_set(pgprot_val(prot)))
#define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot)))
#ifndef __ASSEMBLY__
#include <asm/x86_init.h>
extern pgd_t early_top_pgt[PTRS_PER_PGD];
int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user);
void ptdump_walk_pgd_level_checkwx(void);
void ptdump_walk_user_pgd_level_checkwx(void);
#ifdef CONFIG_DEBUG_WX
#define debug_checkwx() ptdump_walk_pgd_level_checkwx()
#define debug_checkwx_user() ptdump_walk_user_pgd_level_checkwx()
#else
#define debug_checkwx() do { } while (0)
#define debug_checkwx_user() do { } while (0)
#endif
/*
* ZERO_PAGE is a global shared page that is always zero: used
* for zero-mapped memory areas etc..
*/
extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]
__visible;
#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
extern spinlock_t pgd_lock;
extern struct list_head pgd_list;
extern struct mm_struct *pgd_page_get_mm(struct page *page);
extern pmdval_t early_pmd_flags;
#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else /* !CONFIG_PARAVIRT_XXL */
#define set_pte(ptep, pte) native_set_pte(ptep, pte)
#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
#define set_pte_atomic(ptep, pte) \
native_set_pte_atomic(ptep, pte)
#define set_pmd(pmdp, pmd) native_set_pmd(pmdp, pmd)
#ifndef __PAGETABLE_P4D_FOLDED
#define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd)
#define pgd_clear(pgd) (pgtable_l5_enabled() ? native_pgd_clear(pgd) : 0)
#endif
#ifndef set_p4d
# define set_p4d(p4dp, p4d) native_set_p4d(p4dp, p4d)
#endif
#ifndef __PAGETABLE_PUD_FOLDED
#define p4d_clear(p4d) native_p4d_clear(p4d)
#endif
#ifndef set_pud
# define set_pud(pudp, pud) native_set_pud(pudp, pud)
#endif
#ifndef __PAGETABLE_PUD_FOLDED
#define pud_clear(pud) native_pud_clear(pud)
#endif
#define pte_clear(mm, addr, ptep) native_pte_clear(mm, addr, ptep)
#define pmd_clear(pmd) native_pmd_clear(pmd)
#define pgd_val(x) native_pgd_val(x)
#define __pgd(x) native_make_pgd(x)
#ifndef __PAGETABLE_P4D_FOLDED
#define p4d_val(x) native_p4d_val(x)
#define __p4d(x) native_make_p4d(x)
#endif
#ifndef __PAGETABLE_PUD_FOLDED
#define pud_val(x) native_pud_val(x)
#define __pud(x) native_make_pud(x)
#endif
#ifndef __PAGETABLE_PMD_FOLDED
#define pmd_val(x) native_pmd_val(x)
#define __pmd(x) native_make_pmd(x)
#endif
#define pte_val(x) native_pte_val(x)
#define __pte(x) native_make_pte(x)
#define arch_end_context_switch(prev) do {} while(0)
#endif /* CONFIG_PARAVIRT_XXL */
/*
* The following only work if pte_present() is true.
* Undefined behaviour if not..
*/
static inline int pte_dirty(pte_t pte)
{
return pte_flags(pte) & _PAGE_DIRTY;
}
static inline u32 read_pkru(void)
{
if (boot_cpu_has(X86_FEATURE_OSPKE)) /*covered*/
return __read_pkru();
return 0;
}
static inline void write_pkru(u32 pkru)
{
if (boot_cpu_has(X86_FEATURE_OSPKE))
__write_pkru(pkru);
}
static inline int pte_young(pte_t pte)
{
return pte_flags(pte) & _PAGE_ACCESSED;
}
static inline int pmd_dirty(pmd_t pmd)
{
return pmd_flags(pmd) & _PAGE_DIRTY;
}
static inline int pmd_young(pmd_t pmd)
{
return pmd_flags(pmd) & _PAGE_ACCESSED;
}
static inline int pud_dirty(pud_t pud)
{
return pud_flags(pud) & _PAGE_DIRTY;
}
static inline int pud_young(pud_t pud)
{
return pud_flags(pud) & _PAGE_ACCESSED;
}
static inline int pte_write(pte_t pte)
{
return pte_flags(pte) & _PAGE_RW; /*covered*/
}
static inline int pte_huge(pte_t pte)
{
return pte_flags(pte) & _PAGE_PSE;
}
static inline int pte_global(pte_t pte)
{
return pte_flags(pte) & _PAGE_GLOBAL;
}
static inline int pte_exec(pte_t pte)
{
return !(pte_flags(pte) & _PAGE_NX);
}
static inline int pte_special(pte_t pte)
{
return pte_flags(pte) & _PAGE_SPECIAL; /*covered*/
}
/* Entries that were set to PROT_NONE are inverted */
static inline u64 protnone_mask(u64 val);
static inline unsigned long pte_pfn(pte_t pte)
{
phys_addr_t pfn = pte_val(pte); /*covered*/
pfn ^= protnone_mask(pfn); /*covered*/
return (pfn & PTE_PFN_MASK) >> PAGE_SHIFT; /*covered*/
}
static inline unsigned long pmd_pfn(pmd_t pmd)
{
phys_addr_t pfn = pmd_val(pmd); /*covered*/
pfn ^= protnone_mask(pfn); /*covered*/
return (pfn & pmd_pfn_mask(pmd)) >> PAGE_SHIFT; /*covered*/
}
static inline unsigned long pud_pfn(pud_t pud)
{
phys_addr_t pfn = pud_val(pud); /*covered*/
pfn ^= protnone_mask(pfn); /*covered*/
return (pfn & pud_pfn_mask(pud)) >> PAGE_SHIFT; /*covered*/
}
static inline unsigned long p4d_pfn(p4d_t p4d)
{
return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT;
}
static inline unsigned long pgd_pfn(pgd_t pgd)
{
return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT;
}
static inline int p4d_large(p4d_t p4d)
{
/* No 512 GiB pages yet */
return 0;
}
#define pte_page(pte) pfn_to_page(pte_pfn(pte))
static inline int pmd_large(pmd_t pte)
{
return pmd_flags(pte) & _PAGE_PSE; /*covered*/
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline int pmd_trans_huge(pmd_t pmd)
{
return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE; /*covered*/
}
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static inline int pud_trans_huge(pud_t pud)
{
return (pud_val(pud) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
}
#endif
#define has_transparent_hugepage has_transparent_hugepage
static inline int has_transparent_hugepage(void)
{
return boot_cpu_has(X86_FEATURE_PSE);
}
#ifdef __HAVE_ARCH_PTE_DEVMAP
static inline int pmd_devmap(pmd_t pmd)
{
return !!(pmd_val(pmd) & _PAGE_DEVMAP); /*covered*/
}
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static inline int pud_devmap(pud_t pud)
{
return !!(pud_val(pud) & _PAGE_DEVMAP); /*covered*/
}
#else
static inline int pud_devmap(pud_t pud)
{
return 0;
}
#endif
static inline int pgd_devmap(pgd_t pgd)
{
return 0;
}
#endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
{
pteval_t v = native_pte_val(pte);
return native_make_pte(v | set); /*covered*/
}
static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
{
pteval_t v = native_pte_val(pte);
return native_make_pte(v & ~clear);
}
static inline pte_t pte_mkclean(pte_t pte)
{
return pte_clear_flags(pte, _PAGE_DIRTY);
}
static inline pte_t pte_mkold(pte_t pte)
{
return pte_clear_flags(pte, _PAGE_ACCESSED);
}
static inline pte_t pte_wrprotect(pte_t pte)
{
return pte_clear_flags(pte, _PAGE_RW);
}
static inline pte_t pte_mkexec(pte_t pte)
{
return pte_clear_flags(pte, _PAGE_NX);
}
static inline pte_t pte_mkdirty(pte_t pte)
{
return pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
}
static inline pte_t pte_mkyoung(pte_t pte)
{
return pte_set_flags(pte, _PAGE_ACCESSED); /*covered*/
}
static inline pte_t pte_mkwrite(pte_t pte)
{
return pte_set_flags(pte, _PAGE_RW); /*covered*/
}
static inline pte_t pte_mkhuge(pte_t pte)
{
return pte_set_flags(pte, _PAGE_PSE);
}
static inline pte_t pte_clrhuge(pte_t pte)
{
return pte_clear_flags(pte, _PAGE_PSE);
}
static inline pte_t pte_mkglobal(pte_t pte)
{
return pte_set_flags(pte, _PAGE_GLOBAL);
}
static inline pte_t pte_clrglobal(pte_t pte)
{
return pte_clear_flags(pte, _PAGE_GLOBAL);
}
static inline pte_t pte_mkspecial(pte_t pte)
{
return pte_set_flags(pte, _PAGE_SPECIAL); /*covered*/
}
static inline pte_t pte_mkdevmap(pte_t pte)
{
return pte_set_flags(pte, _PAGE_SPECIAL|_PAGE_DEVMAP);
}
static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
{
pmdval_t v = native_pmd_val(pmd);
return native_make_pmd(v | set); /*covered*/
}
static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
{
pmdval_t v = native_pmd_val(pmd);
return native_make_pmd(v & ~clear);
}
static inline pmd_t pmd_mkold(pmd_t pmd)
{
return pmd_clear_flags(pmd, _PAGE_ACCESSED);
}
static inline pmd_t pmd_mkclean(pmd_t pmd)
{
return pmd_clear_flags(pmd, _PAGE_DIRTY);
}
static inline pmd_t pmd_wrprotect(pmd_t pmd)
{
return pmd_clear_flags(pmd, _PAGE_RW);
}
static inline pmd_t pmd_mkdirty(pmd_t pmd)
{
return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
}
static inline pmd_t pmd_mkdevmap(pmd_t pmd)
{
return pmd_set_flags(pmd, _PAGE_DEVMAP);
}
static inline pmd_t pmd_mkhuge(pmd_t pmd)
{
return pmd_set_flags(pmd, _PAGE_PSE);
}
static inline pmd_t pmd_mkyoung(pmd_t pmd)
{
return pmd_set_flags(pmd, _PAGE_ACCESSED); /*covered*/
}
static inline pmd_t pmd_mkwrite(pmd_t pmd)
{
return pmd_set_flags(pmd, _PAGE_RW); /*covered*/
}
static inline pud_t pud_set_flags(pud_t pud, pudval_t set)
{
pudval_t v = native_pud_val(pud);
return native_make_pud(v | set);
}
static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear)
{
pudval_t v = native_pud_val(pud);
return native_make_pud(v & ~clear);
}
static inline pud_t pud_mkold(pud_t pud)
{
return pud_clear_flags(pud, _PAGE_ACCESSED);
}
static inline pud_t pud_mkclean(pud_t pud)
{
return pud_clear_flags(pud, _PAGE_DIRTY);
}
static inline pud_t pud_wrprotect(pud_t pud)
{
return pud_clear_flags(pud, _PAGE_RW);
}
static inline pud_t pud_mkdirty(pud_t pud)
{
return pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
}
static inline pud_t pud_mkdevmap(pud_t pud)
{
return pud_set_flags(pud, _PAGE_DEVMAP);
}
static inline pud_t pud_mkhuge(pud_t pud)
{
return pud_set_flags(pud, _PAGE_PSE);
}
static inline pud_t pud_mkyoung(pud_t pud)
{
return pud_set_flags(pud, _PAGE_ACCESSED);
}
static inline pud_t pud_mkwrite(pud_t pud)
{
return pud_set_flags(pud, _PAGE_RW);
}
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
static inline int pte_soft_dirty(pte_t pte)
{
return pte_flags(pte) & _PAGE_SOFT_DIRTY;
}
static inline int pmd_soft_dirty(pmd_t pmd)
{
return pmd_flags(pmd) & _PAGE_SOFT_DIRTY;
}
static inline int pud_soft_dirty(pud_t pud)
{
return pud_flags(pud) & _PAGE_SOFT_DIRTY;
}
static inline pte_t pte_mksoft_dirty(pte_t pte)
{
return pte_set_flags(pte, _PAGE_SOFT_DIRTY);
}
static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
{
return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY);
}
static inline pud_t pud_mksoft_dirty(pud_t pud)
{
return pud_set_flags(pud, _PAGE_SOFT_DIRTY);
}
static inline pte_t pte_clear_soft_dirty(pte_t pte)
{
return pte_clear_flags(pte, _PAGE_SOFT_DIRTY);
}
static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
{
return pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);
}
static inline pud_t pud_clear_soft_dirty(pud_t pud)
{
return pud_clear_flags(pud, _PAGE_SOFT_DIRTY);
}
#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
/*
* Mask out unsupported bits in a present pgprot. Non-present pgprots
* can use those bits for other purposes, so leave them be.
*/
static inline pgprotval_t massage_pgprot(pgprot_t pgprot)
{
pgprotval_t protval = pgprot_val(pgprot);
if (protval & _PAGE_PRESENT)
protval &= __supported_pte_mask; /*covered*/
return protval;
}
static inline pgprotval_t check_pgprot(pgprot_t pgprot)
{
pgprotval_t massaged_val = massage_pgprot(pgprot); /*covered*/
/* mmdebug.h can not be included here because of dependencies */
#ifdef CONFIG_DEBUG_VM
WARN_ONCE(pgprot_val(pgprot) != massaged_val, /*covered*/
"attempted to set unsupported pgprot: %016llx "
"bits: %016llx supported: %016llx\n",
(u64)pgprot_val(pgprot),
(u64)pgprot_val(pgprot) ^ massaged_val,
(u64)__supported_pte_mask);
#endif
return massaged_val;
}
static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
{
phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT; /*covered*/
pfn ^= protnone_mask(pgprot_val(pgprot)); /*covered*/
pfn &= PTE_PFN_MASK; /*covered*/
return __pte(pfn | check_pgprot(pgprot)); /*covered*/
}
static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
{
phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT;
pfn ^= protnone_mask(pgprot_val(pgprot)); /*covered*/
pfn &= PHYSICAL_PMD_PAGE_MASK; /*covered*/
return __pmd(pfn | check_pgprot(pgprot)); /*covered*/
}
static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot)
{
phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT;
pfn ^= protnone_mask(pgprot_val(pgprot));
pfn &= PHYSICAL_PUD_PAGE_MASK;
return __pud(pfn | check_pgprot(pgprot));
}
static inline pmd_t pmd_mknotpresent(pmd_t pmd)
{
return pfn_pmd(pmd_pfn(pmd),
__pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE)));
}
static inline pud_t pud_mknotpresent(pud_t pud)
{
return pfn_pud(pud_pfn(pud),
__pgprot(pud_flags(pud) & ~(_PAGE_PRESENT|_PAGE_PROTNONE)));
}
static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask);
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
pteval_t val = pte_val(pte), oldval = val;
/*
* Chop off the NX bit (if present), and add the NX portion of
* the newprot (if present):
*/
val &= _PAGE_CHG_MASK;
val |= check_pgprot(newprot) & ~_PAGE_CHG_MASK;
val = flip_protnone_guard(oldval, val, PTE_PFN_MASK);
return __pte(val);
}
static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
pmdval_t val = pmd_val(pmd), oldval = val;
val &= _HPAGE_CHG_MASK;
val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK;
val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK);
return __pmd(val);
}
/* mprotect needs to preserve PAT bits when updating vm_page_prot */
#define pgprot_modify pgprot_modify
static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
{
pgprotval_t preservebits = pgprot_val(oldprot) & _PAGE_CHG_MASK;
pgprotval_t addbits = pgprot_val(newprot);
return __pgprot(preservebits | addbits);
}
#define pte_pgprot(x) __pgprot(pte_flags(x))
#define pmd_pgprot(x) __pgprot(pmd_flags(x))
#define pud_pgprot(x) __pgprot(pud_flags(x))
#define p4d_pgprot(x) __pgprot(p4d_flags(x))
#define canon_pgprot(p) __pgprot(massage_pgprot(p))
static inline pgprot_t arch_filter_pgprot(pgprot_t prot)
{
return canon_pgprot(prot);
}
static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
enum page_cache_mode pcm,
enum page_cache_mode new_pcm)
{
/*
* PAT type is always WB for untracked ranges, so no need to check.
*/
if (x86_platform.is_untracked_pat_range(paddr, paddr + size))
return 1;
/*
* Certain new memtypes are not allowed with certain
* requested memtype:
* - request is uncached, return cannot be write-back
* - request is write-combine, return cannot be write-back
* - request is write-through, return cannot be write-back
* - request is write-through, return cannot be write-combine
*/
if ((pcm == _PAGE_CACHE_MODE_UC_MINUS &&
new_pcm == _PAGE_CACHE_MODE_WB) ||
(pcm == _PAGE_CACHE_MODE_WC &&
new_pcm == _PAGE_CACHE_MODE_WB) ||
(pcm == _PAGE_CACHE_MODE_WT &&
new_pcm == _PAGE_CACHE_MODE_WB) ||
(pcm == _PAGE_CACHE_MODE_WT &&
new_pcm == _PAGE_CACHE_MODE_WC)) {
return 0;
}
return 1;
}
pmd_t *populate_extra_pmd(unsigned long vaddr);
pte_t *populate_extra_pte(unsigned long vaddr);
#ifdef CONFIG_PAGE_TABLE_ISOLATION
pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd);
/*
* Take a PGD location (pgdp) and a pgd value that needs to be set there.
* Populates the user and returns the resulting PGD that must be set in
* the kernel copy of the page tables.
*/
static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
{
if (!static_cpu_has(X86_FEATURE_PTI))
return pgd;
return __pti_set_user_pgtbl(pgdp, pgd);
}
#else /* CONFIG_PAGE_TABLE_ISOLATION */
static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
{
return pgd;
}
#endif /* CONFIG_PAGE_TABLE_ISOLATION */
#endif /* __ASSEMBLY__ */
#ifdef CONFIG_X86_32
# include <asm/pgtable_32.h>
#else
# include <asm/pgtable_64.h>
#endif
#ifndef __ASSEMBLY__
#include <linux/mm_types.h>
#include <linux/mmdebug.h>
#include <linux/log2.h>
#include <asm/fixmap.h>
static inline int pte_none(pte_t pte)
{
return !(pte.pte & ~(_PAGE_KNL_ERRATUM_MASK)); /*covered*/
}
#define __HAVE_ARCH_PTE_SAME
static inline int pte_same(pte_t a, pte_t b)
{
return a.pte == b.pte;
}
static inline int pte_present(pte_t a)
{
return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); /*covered*/
}
#ifdef __HAVE_ARCH_PTE_DEVMAP
static inline int pte_devmap(pte_t a)
{
return (pte_flags(a) & _PAGE_DEVMAP) == _PAGE_DEVMAP;
}
#endif
#define pte_accessible pte_accessible
static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
{
if (pte_flags(a) & _PAGE_PRESENT)
return true;
if ((pte_flags(a) & _PAGE_PROTNONE) &&
mm_tlb_flush_pending(mm))
return true;
return false;
}
static inline int pmd_present(pmd_t pmd)
{
/*
* Checking for _PAGE_PSE is needed too because
* split_huge_page will temporarily clear the present bit (but
* the _PAGE_PSE flag will remain set at all times while the
* _PAGE_PRESENT bit is clear).
*/
return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE); /*covered*/
}
#ifdef CONFIG_NUMA_BALANCING
/*
* These work without NUMA balancing but the kernel does not care. See the
* comment in include/asm-generic/pgtable.h
*/
static inline int pte_protnone(pte_t pte)
{
return (pte_flags(pte) & (_PAGE_PROTNONE | _PAGE_PRESENT)) /*covered*/
== _PAGE_PROTNONE;
}
static inline int pmd_protnone(pmd_t pmd)
{
return (pmd_flags(pmd) & (_PAGE_PROTNONE | _PAGE_PRESENT)) /*covered*/
== _PAGE_PROTNONE;
}
#endif /* CONFIG_NUMA_BALANCING */
static inline int pmd_none(pmd_t pmd)
{
/* Only check low word on 32-bit platforms, since it might be
out of sync with upper half. */
unsigned long val = native_pmd_val(pmd);
return (val & ~_PAGE_KNL_ERRATUM_MASK) == 0; /*covered*/
}
static inline unsigned long pmd_page_vaddr(pmd_t pmd)
{
return (unsigned long)__va(pmd_val(pmd) & pmd_pfn_mask(pmd)); /*covered*/
}
/*
* Currently stuck as a macro due to indirect forward reference to
* linux/mmzone.h's __section_mem_map_addr() definition:
*/
#define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd))
/*
* the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
*
* this macro returns the index of the entry in the pmd page which would
* control the given virtual address
*/
static inline unsigned long pmd_index(unsigned long address)
{
return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); /*covered*/
}
/*
* Conversion functions: convert a page and protection to a page entry,
* and a page entry and page directory to the page they refer to.
*
* (Currently stuck as a macro because of indirect forward reference
* to linux/mm.h:page_to_nid())
*/
#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
/*
* the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE]
*
* this function returns the index of the entry in the pte page which would
* control the given virtual address
*/
static inline unsigned long pte_index(unsigned long address)
{
return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); /*covered*/
}
static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
{
return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address); /*covered*/
}
static inline int pmd_bad(pmd_t pmd)
{
return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; /*covered*/
}
static inline unsigned long pages_to_mb(unsigned long npg)
{
return npg >> (20 - PAGE_SHIFT);
}
#if CONFIG_PGTABLE_LEVELS > 2
static inline int pud_none(pud_t pud)
{
return (native_pud_val(pud) & ~(_PAGE_KNL_ERRATUM_MASK)) == 0; /*covered*/
}
static inline int pud_present(pud_t pud)
{
return pud_flags(pud) & _PAGE_PRESENT; /*covered*/
}
static inline unsigned long pud_page_vaddr(pud_t pud)
{
return (unsigned long)__va(pud_val(pud) & pud_pfn_mask(pud)); /*covered*/
}
/*
* Currently stuck as a macro due to indirect forward reference to
* linux/mmzone.h's __section_mem_map_addr() definition:
*/
#define pud_page(pud) pfn_to_page(pud_pfn(pud))
/* Find an entry in the second-level page table.. */
static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
{
return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address); /*covered*/
}
static inline int pud_large(pud_t pud)
{
return (pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) == /*covered*/
(_PAGE_PSE | _PAGE_PRESENT);
}
static inline int pud_bad(pud_t pud)
{
return (pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0; /*covered*/
}
#else
static inline int pud_large(pud_t pud)
{
return 0;
}
#endif /* CONFIG_PGTABLE_LEVELS > 2 */
static inline unsigned long pud_index(unsigned long address)
{
return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); /*covered*/
}
#if CONFIG_PGTABLE_LEVELS > 3
static inline int p4d_none(p4d_t p4d)
{
return (native_p4d_val(p4d) & ~(_PAGE_KNL_ERRATUM_MASK)) == 0;
}
static inline int p4d_present(p4d_t p4d)
{
return p4d_flags(p4d) & _PAGE_PRESENT;
}
static inline unsigned long p4d_page_vaddr(p4d_t p4d)
{
return (unsigned long)__va(p4d_val(p4d) & p4d_pfn_mask(p4d)); /*covered*/
}
/*
* Currently stuck as a macro due to indirect forward reference to
* linux/mmzone.h's __section_mem_map_addr() definition:
*/
#define p4d_page(p4d) pfn_to_page(p4d_pfn(p4d))
/* Find an entry in the third-level page table.. */
static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
{
return (pud_t *)p4d_page_vaddr(*p4d) + pud_index(address); /*covered*/
}
static inline int p4d_bad(p4d_t p4d)
{
unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER;
if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
ignore_flags |= _PAGE_NX;
return (p4d_flags(p4d) & ~ignore_flags) != 0; /*covered*/
}
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
static inline unsigned long p4d_index(unsigned long address)
{
return (address >> P4D_SHIFT) & (PTRS_PER_P4D - 1);
}
#if CONFIG_PGTABLE_LEVELS > 4
static inline int pgd_present(pgd_t pgd)
{
if (!pgtable_l5_enabled())
return 1;
return pgd_flags(pgd) & _PAGE_PRESENT;
}
static inline unsigned long pgd_page_vaddr(pgd_t pgd)
{
return (unsigned long)__va((unsigned long)pgd_val(pgd) & PTE_PFN_MASK);
}
/*
* Currently stuck as a macro due to indirect forward reference to
* linux/mmzone.h's __section_mem_map_addr() definition:
*/
#define pgd_page(pgd) pfn_to_page(pgd_pfn(pgd))
/* to find an entry in a page-table-directory. */
static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
{
if (!pgtable_l5_enabled())
return (p4d_t *)pgd;
return (p4d_t *)pgd_page_vaddr(*pgd) + p4d_index(address);
}
static inline int pgd_bad(pgd_t pgd)
{
unsigned long ignore_flags = _PAGE_USER;
if (!pgtable_l5_enabled())
return 0;
if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
ignore_flags |= _PAGE_NX;
return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
}
static inline int pgd_none(pgd_t pgd)
{
if (!pgtable_l5_enabled())
return 0;
/*
* There is no need to do a workaround for the KNL stray
* A/D bit erratum here. PGDs only point to page tables
* except on 32-bit non-PAE which is not supported on
* KNL.
*/
return !native_pgd_val(pgd);
}
#endif /* CONFIG_PGTABLE_LEVELS > 4 */
#endif /* __ASSEMBLY__ */
/*
* the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
*
* this macro returns the index of the entry in the pgd page which would
* control the given virtual address
*/
#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
/*
* pgd_offset() returns a (pgd_t *)
* pgd_index() is used get the offset into the pgd page's array of pgd_t's;
*/
#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address)))
/*
* a shortcut to get a pgd_t in a given mm
*/
#define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address))
/*
* a shortcut which implies the use of the kernel's pgd, instead
* of a process's
*/
#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
#define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)
#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)
#ifndef __ASSEMBLY__
extern int direct_gbpages;
void init_mem_mapping(void);
void early_alloc_pgt_buf(void);
extern void memblock_find_dma_reserve(void);
#ifdef CONFIG_X86_64
/* Realmode trampoline initialization. */
extern pgd_t trampoline_pgd_entry;
static inline void __meminit init_trampoline_default(void)
{
/* Default trampoline pgd value */
trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
}
# ifdef CONFIG_RANDOMIZE_MEMORY
void __meminit init_trampoline(void);
# else
# define init_trampoline init_trampoline_default
# endif
#else
static inline void init_trampoline(void) { }
#endif
/* local pte updates need not use xchg for locking */
static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
{
pte_t res = *ptep;
/* Pure native function needs no input for mm, addr */
native_pte_clear(NULL, 0, ptep);
return res;
}
static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp)
{
pmd_t res = *pmdp;
native_pmd_clear(pmdp);
return res;
}
static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp)
{
pud_t res = *pudp;
native_pud_clear(pudp);
return res;
}
static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep , pte_t pte)
{
native_set_pte(ptep, pte);
}
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
{
set_pmd(pmdp, pmd); /*covered*/
}
static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
pud_t *pudp, pud_t pud)
{
native_set_pud(pudp, pud);
}
/*
* We only update the dirty/accessed state if we set
* the dirty bit by hand in the kernel, since the hardware
* will do the accessed bit for us, and we don't want to
* race with other CPU's that might be updating the dirty
* bit at the same time.
*/
struct vm_area_struct;
#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
extern int ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
pte_t entry, int dirty);
#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep);
#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
extern int ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep);
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
pte_t pte = native_ptep_get_and_clear(ptep); /*covered*/
return pte;
}
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
unsigned long addr, pte_t *ptep,
int full)
{
pte_t pte;
if (full) {
/*
* Full address destruction in progress; paravirt does not
* care about updates and native needs no locking
*/
pte = native_local_ptep_get_and_clear(ptep);
} else {
pte = ptep_get_and_clear(mm, addr, ptep);
}
return pte;
}
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
static inline void ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte);
}
#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0)
#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
extern int pmdp_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp,
pmd_t entry, int dirty);
extern int pudp_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pud_t *pudp,
pud_t entry, int dirty);
#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp);
extern int pudp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pud_t *pudp);
#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp);
#define pmd_write pmd_write
static inline int pmd_write(pmd_t pmd)
{
return pmd_flags(pmd) & _PAGE_RW; /*covered*/
}
#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp)
{
return native_pmdp_get_and_clear(pmdp); /*covered*/
}
#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
unsigned long addr, pud_t *pudp)
{
return native_pudp_get_and_clear(pudp);
}
#define __HAVE_ARCH_PMDP_SET_WRPROTECT
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pmd_t *pmdp)
{
clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
}
#define pud_write pud_write
static inline int pud_write(pud_t pud)
{
return pud_flags(pud) & _PAGE_RW;
}
#ifndef pmdp_establish
#define pmdp_establish pmdp_establish
static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp, pmd_t pmd)
{
if (IS_ENABLED(CONFIG_SMP)) {
return xchg(pmdp, pmd);
} else {
pmd_t old = *pmdp;
WRITE_ONCE(*pmdp, pmd);
return old;
}
}
#endif
/*
* Page table pages are page-aligned. The lower half of the top
* level is used for userspace and the top half for the kernel.
*
* Returns true for parts of the PGD that map userspace and
* false for the parts that map the kernel.
*/
static inline bool pgdp_maps_userspace(void *__ptr)
{
unsigned long ptr = (unsigned long)__ptr;
return (((ptr & ~PAGE_MASK) / sizeof(pgd_t)) < PGD_KERNEL_START);
}
static inline int pgd_large(pgd_t pgd) { return 0; }
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
* All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages
* (8k-aligned and 8k in size). The kernel one is at the beginning 4k and
* the user one is in the last 4k. To switch between them, you
* just need to flip the 12th bit in their addresses.
*/
#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT
/*
* This generates better code than the inline assembly in
* __set_bit().
*/
static inline void *ptr_set_bit(void *ptr, int bit)
{
unsigned long __ptr = (unsigned long)ptr;
__ptr |= BIT(bit);
return (void *)__ptr;
}
static inline void *ptr_clear_bit(void *ptr, int bit)
{
unsigned long __ptr = (unsigned long)ptr;
__ptr &= ~BIT(bit);
return (void *)__ptr;
}
static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
{
return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
}
static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
{
return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
}
static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp)
{
return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
}
static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
{
return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
}
#endif /* CONFIG_PAGE_TABLE_ISOLATION */
/*
* clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
*
* dst - pointer to pgd range anwhere on a pgd page
* src - ""
* count - the number of pgds to copy.
*
* dst and src can be on the same page, but the range must not overlap,
* and must not cross a page boundary.
*/
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
memcpy(dst, src, count * sizeof(pgd_t));
#ifdef CONFIG_PAGE_TABLE_ISOLATION
if (!static_cpu_has(X86_FEATURE_PTI))
return;
/* Clone the user space pgd as well */
memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src),
count * sizeof(pgd_t));
#endif
}
#define PTE_SHIFT ilog2(PTRS_PER_PTE)
static inline int page_level_shift(enum pg_level level)
{
return (PAGE_SHIFT - PTE_SHIFT) + level * PTE_SHIFT;
}
static inline unsigned long page_level_size(enum pg_level level)
{
return 1UL << page_level_shift(level);
}
static inline unsigned long page_level_mask(enum pg_level level)
{
return ~(page_level_size(level) - 1);
}
/*
* The x86 doesn't have any external MMU info: the kernel page
* tables contain all the necessary information.
*/
static inline void update_mmu_cache(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
}
static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmd)
{
}
static inline void update_mmu_cache_pud(struct vm_area_struct *vma,
unsigned long addr, pud_t *pud)
{
}
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
{
return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY);
}
static inline int pte_swp_soft_dirty(pte_t pte)
{
return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY;
}
static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
{
return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
}
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
{
return pmd_set_flags(pmd, _PAGE_SWP_SOFT_DIRTY);
}
static inline int pmd_swp_soft_dirty(pmd_t pmd)
{
return pmd_flags(pmd) & _PAGE_SWP_SOFT_DIRTY; /*covered*/
}
static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
{
return pmd_clear_flags(pmd, _PAGE_SWP_SOFT_DIRTY);
}
#endif
#endif
#define PKRU_AD_BIT 0x1
#define PKRU_WD_BIT 0x2
#define PKRU_BITS_PER_PKEY 2
static inline bool __pkru_allows_read(u32 pkru, u16 pkey)
{
int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY;
return !(pkru & (PKRU_AD_BIT << pkru_pkey_bits));
}
static inline bool __pkru_allows_write(u32 pkru, u16 pkey)
{
int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY;
/*
* Access-disable disables writes too so we need to check
* both bits here.
*/
return !(pkru & ((PKRU_AD_BIT|PKRU_WD_BIT) << pkru_pkey_bits));
}
static inline u16 pte_flags_pkey(unsigned long pte_flags)
{
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
/* ifdef to avoid doing 59-bit shift on 32-bit values */
return (pte_flags & _PAGE_PKEY_MASK) >> _PAGE_BIT_PKEY_BIT0;
#else
return 0;
#endif
}
static inline bool __pkru_allows_pkey(u16 pkey, bool write)
{
u32 pkru = read_pkru(); /*covered*/
if (!__pkru_allows_read(pkru, pkey))
return false;
if (write && !__pkru_allows_write(pkru, pkey)) /*covered*/
return false;
return true;
}
/*
* 'pteval' can come from a PTE, PMD or PUD. We only check
* _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
* same value on all 3 types.
*/
static inline bool __pte_access_permitted(unsigned long pteval, bool write) /*covered*/
{
unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
if (write)
need_pte_bits |= _PAGE_RW;
if ((pteval & need_pte_bits) != need_pte_bits)
return 0;
return __pkru_allows_pkey(pte_flags_pkey(pteval), write); /*covered*/
}
#define pte_access_permitted pte_access_permitted
static inline bool pte_access_permitted(pte_t pte, bool write)
{
return __pte_access_permitted(pte_val(pte), write); /*covered*/
}
#define pmd_access_permitted pmd_access_permitted
static inline bool pmd_access_permitted(pmd_t pmd, bool write)
{
return __pte_access_permitted(pmd_val(pmd), write); /*covered*/
}
#define pud_access_permitted pud_access_permitted
static inline bool pud_access_permitted(pud_t pud, bool write)
{
return __pte_access_permitted(pud_val(pud), write);
}
#define __HAVE_ARCH_PFN_MODIFY_ALLOWED 1
extern bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot);
static inline bool arch_has_pfn_modify_check(void)
{
return boot_cpu_has_bug(X86_BUG_L1TF);
}
#include <asm-generic/pgtable.h>
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGTABLE_64_H
#define _ASM_X86_PGTABLE_64_H
#include <linux/const.h>
#include <asm/pgtable_64_types.h>
#ifndef __ASSEMBLY__
/*
* This file contains the functions and defines necessary to modify and use
* the x86-64 page table tree.
*/
#include <asm/processor.h>
#include <linux/bitops.h>
#include <linux/threads.h>
#include <asm/fixmap.h>
extern p4d_t level4_kernel_pgt[512];
extern p4d_t level4_ident_pgt[512];
extern pud_t level3_kernel_pgt[512];
extern pud_t level3_ident_pgt[512];
extern pmd_t level2_kernel_pgt[512];
extern pmd_t level2_fixmap_pgt[512];
extern pmd_t level2_ident_pgt[512];
extern pte_t level1_fixmap_pgt[512 * FIXMAP_PMD_NUM];
extern pgd_t init_top_pgt[];
#define swapper_pg_dir init_top_pgt
extern void paging_init(void);
static inline void sync_initial_page_table(void) { }
#define pte_ERROR(e) \
pr_err("%s:%d: bad pte %p(%016lx)\n", \
__FILE__, __LINE__, &(e), pte_val(e))
#define pmd_ERROR(e) \
pr_err("%s:%d: bad pmd %p(%016lx)\n", \
__FILE__, __LINE__, &(e), pmd_val(e))
#define pud_ERROR(e) \
pr_err("%s:%d: bad pud %p(%016lx)\n", \
__FILE__, __LINE__, &(e), pud_val(e))
#if CONFIG_PGTABLE_LEVELS >= 5
#define p4d_ERROR(e) \
pr_err("%s:%d: bad p4d %p(%016lx)\n", \
__FILE__, __LINE__, &(e), p4d_val(e))
#endif
#define pgd_ERROR(e) \
pr_err("%s:%d: bad pgd %p(%016lx)\n", \
__FILE__, __LINE__, &(e), pgd_val(e))
struct mm_struct;
void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte);
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
static inline void native_set_pte(pte_t *ptep, pte_t pte)
{
WRITE_ONCE(*ptep, pte);
}
static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
native_set_pte(ptep, native_make_pte(0));
}
static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
{
native_set_pte(ptep, pte);
}
static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
{
WRITE_ONCE(*pmdp, pmd);
}
static inline void native_pmd_clear(pmd_t *pmd)
{
native_set_pmd(pmd, native_make_pmd(0));
}
static inline pte_t native_ptep_get_and_clear(pte_t *xp)
{
#ifdef CONFIG_SMP
return native_make_pte(xchg(&xp->pte, 0)); /*covered*/
#else
/* native_local_ptep_get_and_clear,
but duplicated because of cyclic dependency */
pte_t ret = *xp;
native_pte_clear(NULL, 0, xp);
return ret;
#endif
}
static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
{
#ifdef CONFIG_SMP
return native_make_pmd(xchg(&xp->pmd, 0)); /*covered*/
#else
/* native_local_pmdp_get_and_clear,
but duplicated because of cyclic dependency */
pmd_t ret = *xp;
native_pmd_clear(xp);
return ret;
#endif
}
static inline void native_set_pud(pud_t *pudp, pud_t pud)
{
WRITE_ONCE(*pudp, pud);
}
static inline void native_pud_clear(pud_t *pud)
{
native_set_pud(pud, native_make_pud(0));
}
static inline pud_t native_pudp_get_and_clear(pud_t *xp)
{
#ifdef CONFIG_SMP
return native_make_pud(xchg(&xp->pud, 0));
#else
/* native_local_pudp_get_and_clear,
* but duplicated because of cyclic dependency
*/
pud_t ret = *xp;
native_pud_clear(xp);
return ret;
#endif
}
static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
{
pgd_t pgd;
if (pgtable_l5_enabled() || !IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) {
WRITE_ONCE(*p4dp, p4d);
return;
}
pgd = native_make_pgd(native_p4d_val(p4d));
pgd = pti_set_user_pgtbl((pgd_t *)p4dp, pgd);
WRITE_ONCE(*p4dp, native_make_p4d(native_pgd_val(pgd)));
}
static inline void native_p4d_clear(p4d_t *p4d)
{
native_set_p4d(p4d, native_make_p4d(0));
}
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
WRITE_ONCE(*pgdp, pti_set_user_pgtbl(pgdp, pgd));
}
static inline void native_pgd_clear(pgd_t *pgd)
{
native_set_pgd(pgd, native_make_pgd(0));
}
extern void sync_global_pgds(unsigned long start, unsigned long end);
/*
* Conversion functions: convert a page and protection to a page entry,
* and a page entry and page directory to the page they refer to.
*/
/*
* Level 4 access.
*/
#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
/* PUD - Level3 access */
/* PMD - Level 2 access */
/* PTE - Level 1 access. */
/* x86-64 always has all page tables mapped. */
#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
#define pte_unmap(pte) ((void)(pte))/* NOP */
/*
* Encode and de-code a swap entry
*
* | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number
* | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
* | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| X|X|SD|0| <- swp entry
*
* G (8) is aliased and used as a PROT_NONE indicator for
* !present ptes. We need to start storing swap entries above
* there. We also need to avoid using A and D because of an
* erratum where they can be incorrectly set by hardware on
* non-present PTEs.
*
* SD (1) in swp entry is used to store soft dirty bit, which helps us
* remember soft dirty over page migration
*
* Bit 7 in swp entry should be 0 because pmd_present checks not only P,
* but also L and G.
*
* The offset is inverted by a binary not operation to make the high
* physical bits set.
*/
#define SWP_TYPE_BITS 5
#define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
/* We always extract/encode the offset by shifting it all the way up, and then down again */
#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT+SWP_TYPE_BITS)
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
/* Extract the high bits for type */
#define __swp_type(x) ((x).val >> (64 - SWP_TYPE_BITS))
/* Shift up (to get rid of type), then down to get value */
#define __swp_offset(x) (~(x).val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT)
/*
* Shift the offset up "too far" by TYPE bits, then down again
* The offset is inverted by a binary not operation to make the high
* physical bits set.
*/
#define __swp_entry(type, offset) ((swp_entry_t) { \
(~(unsigned long)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \
| ((unsigned long)(type) << (64-SWP_TYPE_BITS)) })
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) })
#define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val((pmd)) })
#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
#define __swp_entry_to_pmd(x) ((pmd_t) { .pmd = (x).val })
extern int kern_addr_valid(unsigned long addr);
extern void cleanup_highmap(void);
#define HAVE_ARCH_UNMAPPED_AREA
#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
#define pgtable_cache_init() do { } while (0)
#define check_pgt_cache() do { } while (0)
#define PAGE_AGP PAGE_KERNEL_NOCACHE
#define HAVE_PAGE_AGP 1
/* fs/proc/kcore.c */
#define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK)
#define __HAVE_ARCH_PTE_SAME
#define vmemmap ((struct page *)VMEMMAP_START)
extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
#define gup_fast_permitted gup_fast_permitted
static inline bool gup_fast_permitted(unsigned long start, int nr_pages,
int write)
{
unsigned long len, end;
len = (unsigned long)nr_pages << PAGE_SHIFT;
end = start + len;
if (end < start) /*covered*/
return false;
if (end >> __VIRTUAL_MASK_SHIFT) /*covered*/
return false;
return true;
}
#include <asm/pgtable-invert.h>
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_64_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGTABLE_DEFS_H
#define _ASM_X86_PGTABLE_DEFS_H
#include <linux/const.h>
#include <linux/mem_encrypt.h>
#include <asm/page_types.h>
#define FIRST_USER_ADDRESS 0UL
#define _PAGE_BIT_PRESENT 0 /* is present */
#define _PAGE_BIT_RW 1 /* writeable */
#define _PAGE_BIT_USER 2 /* userspace addressable */
#define _PAGE_BIT_PWT 3 /* page write through */
#define _PAGE_BIT_PCD 4 /* page cache disabled */
#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */
#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */
#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
#define _PAGE_BIT_PAT 7 /* on 4KB pages */
#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
#define _PAGE_BIT_SOFTW1 9 /* available for programmer */
#define _PAGE_BIT_SOFTW2 10 /* " */
#define _PAGE_BIT_SOFTW3 11 /* " */
#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
#define _PAGE_BIT_SOFTW4 58 /* available for programmer */
#define _PAGE_BIT_PKEY_BIT0 59 /* Protection Keys, bit 1/4 */
#define _PAGE_BIT_PKEY_BIT1 60 /* Protection Keys, bit 2/4 */
#define _PAGE_BIT_PKEY_BIT2 61 /* Protection Keys, bit 3/4 */
#define _PAGE_BIT_PKEY_BIT3 62 /* Protection Keys, bit 4/4 */
#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1
#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
/* If _PAGE_BIT_PRESENT is clear, we use these: */
/* - if the user mapped it with PROT_NONE; pte_present gives true */
#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER)
#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
#define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
#define _PAGE_SOFTW3 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW3)
#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
#define _PAGE_PKEY_BIT0 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0)
#define _PAGE_PKEY_BIT1 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1)
#define _PAGE_PKEY_BIT2 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT2)
#define _PAGE_PKEY_BIT3 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT3)
#else
#define _PAGE_PKEY_BIT0 (_AT(pteval_t, 0))
#define _PAGE_PKEY_BIT1 (_AT(pteval_t, 0))
#define _PAGE_PKEY_BIT2 (_AT(pteval_t, 0))
#define _PAGE_PKEY_BIT3 (_AT(pteval_t, 0))
#endif
#define _PAGE_PKEY_MASK (_PAGE_PKEY_BIT0 | \
_PAGE_PKEY_BIT1 | \
_PAGE_PKEY_BIT2 | \
_PAGE_PKEY_BIT3)
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
#define _PAGE_KNL_ERRATUM_MASK (_PAGE_DIRTY | _PAGE_ACCESSED)
#else
#define _PAGE_KNL_ERRATUM_MASK 0
#endif
#ifdef CONFIG_MEM_SOFT_DIRTY
#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY)
#else
#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0))
#endif
/*
* Tracking soft dirty bit when a page goes to a swap is tricky.
* We need a bit which can be stored in pte _and_ not conflict
* with swap entry format. On x86 bits 1-4 are *not* involved
* into swap entry computation, but bit 7 is used for thp migration,
* so we borrow bit 1 for soft dirty tracking.
*
* Please note that this bit must be treated as swap dirty page
* mark if and only if the PTE/PMD has present bit clear!
*/
#ifdef CONFIG_MEM_SOFT_DIRTY
#define _PAGE_SWP_SOFT_DIRTY _PAGE_RW
#else
#define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0))
#endif
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
#define _PAGE_DEVMAP (_AT(u64, 1) << _PAGE_BIT_DEVMAP)
#define __HAVE_ARCH_PTE_DEVMAP
#else
#define _PAGE_NX (_AT(pteval_t, 0))
#define _PAGE_DEVMAP (_AT(pteval_t, 0))
#endif
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
#define _PAGE_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |\
_PAGE_ACCESSED | _PAGE_DIRTY)
#define _KERNPG_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | \
_PAGE_ACCESSED | _PAGE_DIRTY)
/*
* Set of bits not changed in pte_modify. The pte's
* protection key is treated like _PAGE_RW, for
* instance, and is *not* included in this mask since
* pte_modify() does modify it.
*/
#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
_PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \
_PAGE_SOFT_DIRTY | _PAGE_DEVMAP)
#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
/*
* The cache modes defined here are used to translate between pure SW usage
* and the HW defined cache mode bits and/or PAT entries.
*
* The resulting bits for PWT, PCD and PAT should be chosen in a way
* to have the WB mode at index 0 (all bits clear). This is the default
* right now and likely would break too much if changed.
*/
#ifndef __ASSEMBLY__
enum page_cache_mode {
_PAGE_CACHE_MODE_WB = 0,
_PAGE_CACHE_MODE_WC = 1,
_PAGE_CACHE_MODE_UC_MINUS = 2,
_PAGE_CACHE_MODE_UC = 3,
_PAGE_CACHE_MODE_WT = 4,
_PAGE_CACHE_MODE_WP = 5,
_PAGE_CACHE_MODE_NUM = 8
};
#endif
#define _PAGE_CACHE_MASK (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)
#define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC))
#define _PAGE_CACHE_WP (cachemode2protval(_PAGE_CACHE_MODE_WP))
#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
_PAGE_ACCESSED | _PAGE_NX)
#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \
_PAGE_USER | _PAGE_ACCESSED)
#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
_PAGE_ACCESSED | _PAGE_NX)
#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
_PAGE_ACCESSED)
#define PAGE_COPY PAGE_COPY_NOEXEC
#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \
_PAGE_ACCESSED | _PAGE_NX)
#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
_PAGE_ACCESSED)
#define __PAGE_KERNEL_EXEC \
(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_GLOBAL)
#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_NOCACHE)
#define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER)
#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
#define __PAGE_KERNEL_WP (__PAGE_KERNEL | _PAGE_CACHE_WP)
#define __PAGE_KERNEL_IO (__PAGE_KERNEL)
#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE)
#ifndef __ASSEMBLY__
#define _PAGE_ENC (_AT(pteval_t, sme_me_mask))
#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
_PAGE_DIRTY | _PAGE_ENC)
#define _PAGE_TABLE (_KERNPG_TABLE | _PAGE_USER)
#define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _PAGE_ENC)
#define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _PAGE_ENC)
#define __PAGE_KERNEL_NOENC (__PAGE_KERNEL)
#define __PAGE_KERNEL_NOENC_WP (__PAGE_KERNEL_WP)
#define default_pgprot(x) __pgprot((x) & __default_kernel_pte_mask)
#define PAGE_KERNEL default_pgprot(__PAGE_KERNEL | _PAGE_ENC)
#define PAGE_KERNEL_NOENC default_pgprot(__PAGE_KERNEL)
#define PAGE_KERNEL_RO default_pgprot(__PAGE_KERNEL_RO | _PAGE_ENC)
#define PAGE_KERNEL_EXEC default_pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC)
#define PAGE_KERNEL_EXEC_NOENC default_pgprot(__PAGE_KERNEL_EXEC)
#define PAGE_KERNEL_RX default_pgprot(__PAGE_KERNEL_RX | _PAGE_ENC)
#define PAGE_KERNEL_NOCACHE default_pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC)
#define PAGE_KERNEL_LARGE default_pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC)
#define PAGE_KERNEL_LARGE_EXEC default_pgprot(__PAGE_KERNEL_LARGE_EXEC | _PAGE_ENC)
#define PAGE_KERNEL_VVAR default_pgprot(__PAGE_KERNEL_VVAR | _PAGE_ENC)
#define PAGE_KERNEL_IO default_pgprot(__PAGE_KERNEL_IO)
#define PAGE_KERNEL_IO_NOCACHE default_pgprot(__PAGE_KERNEL_IO_NOCACHE)
#endif /* __ASSEMBLY__ */
/* xwr */
#define __P000 PAGE_NONE
#define __P001 PAGE_READONLY
#define __P010 PAGE_COPY
#define __P011 PAGE_COPY
#define __P100 PAGE_READONLY_EXEC
#define __P101 PAGE_READONLY_EXEC
#define __P110 PAGE_COPY_EXEC
#define __P111 PAGE_COPY_EXEC
#define __S000 PAGE_NONE
#define __S001 PAGE_READONLY
#define __S010 PAGE_SHARED
#define __S011 PAGE_SHARED
#define __S100 PAGE_READONLY_EXEC
#define __S101 PAGE_READONLY_EXEC
#define __S110 PAGE_SHARED_EXEC
#define __S111 PAGE_SHARED_EXEC
/*
* early identity mapping pte attrib macros.
*/
#ifdef CONFIG_X86_64
#define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
#else
#define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */
#define PDE_IDENT_ATTR 0x063 /* PRESENT+RW+DIRTY+ACCESSED */
#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */
#endif
#ifdef CONFIG_X86_32
# include <asm/pgtable_32_types.h>
#else
# include <asm/pgtable_64_types.h>
#endif
#ifndef __ASSEMBLY__
#include <linux/types.h>
/* Extracts the PFN from a (pte|pmd|pud|pgd)val_t of a 4KB page */
#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
/*
* Extracts the flags from a (pte|pmd|pud|pgd)val_t
* This includes the protection key value.
*/
#define PTE_FLAGS_MASK (~PTE_PFN_MASK)
typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;
typedef struct { pgdval_t pgd; } pgd_t;
#ifdef CONFIG_X86_PAE
/*
* PHYSICAL_PAGE_MASK might be non-constant when SME is compiled in, so we can't
* use it here.
*/
#define PGD_PAE_PAGE_MASK ((signed long)PAGE_MASK)
#define PGD_PAE_PHYS_MASK (((1ULL << __PHYSICAL_MASK_SHIFT)-1) & PGD_PAE_PAGE_MASK)
/*
* PAE allows Base Address, P, PWT, PCD and AVL bits to be set in PGD entries.
* All other bits are Reserved MBZ
*/
#define PGD_ALLOWED_BITS (PGD_PAE_PHYS_MASK | _PAGE_PRESENT | \
_PAGE_PWT | _PAGE_PCD | \
_PAGE_SOFTW1 | _PAGE_SOFTW2 | _PAGE_SOFTW3)
#else
/* No need to mask any bits for !PAE */
#define PGD_ALLOWED_BITS (~0ULL)
#endif
static inline pgd_t native_make_pgd(pgdval_t val)
{
return (pgd_t) { val & PGD_ALLOWED_BITS };
}
static inline pgdval_t native_pgd_val(pgd_t pgd)
{
return pgd.pgd & PGD_ALLOWED_BITS;
}
static inline pgdval_t pgd_flags(pgd_t pgd)
{
return native_pgd_val(pgd) & PTE_FLAGS_MASK;
}
#if CONFIG_PGTABLE_LEVELS > 4
typedef struct { p4dval_t p4d; } p4d_t;
static inline p4d_t native_make_p4d(pudval_t val)
{
return (p4d_t) { val };
}
static inline p4dval_t native_p4d_val(p4d_t p4d)
{
return p4d.p4d;
}
#else
#include <asm-generic/pgtable-nop4d.h>
static inline p4d_t native_make_p4d(pudval_t val)
{
return (p4d_t) { .pgd = native_make_pgd((pgdval_t)val) };
}
static inline p4dval_t native_p4d_val(p4d_t p4d)
{
return native_pgd_val(p4d.pgd);
}
#endif
#if CONFIG_PGTABLE_LEVELS > 3
typedef struct { pudval_t pud; } pud_t;
static inline pud_t native_make_pud(pmdval_t val)
{
return (pud_t) { val };
}
static inline pudval_t native_pud_val(pud_t pud)
{
return pud.pud;
}
#else
#include <asm-generic/pgtable-nopud.h>
static inline pud_t native_make_pud(pudval_t val)
{
return (pud_t) { .p4d.pgd = native_make_pgd(val) };
}
static inline pudval_t native_pud_val(pud_t pud)
{
return native_pgd_val(pud.p4d.pgd);
}
#endif
#if CONFIG_PGTABLE_LEVELS > 2
typedef struct { pmdval_t pmd; } pmd_t;
static inline pmd_t native_make_pmd(pmdval_t val)
{
return (pmd_t) { val };
}
static inline pmdval_t native_pmd_val(pmd_t pmd)
{
return pmd.pmd;
}
#else
#include <asm-generic/pgtable-nopmd.h>
static inline pmd_t native_make_pmd(pmdval_t val)
{
return (pmd_t) { .pud.p4d.pgd = native_make_pgd(val) };
}
static inline pmdval_t native_pmd_val(pmd_t pmd)
{
return native_pgd_val(pmd.pud.p4d.pgd);
}
#endif
static inline p4dval_t p4d_pfn_mask(p4d_t p4d)
{
/* No 512 GiB huge pages yet */
return PTE_PFN_MASK;
}
static inline p4dval_t p4d_flags_mask(p4d_t p4d)
{
return ~p4d_pfn_mask(p4d);
}
static inline p4dval_t p4d_flags(p4d_t p4d)
{
return native_p4d_val(p4d) & p4d_flags_mask(p4d);
}
static inline pudval_t pud_pfn_mask(pud_t pud)
{
if (native_pud_val(pud) & _PAGE_PSE) /*covered*/
return PHYSICAL_PUD_PAGE_MASK;
else
return PTE_PFN_MASK;
}
static inline pudval_t pud_flags_mask(pud_t pud)
{
return ~pud_pfn_mask(pud); /*covered*/
}
static inline pudval_t pud_flags(pud_t pud)
{
return native_pud_val(pud) & pud_flags_mask(pud); /*covered*/
}
static inline pmdval_t pmd_pfn_mask(pmd_t pmd)
{
if (native_pmd_val(pmd) & _PAGE_PSE) /*covered*/
return PHYSICAL_PMD_PAGE_MASK;
else
return PTE_PFN_MASK;
}
static inline pmdval_t pmd_flags_mask(pmd_t pmd)
{
return ~pmd_pfn_mask(pmd); /*covered*/
}
static inline pmdval_t pmd_flags(pmd_t pmd)
{
return native_pmd_val(pmd) & pmd_flags_mask(pmd); /*covered*/
}
static inline pte_t native_make_pte(pteval_t val)
{
return (pte_t) { .pte = val };
}
static inline pteval_t native_pte_val(pte_t pte)
{
return pte.pte;
}
static inline pteval_t pte_flags(pte_t pte)
{
return native_pte_val(pte) & PTE_FLAGS_MASK; /*covered*/
}
#define pgprot_val(x) ((x).pgprot)
#define __pgprot(x) ((pgprot_t) { (x) } )
extern uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM];
extern uint8_t __pte2cachemode_tbl[8];
#define __pte2cm_idx(cb) \
((((cb) >> (_PAGE_BIT_PAT - 2)) & 4) | \
(((cb) >> (_PAGE_BIT_PCD - 1)) & 2) | \
(((cb) >> _PAGE_BIT_PWT) & 1))
#define __cm_idx2pte(i) \
((((i) & 4) << (_PAGE_BIT_PAT - 2)) | \
(((i) & 2) << (_PAGE_BIT_PCD - 1)) | \
(((i) & 1) << _PAGE_BIT_PWT))
static inline unsigned long cachemode2protval(enum page_cache_mode pcm)
{
if (likely(pcm == 0))
return 0;
return __cachemode2pte_tbl[pcm];
}
static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm)
{
return __pgprot(cachemode2protval(pcm));
}
static inline enum page_cache_mode pgprot2cachemode(pgprot_t pgprot)
{
unsigned long masked;
masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK; /*covered*/
if (likely(masked == 0))
return 0;
return __pte2cachemode_tbl[__pte2cm_idx(masked)];
}
static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot)
{
pgprotval_t val = pgprot_val(pgprot);
pgprot_t new;
pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) |
((val & _PAGE_PAT) << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
return new;
}
static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot)
{
pgprotval_t val = pgprot_val(pgprot);
pgprot_t new;
pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | /*covered*/
((val & _PAGE_PAT_LARGE) >>
(_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
return new;
}
typedef struct page *pgtable_t;
extern pteval_t __supported_pte_mask;
extern pteval_t __default_kernel_pte_mask;
extern void set_nx(void);
extern int nx_enabled;
#define pgprot_writecombine pgprot_writecombine
extern pgprot_t pgprot_writecombine(pgprot_t prot);
#define pgprot_writethrough pgprot_writethrough
extern pgprot_t pgprot_writethrough(pgprot_t prot);
/* Indicate that x86 has its own track and untrack pfn vma functions */
#define __HAVE_PFNMAP_TRACKING
#define __HAVE_PHYS_MEM_ACCESS_PROT
struct file;
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t vma_prot);
/* Install a pte for a particular vaddr in kernel space. */
void set_pte_vaddr(unsigned long vaddr, pte_t pte);
#ifdef CONFIG_X86_32
extern void native_pagetable_init(void);
#else
#define native_pagetable_init paging_init
#endif
struct seq_file;
extern void arch_report_meminfo(struct seq_file *m);
enum pg_level {
PG_LEVEL_NONE,
PG_LEVEL_4K,
PG_LEVEL_2M,
PG_LEVEL_1G,
PG_LEVEL_512G,
PG_LEVEL_NUM
};
#ifdef CONFIG_PROC_FS
extern void update_page_count(int level, unsigned long pages);
#else
static inline void update_page_count(int level, unsigned long pages) { }
#endif
/*
* Helper function that returns the kernel pagetable entry controlling
* the virtual address 'address'. NULL means no pagetable entry present.
* NOTE: the return type is pte_t but if the pmd is PSE then we return it
* as a pte too.
*/
extern pte_t *lookup_address(unsigned long address, unsigned int *level);
extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
unsigned int *level);
extern pmd_t *lookup_pmd_address(unsigned long address);
extern phys_addr_t slow_virt_to_phys(void *__address);
extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn,
unsigned long address,
unsigned numpages,
unsigned long page_flags);
extern int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
unsigned long numpages);
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_DEFS_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_PREEMPT_H
#define __ASM_PREEMPT_H
#include <asm/rmwcc.h>
#include <asm/percpu.h>
#include <linux/thread_info.h>
DECLARE_PER_CPU(int, __preempt_count);
/* We use the MSB mostly because its available */
#define PREEMPT_NEED_RESCHED 0x80000000
/*
* We use the PREEMPT_NEED_RESCHED bit as an inverted NEED_RESCHED such
* that a decrement hitting 0 means we can and should reschedule.
*/
#define PREEMPT_ENABLED (0 + PREEMPT_NEED_RESCHED)
/*
* We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users
* that think a non-zero value indicates we cannot preempt.
*/
static __always_inline int preempt_count(void)
{
return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED; /*covered*/
}
static __always_inline void preempt_count_set(int pc)
{
int old, new;
do {
old = raw_cpu_read_4(__preempt_count);
new = (old & PREEMPT_NEED_RESCHED) |
(pc & ~PREEMPT_NEED_RESCHED);
} while (raw_cpu_cmpxchg_4(__preempt_count, old, new) != old);
}
/*
* must be macros to avoid header recursion hell
*/
#define init_task_preempt_count(p) do { } while (0)
#define init_idle_preempt_count(p, cpu) do { \
per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \
} while (0)
/*
* We fold the NEED_RESCHED bit into the preempt count such that
* preempt_enable() can decrement and test for needing to reschedule with a
* single instruction.
*
* We invert the actual bit, so that when the decrement hits 0 we know we both
* need to resched (the bit is cleared) and can resched (no preempt count).
*/
static __always_inline void set_preempt_need_resched(void)
{
raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED);
}
static __always_inline void clear_preempt_need_resched(void)
{
raw_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED);
}
static __always_inline bool test_preempt_need_resched(void)
{
return !(raw_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED); /*covered*/
}
/*
* The various preempt_count add/sub methods
*/
static __always_inline void __preempt_count_add(int val)
{
raw_cpu_add_4(__preempt_count, val); /*covered*/
}
static __always_inline void __preempt_count_sub(int val)
{
raw_cpu_add_4(__preempt_count, -val);
}
/*
* Because we keep PREEMPT_NEED_RESCHED set when we do _not_ need to reschedule
* a decrement which hits zero means we have no preempt_count and should
* reschedule.
*/
static __always_inline bool __preempt_count_dec_and_test(void)
{
return GEN_UNARY_RMWcc("decl", __preempt_count, e, __percpu_arg([var]));
}
/*
* Returns true when we need to resched and can (barring IRQ state).
*/
static __always_inline bool should_resched(int preempt_offset)
{
return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
}
#ifdef CONFIG_PREEMPT
extern asmlinkage void ___preempt_schedule(void);
# define __preempt_schedule() \
asm volatile ("call ___preempt_schedule" : ASM_CALL_CONSTRAINT)
extern asmlinkage void preempt_schedule(void);
extern asmlinkage void ___preempt_schedule_notrace(void);
# define __preempt_schedule_notrace() \
asm volatile ("call ___preempt_schedule_notrace" : ASM_CALL_CONSTRAINT)
extern asmlinkage void preempt_schedule_notrace(void);
#endif
#endif /* __ASM_PREEMPT_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PROCESSOR_H
#define _ASM_X86_PROCESSOR_H
#include <asm/processor-flags.h>
/* Forward declaration, a strange C thing */
struct task_struct;
struct mm_struct;
struct vm86;
#include <asm/math_emu.h>
#include <asm/segment.h>
#include <asm/types.h>
#include <uapi/asm/sigcontext.h>
#include <asm/current.h>
#include <asm/cpufeatures.h>
#include <asm/page.h>
#include <asm/pgtable_types.h>
#include <asm/percpu.h>
#include <asm/msr.h>
#include <asm/desc_defs.h>
#include <asm/nops.h>
#include <asm/special_insns.h>
#include <asm/fpu/types.h>
#include <asm/unwind_hints.h>
#include <linux/personality.h>
#include <linux/cache.h>
#include <linux/threads.h>
#include <linux/math64.h>
#include <linux/err.h>
#include <linux/irqflags.h>
#include <linux/mem_encrypt.h>
/*
* We handle most unaligned accesses in hardware. On the other hand
* unaligned DMA can be quite expensive on some Nehalem processors.
*
* Based on this we disable the IP header alignment in network drivers.
*/
#define NET_IP_ALIGN 0
#define HBP_NUM 4
/*
* These alignment constraints are for performance in the vSMP case,
* but in the task_struct case we must also meet hardware imposed
* alignment requirements of the FPU state:
*/
#ifdef CONFIG_X86_VSMP
# define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
# define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
#else
# define ARCH_MIN_TASKALIGN __alignof__(union fpregs_state)
# define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif
enum tlb_infos {
ENTRIES,
NR_INFO
};
extern u16 __read_mostly tlb_lli_4k[NR_INFO];
extern u16 __read_mostly tlb_lli_2m[NR_INFO];
extern u16 __read_mostly tlb_lli_4m[NR_INFO];
extern u16 __read_mostly tlb_lld_4k[NR_INFO];
extern u16 __read_mostly tlb_lld_2m[NR_INFO];
extern u16 __read_mostly tlb_lld_4m[NR_INFO];
extern u16 __read_mostly tlb_lld_1g[NR_INFO];
/*
* CPU type and hardware bug flags. Kept separately for each CPU.
* Members of this structure are referenced in head_32.S, so think twice
* before touching them. [mj]
*/
struct cpuinfo_x86 {
__u8 x86; /* CPU family */
__u8 x86_vendor; /* CPU vendor */
__u8 x86_model;
__u8 x86_stepping;
#ifdef CONFIG_X86_64
/* Number of 4K pages in DTLB/ITLB combined(in pages): */
int x86_tlbsize;
#endif
__u8 x86_virt_bits;
__u8 x86_phys_bits;
/* CPUID returned core id bits: */
__u8 x86_coreid_bits;
__u8 cu_id;
/* Max extended CPUID function supported: */
__u32 extended_cpuid_level;
/* Maximum supported CPUID level, -1=no CPUID: */
int cpuid_level;
__u32 x86_capability[NCAPINTS + NBUGINTS];
char x86_vendor_id[16];
char x86_model_id[64];
/* in KB - valid for CPUS which support this call: */
unsigned int x86_cache_size;
int x86_cache_alignment; /* In bytes */
/* Cache QoS architectural values: */
int x86_cache_max_rmid; /* max index */
int x86_cache_occ_scale; /* scale to bytes */
int x86_power;
unsigned long loops_per_jiffy;
/* cpuid returned max cores value: */
u16 x86_max_cores;
u16 apicid;
u16 initial_apicid;
u16 x86_clflush_size;
/* number of cores as seen by the OS: */
u16 booted_cores;
/* Physical processor id: */
u16 phys_proc_id;
/* Logical processor id: */
u16 logical_proc_id;
/* Core id: */
u16 cpu_core_id;
/* Index into per_cpu list: */
u16 cpu_index;
u32 microcode;
/* Address space bits used by the cache internally */
u8 x86_cache_bits;
unsigned initialized : 1;
} __randomize_layout;
struct cpuid_regs {
u32 eax, ebx, ecx, edx;
};
enum cpuid_regs_idx {
CPUID_EAX = 0,
CPUID_EBX,
CPUID_ECX,
CPUID_EDX,
};
#define X86_VENDOR_INTEL 0
#define X86_VENDOR_CYRIX 1
#define X86_VENDOR_AMD 2
#define X86_VENDOR_UMC 3
#define X86_VENDOR_CENTAUR 5
#define X86_VENDOR_TRANSMETA 7
#define X86_VENDOR_NSC 8
#define X86_VENDOR_HYGON 9
#define X86_VENDOR_NUM 10
#define X86_VENDOR_UNKNOWN 0xff
/*
* capabilities of CPUs
*/
extern struct cpuinfo_x86 boot_cpu_data;
extern struct cpuinfo_x86 new_cpu_data;
extern struct x86_hw_tss doublefault_tss;
extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS];
#ifdef CONFIG_SMP
DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
#define cpu_data(cpu) per_cpu(cpu_info, cpu)
#else
#define cpu_info boot_cpu_data
#define cpu_data(cpu) boot_cpu_data
#endif
extern const struct seq_operations cpuinfo_op;
#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
extern void cpu_detect(struct cpuinfo_x86 *c);
static inline unsigned long long l1tf_pfn_limit(void)
{
return BIT_ULL(boot_cpu_data.x86_cache_bits - 1 - PAGE_SHIFT);
}
extern void early_cpu_init(void);
extern void identify_boot_cpu(void);
extern void identify_secondary_cpu(struct cpuinfo_x86 *);
extern void print_cpu_info(struct cpuinfo_x86 *);
void print_cpu_msr(struct cpuinfo_x86 *);
#ifdef CONFIG_X86_32
extern int have_cpuid_p(void);
#else
static inline int have_cpuid_p(void)
{
return 1;
}
#endif
static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
unsigned int *ecx, unsigned int *edx)
{
/* ecx is often an input as well as an output. */
asm volatile("cpuid"
: "=a" (*eax),
"=b" (*ebx),
"=c" (*ecx),
"=d" (*edx)
: "0" (*eax), "2" (*ecx)
: "memory");
}
#define native_cpuid_reg(reg) \
static inline unsigned int native_cpuid_##reg(unsigned int op) \
{ \
unsigned int eax = op, ebx, ecx = 0, edx; \
\
native_cpuid(&eax, &ebx, &ecx, &edx); \
\
return reg; \
}
/*
* Native CPUID functions returning a single datum.
*/
native_cpuid_reg(eax)
native_cpuid_reg(ebx)
native_cpuid_reg(ecx)
native_cpuid_reg(edx)
/*
* Friendlier CR3 helpers.
*/
static inline unsigned long read_cr3_pa(void)
{
return __read_cr3() & CR3_ADDR_MASK; /*covered*/
}
static inline unsigned long native_read_cr3_pa(void)
{
return __native_read_cr3() & CR3_ADDR_MASK;
}
static inline void load_cr3(pgd_t *pgdir)
{
write_cr3(__sme_pa(pgdir));
}
/*
* Note that while the legacy 'TSS' name comes from 'Task State Segment',
* on modern x86 CPUs the TSS also holds information important to 64-bit mode,
* unrelated to the task-switch mechanism:
*/
#ifdef CONFIG_X86_32
/* This is the TSS defined by the hardware. */
struct x86_hw_tss {
unsigned short back_link, __blh;
unsigned long sp0;
unsigned short ss0, __ss0h;
unsigned long sp1;
/*
* We don't use ring 1, so ss1 is a convenient scratch space in
* the same cacheline as sp0. We use ss1 to cache the value in
* MSR_IA32_SYSENTER_CS. When we context switch
* MSR_IA32_SYSENTER_CS, we first check if the new value being
* written matches ss1, and, if it's not, then we wrmsr the new
* value and update ss1.
*
* The only reason we context switch MSR_IA32_SYSENTER_CS is
* that we set it to zero in vm86 tasks to avoid corrupting the
* stack if we were to go through the sysenter path from vm86
* mode.
*/
unsigned short ss1; /* MSR_IA32_SYSENTER_CS */
unsigned short __ss1h;
unsigned long sp2;
unsigned short ss2, __ss2h;
unsigned long __cr3;
unsigned long ip;
unsigned long flags;
unsigned long ax;
unsigned long cx;
unsigned long dx;
unsigned long bx;
unsigned long sp;
unsigned long bp;
unsigned long si;
unsigned long di;
unsigned short es, __esh;
unsigned short cs, __csh;
unsigned short ss, __ssh;
unsigned short ds, __dsh;
unsigned short fs, __fsh;
unsigned short gs, __gsh;
unsigned short ldt, __ldth;
unsigned short trace;
unsigned short io_bitmap_base;
} __attribute__((packed));
#else
struct x86_hw_tss {
u32 reserved1;
u64 sp0;
/*
* We store cpu_current_top_of_stack in sp1 so it's always accessible.
* Linux does not use ring 1, so sp1 is not otherwise needed.
*/
u64 sp1;
/*
* Since Linux does not use ring 2, the 'sp2' slot is unused by
* hardware. entry_SYSCALL_64 uses it as scratch space to stash
* the user RSP value.
*/
u64 sp2;
u64 reserved2;
u64 ist[7];
u32 reserved3;
u32 reserved4;
u16 reserved5;
u16 io_bitmap_base;
} __attribute__((packed));
#endif
/*
* IO-bitmap sizes:
*/
#define IO_BITMAP_BITS 65536
#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
#define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss))
#define INVALID_IO_BITMAP_OFFSET 0x8000
struct entry_stack {
unsigned long words[64];
};
struct entry_stack_page {
struct entry_stack stack;
} __aligned(PAGE_SIZE);
struct tss_struct {
/*
* The fixed hardware portion. This must not cross a page boundary
* at risk of violating the SDM's advice and potentially triggering
* errata.
*/
struct x86_hw_tss x86_tss;
/*
* The extra 1 is there because the CPU will access an
* additional byte beyond the end of the IO permission
* bitmap. The extra byte must be all 1 bits, and must
* be within the limit.
*/
unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
} __aligned(PAGE_SIZE);
DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw);
/*
* sizeof(unsigned long) coming from an extra "long" at the end
* of the iobitmap.
*
* -1? seg base+limit should be pointing to the address of the
* last valid byte
*/
#define __KERNEL_TSS_LIMIT \
(IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1)
#ifdef CONFIG_X86_32
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
#else
/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
#endif
/*
* Save the original ist values for checking stack pointers during debugging
*/
struct orig_ist {
unsigned long ist[7];
};
#ifdef CONFIG_X86_64
DECLARE_PER_CPU(struct orig_ist, orig_ist);
union irq_stack_union {
char irq_stack[IRQ_STACK_SIZE];
/*
* GCC hardcodes the stack canary as %gs:40. Since the
* irq_stack is the object at %gs:0, we reserve the bottom
* 48 bytes of the irq stack for the canary.
*/
struct {
char gs_base[40];
unsigned long stack_canary;
};
};
DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible;
DECLARE_INIT_PER_CPU(irq_stack_union);
static inline unsigned long cpu_kernelmode_gs_base(int cpu)
{
return (unsigned long)per_cpu(irq_stack_union.gs_base, cpu);
}
DECLARE_PER_CPU(char *, irq_stack_ptr);
DECLARE_PER_CPU(unsigned int, irq_count);
extern asmlinkage void ignore_sysret(void);
#if IS_ENABLED(CONFIG_KVM)
/* Save actual FS/GS selectors and bases to current->thread */
void save_fsgs_for_kvm(void);
#endif
#else /* X86_64 */
#ifdef CONFIG_STACKPROTECTOR
/*
* Make sure stack canary segment base is cached-aligned:
* "For Intel Atom processors, avoid non zero segment base address
* that is not aligned to cache line boundary at all cost."
* (Optim Ref Manual Assembly/Compiler Coding Rule 15.)
*/
struct stack_canary {
char __pad[20]; /* canary at %gs:20 */
unsigned long canary;
};
DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
#endif
/*
* per-CPU IRQ handling stacks
*/
struct irq_stack {
u32 stack[THREAD_SIZE/sizeof(u32)];
} __aligned(THREAD_SIZE);
DECLARE_PER_CPU(struct irq_stack *, hardirq_stack);
DECLARE_PER_CPU(struct irq_stack *, softirq_stack);
#endif /* X86_64 */
extern unsigned int fpu_kernel_xstate_size;
extern unsigned int fpu_user_xstate_size;
struct perf_event;
typedef struct {
unsigned long seg;
} mm_segment_t;
struct thread_struct {
/* Cached TLS descriptors: */
struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
#ifdef CONFIG_X86_32
unsigned long sp0;
#endif
unsigned long sp;
#ifdef CONFIG_X86_32
unsigned long sysenter_cs;
#else
unsigned short es;
unsigned short ds;
unsigned short fsindex;
unsigned short gsindex;
#endif
#ifdef CONFIG_X86_64
unsigned long fsbase;
unsigned long gsbase;
#else
/*
* XXX: this could presumably be unsigned short. Alternatively,
* 32-bit kernels could be taught to use fsindex instead.
*/
unsigned long fs;
unsigned long gs;
#endif
/* Save middle states of ptrace breakpoints */
struct perf_event *ptrace_bps[HBP_NUM];
/* Debug status used for traps, single steps, etc... */
unsigned long debugreg6;
/* Keep track of the exact dr7 value set by the user */
unsigned long ptrace_dr7;
/* Fault info: */
unsigned long cr2;
unsigned long trap_nr;
unsigned long error_code;
#ifdef CONFIG_VM86
/* Virtual 86 mode info */
struct vm86 *vm86;
#endif
/* IO permissions: */
unsigned long *io_bitmap_ptr;
unsigned long iopl;
/* Max allowed port in the bitmap, in bytes: */
unsigned io_bitmap_max;
mm_segment_t addr_limit;
unsigned int sig_on_uaccess_err:1;
unsigned int uaccess_err:1; /* uaccess failed */
/* Floating point and extended processor state */
struct fpu fpu;
/*
* WARNING: 'fpu' is dynamically-sized. It *MUST* be at
* the end.
*/
};
/* Whitelist the FPU state from the task_struct for hardened usercopy. */
static inline void arch_thread_struct_whitelist(unsigned long *offset,
unsigned long *size)
{
*offset = offsetof(struct thread_struct, fpu.state);
*size = fpu_kernel_xstate_size;
}
/*
* Thread-synchronous status.
*
* This is different from the flags in that nobody else
* ever touches our thread-synchronous status, so we don't
* have to worry about atomic accesses.
*/
#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/
/*
* Set IOPL bits in EFLAGS from given mask
*/
static inline void native_set_iopl_mask(unsigned mask)
{
#ifdef CONFIG_X86_32
unsigned int reg;
asm volatile ("pushfl;"
"popl %0;"
"andl %1, %0;"
"orl %2, %0;"
"pushl %0;"
"popfl"
: "=&r" (reg)
: "i" (~X86_EFLAGS_IOPL), "r" (mask));
#endif
}
static inline void
native_load_sp0(unsigned long sp0)
{
this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
}
static inline void native_swapgs(void)
{
#ifdef CONFIG_X86_64
asm volatile("swapgs" ::: "memory");
#endif
}
static inline unsigned long current_top_of_stack(void)
{
/*
* We can't read directly from tss.sp0: sp0 on x86_32 is special in
* and around vm86 mode and sp0 on x86_64 is special because of the
* entry trampoline.
*/
return this_cpu_read_stable(cpu_current_top_of_stack);
}
static inline bool on_thread_stack(void)
{
return (unsigned long)(current_top_of_stack() -
current_stack_pointer) < THREAD_SIZE;
}
#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
#define __cpuid native_cpuid
static inline void load_sp0(unsigned long sp0)
{
native_load_sp0(sp0);
}
#define set_iopl_mask native_set_iopl_mask
#endif /* CONFIG_PARAVIRT_XXL */
/* Free all resources held by a thread. */
extern void release_thread(struct task_struct *);
unsigned long get_wchan(struct task_struct *p);
/*
* Generic CPUID function
* clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
* resulting in stale register contents being returned.
*/
static inline void cpuid(unsigned int op,
unsigned int *eax, unsigned int *ebx,
unsigned int *ecx, unsigned int *edx)
{
*eax = op;
*ecx = 0;
__cpuid(eax, ebx, ecx, edx);
}
/* Some CPUID calls want 'count' to be placed in ecx */
static inline void cpuid_count(unsigned int op, int count,
unsigned int *eax, unsigned int *ebx,
unsigned int *ecx, unsigned int *edx)
{
*eax = op;
*ecx = count;
__cpuid(eax, ebx, ecx, edx);
}
/*
* CPUID functions returning a single datum
*/
static inline unsigned int cpuid_eax(unsigned int op)
{
unsigned int eax, ebx, ecx, edx;
cpuid(op, &eax, &ebx, &ecx, &edx);
return eax;
}
static inline unsigned int cpuid_ebx(unsigned int op)
{
unsigned int eax, ebx, ecx, edx;
cpuid(op, &eax, &ebx, &ecx, &edx);
return ebx;
}
static inline unsigned int cpuid_ecx(unsigned int op)
{
unsigned int eax, ebx, ecx, edx;
cpuid(op, &eax, &ebx, &ecx, &edx);
return ecx;
}
static inline unsigned int cpuid_edx(unsigned int op)
{
unsigned int eax, ebx, ecx, edx;
cpuid(op, &eax, &ebx, &ecx, &edx);
return edx;
}
/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
static __always_inline void rep_nop(void)
{
asm volatile("rep; nop" ::: "memory"); /*covered*/
}
static __always_inline void cpu_relax(void)
{
rep_nop(); /*covered*/
}
/*
* This function forces the icache and prefetched instruction stream to
* catch up with reality in two very specific cases:
*
* a) Text was modified using one virtual address and is about to be executed
* from the same physical page at a different virtual address.
*
* b) Text was modified on a different CPU, may subsequently be
* executed on this CPU, and you want to make sure the new version
* gets executed. This generally means you're calling this in a IPI.
*
* If you're calling this for a different reason, you're probably doing
* it wrong.
*/
static inline void sync_core(void)
{
/*
* There are quite a few ways to do this. IRET-to-self is nice
* because it works on every CPU, at any CPL (so it's compatible
* with paravirtualization), and it never exits to a hypervisor.
* The only down sides are that it's a bit slow (it seems to be
* a bit more than 2x slower than the fastest options) and that
* it unmasks NMIs. The "push %cs" is needed because, in
* paravirtual environments, __KERNEL_CS may not be a valid CS
* value when we do IRET directly.
*
* In case NMI unmasking or performance ever becomes a problem,
* the next best option appears to be MOV-to-CR2 and an
* unconditional jump. That sequence also works on all CPUs,
* but it will fault at CPL3 (i.e. Xen PV).
*
* CPUID is the conventional way, but it's nasty: it doesn't
* exist on some 486-like CPUs, and it usually exits to a
* hypervisor.
*
* Like all of Linux's memory ordering operations, this is a
* compiler barrier as well.
*/
#ifdef CONFIG_X86_32
asm volatile (
"pushfl\n\t"
"pushl %%cs\n\t"
"pushl $1f\n\t"
"iret\n\t"
"1:"
: ASM_CALL_CONSTRAINT : : "memory");
#else
unsigned int tmp;
asm volatile (
UNWIND_HINT_SAVE
"mov %%ss, %0\n\t"
"pushq %q0\n\t"
"pushq %%rsp\n\t"
"addq $8, (%%rsp)\n\t"
"pushfq\n\t"
"mov %%cs, %0\n\t"
"pushq %q0\n\t"
"pushq $1f\n\t"
"iretq\n\t"
UNWIND_HINT_RESTORE
"1:"
: "=&r" (tmp), ASM_CALL_CONSTRAINT : : "cc", "memory");
#endif
}
extern void select_idle_routine(const struct cpuinfo_x86 *c);
extern void amd_e400_c1e_apic_setup(void);
extern unsigned long boot_option_idle_override;
enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
IDLE_POLL};
extern void enable_sep_cpu(void);
extern int sysenter_setup(void);
void early_trap_pf_init(void);
/* Defined in head.S */
extern struct desc_ptr early_gdt_descr;
extern void switch_to_new_gdt(int);
extern void load_direct_gdt(int);
extern void load_fixmap_gdt(int);
extern void load_percpu_segment(int);
extern void cpu_init(void);
static inline unsigned long get_debugctlmsr(void)
{
unsigned long debugctlmsr = 0;
#ifndef CONFIG_X86_DEBUGCTLMSR
if (boot_cpu_data.x86 < 6)
return 0;
#endif
rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
return debugctlmsr;
}
static inline void update_debugctlmsr(unsigned long debugctlmsr)
{
#ifndef CONFIG_X86_DEBUGCTLMSR
if (boot_cpu_data.x86 < 6)
return;
#endif
wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
}
extern void set_task_blockstep(struct task_struct *task, bool on);
/* Boot loader type from the setup header: */
extern int bootloader_type;
extern int bootloader_version;
extern char ignore_fpu_irq;
#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
#define ARCH_HAS_PREFETCHW
#define ARCH_HAS_SPINLOCK_PREFETCH
#ifdef CONFIG_X86_32
# define BASE_PREFETCH ""
# define ARCH_HAS_PREFETCH
#else
# define BASE_PREFETCH "prefetcht0 %P1"
#endif
/*
* Prefetch instructions for Pentium III (+) and AMD Athlon (+)
*
* It's not worth to care about 3dnow prefetches for the K6
* because they are microcoded there and very slow.
*/
static inline void prefetch(const void *x)
{
alternative_input(BASE_PREFETCH, "prefetchnta %P1",
X86_FEATURE_XMM,
"m" (*(const char *)x));
}
/*
* 3dnow prefetch to get an exclusive cache line.
* Useful for spinlocks to avoid one state transition in the
* cache coherency protocol:
*/
static inline void prefetchw(const void *x)
{
alternative_input(BASE_PREFETCH, "prefetchw %P1", /*covered*/
X86_FEATURE_3DNOWPREFETCH,
"m" (*(const char *)x));
}
static inline void spin_lock_prefetch(const void *x)
{
prefetchw(x);
}
#define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \
TOP_OF_KERNEL_STACK_PADDING)
#define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1))
#define task_pt_regs(task) \
({ \
unsigned long __ptr = (unsigned long)task_stack_page(task); \
__ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \
((struct pt_regs *)__ptr) - 1; \
})
#ifdef CONFIG_X86_32
/*
* User space process size: 3GB (default).
*/
#define IA32_PAGE_OFFSET PAGE_OFFSET
#define TASK_SIZE PAGE_OFFSET
#define TASK_SIZE_LOW TASK_SIZE
#define TASK_SIZE_MAX TASK_SIZE
#define DEFAULT_MAP_WINDOW TASK_SIZE
#define STACK_TOP TASK_SIZE
#define STACK_TOP_MAX STACK_TOP
#define INIT_THREAD { \
.sp0 = TOP_OF_INIT_STACK, \
.sysenter_cs = __KERNEL_CS, \
.io_bitmap_ptr = NULL, \
.addr_limit = KERNEL_DS, \
}
#define KSTK_ESP(task) (task_pt_regs(task)->sp)
#else
/*
* User space process size. This is the first address outside the user range.
* There are a few constraints that determine this:
*
* On Intel CPUs, if a SYSCALL instruction is at the highest canonical
* address, then that syscall will enter the kernel with a
* non-canonical return address, and SYSRET will explode dangerously.
* We avoid this particular problem by preventing anything executable
* from being mapped at the maximum canonical address.
*
* On AMD CPUs in the Ryzen family, there's a nasty bug in which the
* CPUs malfunction if they execute code from the highest canonical page.
* They'll speculate right off the end of the canonical space, and
* bad things happen. This is worked around in the same way as the
* Intel problem.
*
* With page table isolation enabled, we map the LDT in ... [stay tuned]
*/
#define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
#define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE)
/* This decides where the kernel will search for a free chunk of vm
* space during mmap's.
*/
#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
0xc0000000 : 0xFFFFe000)
#define TASK_SIZE_LOW (test_thread_flag(TIF_ADDR32) ? \
IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW)
#define TASK_SIZE (test_thread_flag(TIF_ADDR32) ? \
IA32_PAGE_OFFSET : TASK_SIZE_MAX)
#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_ADDR32)) ? \
IA32_PAGE_OFFSET : TASK_SIZE_MAX)
#define STACK_TOP TASK_SIZE_LOW
#define STACK_TOP_MAX TASK_SIZE_MAX
#define INIT_THREAD { \
.addr_limit = KERNEL_DS, \
}
extern unsigned long KSTK_ESP(struct task_struct *task);
#endif /* CONFIG_X86_64 */
extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
unsigned long new_sp);
/*
* This decides where the kernel will search for a free chunk of vm
* space during mmap's.
*/
#define __TASK_UNMAPPED_BASE(task_size) (PAGE_ALIGN(task_size / 3))
#define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE_LOW)
#define KSTK_EIP(task) (task_pt_regs(task)->ip)
/* Get/set a process' ability to use the timestamp counter instruction */
#define GET_TSC_CTL(adr) get_tsc_mode((adr))
#define SET_TSC_CTL(val) set_tsc_mode((val))
extern int get_tsc_mode(unsigned long adr);
extern int set_tsc_mode(unsigned int val);
DECLARE_PER_CPU(u64, msr_misc_features_shadow);
/* Register/unregister a process' MPX related resource */
#define MPX_ENABLE_MANAGEMENT() mpx_enable_management()
#define MPX_DISABLE_MANAGEMENT() mpx_disable_management()
#ifdef CONFIG_X86_INTEL_MPX
extern int mpx_enable_management(void);
extern int mpx_disable_management(void);
#else
static inline int mpx_enable_management(void)
{
return -EINVAL;
}
static inline int mpx_disable_management(void)
{
return -EINVAL;
}
#endif /* CONFIG_X86_INTEL_MPX */
#ifdef CONFIG_CPU_SUP_AMD
extern u16 amd_get_nb_id(int cpu);
extern u32 amd_get_nodes_per_socket(void);
#else
static inline u16 amd_get_nb_id(int cpu) { return 0; }
static inline u32 amd_get_nodes_per_socket(void) { return 0; }
#endif
static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
{
uint32_t base, eax, signature[3];
for (base = 0x40000000; base < 0x40010000; base += 0x100) {
cpuid(base, &eax, &signature[0], &signature[1], &signature[2]);
if (!memcmp(sig, signature, 12) &&
(leaves == 0 || ((eax - base) >= leaves)))
return base;
}
return 0;
}
extern unsigned long arch_align_stack(unsigned long sp);
void free_init_pages(const char *what, unsigned long begin, unsigned long end);
extern void free_kernel_image_pages(void *begin, void *end);
void default_idle(void);
#ifdef CONFIG_XEN
bool xen_set_default_idle(void);
#else
#define xen_set_default_idle 0
#endif
void stop_this_cpu(void *dummy);
void df_debug(struct pt_regs *regs, long error_code);
void microcode_check(void);
enum l1tf_mitigations {
L1TF_MITIGATION_OFF,
L1TF_MITIGATION_FLUSH_NOWARN,
L1TF_MITIGATION_FLUSH,
L1TF_MITIGATION_FLUSH_NOSMT,
L1TF_MITIGATION_FULL,
L1TF_MITIGATION_FULL_FORCE
};
extern enum l1tf_mitigations l1tf_mitigation;
#endif /* _ASM_X86_PROCESSOR_H */
/*
* Supervisor Mode Access Prevention support
*
* Copyright (C) 2012 Intel Corporation
* Author: H. Peter Anvin <hpa@linux.intel.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2
* of the License.
*/
#ifndef _ASM_X86_SMAP_H
#define _ASM_X86_SMAP_H
#include <linux/stringify.h>
#include <asm/nops.h>
#include <asm/cpufeatures.h>
/* "Raw" instruction opcodes */
#define __ASM_CLAC .byte 0x0f,0x01,0xca
#define __ASM_STAC .byte 0x0f,0x01,0xcb
#ifdef __ASSEMBLY__
#include <asm/alternative-asm.h>
#ifdef CONFIG_X86_SMAP
#define ASM_CLAC \
ALTERNATIVE "", __stringify(__ASM_CLAC), X86_FEATURE_SMAP
#define ASM_STAC \
ALTERNATIVE "", __stringify(__ASM_STAC), X86_FEATURE_SMAP
#else /* CONFIG_X86_SMAP */
#define ASM_CLAC
#define ASM_STAC
#endif /* CONFIG_X86_SMAP */
#else /* __ASSEMBLY__ */
#include <asm/alternative.h>
#ifdef CONFIG_X86_SMAP
static __always_inline void clac(void)
{
/* Note: a barrier is implicit in alternative() */
alternative("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP); /*covered*/
}
static __always_inline void stac(void)
{
/* Note: a barrier is implicit in alternative() */
alternative("", __stringify(__ASM_STAC), X86_FEATURE_SMAP); /*covered*/
}
/* These macros can be used in asm() statements */
#define ASM_CLAC \
ALTERNATIVE("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP)
#define ASM_STAC \
ALTERNATIVE("", __stringify(__ASM_STAC), X86_FEATURE_SMAP)
#else /* CONFIG_X86_SMAP */
static inline void clac(void) { }
static inline void stac(void) { }
#define ASM_CLAC
#define ASM_STAC
#endif /* CONFIG_X86_SMAP */
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_SMAP_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_SMP_H
#define _ASM_X86_SMP_H
#ifndef __ASSEMBLY__
#include <linux/cpumask.h>
#include <asm/percpu.h>
/*
* We need the APIC definitions automatically as part of 'smp.h'
*/
#ifdef CONFIG_X86_LOCAL_APIC
# include <asm/mpspec.h>
# include <asm/apic.h>
# ifdef CONFIG_X86_IO_APIC
# include <asm/io_apic.h>
# endif
#endif
#include <asm/thread_info.h>
#include <asm/cpumask.h>
extern int smp_num_siblings;
extern unsigned int num_processors;
DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
/* cpus sharing the last level cache: */
DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
static inline struct cpumask *cpu_llc_shared_mask(int cpu)
{
return per_cpu(cpu_llc_shared_map, cpu);
}
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid);
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid);
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
DECLARE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid);
#endif
struct task_struct;
struct smp_ops {
void (*smp_prepare_boot_cpu)(void);
void (*smp_prepare_cpus)(unsigned max_cpus);
void (*smp_cpus_done)(unsigned max_cpus);
void (*stop_other_cpus)(int wait);
void (*crash_stop_other_cpus)(void);
void (*smp_send_reschedule)(int cpu);
int (*cpu_up)(unsigned cpu, struct task_struct *tidle);
int (*cpu_disable)(void);
void (*cpu_die)(unsigned int cpu);
void (*play_dead)(void);
void (*send_call_func_ipi)(const struct cpumask *mask);
void (*send_call_func_single_ipi)(int cpu);
};
/* Globals due to paravirt */
extern void set_cpu_sibling_map(int cpu);
#ifdef CONFIG_SMP
extern struct smp_ops smp_ops;
static inline void smp_send_stop(void)
{
smp_ops.stop_other_cpus(0);
}
static inline void stop_other_cpus(void)
{
smp_ops.stop_other_cpus(1);
}
static inline void smp_prepare_boot_cpu(void)
{
smp_ops.smp_prepare_boot_cpu();
}
static inline void smp_prepare_cpus(unsigned int max_cpus)
{
smp_ops.smp_prepare_cpus(max_cpus);
}
static inline void smp_cpus_done(unsigned int max_cpus)
{
smp_ops.smp_cpus_done(max_cpus);
}
static inline int __cpu_up(unsigned int cpu, struct task_struct *tidle)
{
return smp_ops.cpu_up(cpu, tidle);
}
static inline int __cpu_disable(void)
{
return smp_ops.cpu_disable();
}
static inline void __cpu_die(unsigned int cpu)
{
smp_ops.cpu_die(cpu);
}
static inline void play_dead(void)
{
smp_ops.play_dead();
}
static inline void smp_send_reschedule(int cpu)
{
smp_ops.smp_send_reschedule(cpu);
}
static inline void arch_send_call_function_single_ipi(int cpu)
{
smp_ops.send_call_func_single_ipi(cpu); /*covered*/
}
static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
{
smp_ops.send_call_func_ipi(mask);
}
void cpu_disable_common(void);
void native_smp_prepare_boot_cpu(void);
void native_smp_prepare_cpus(unsigned int max_cpus);
void calculate_max_logical_packages(void);
void native_smp_cpus_done(unsigned int max_cpus);
void common_cpu_up(unsigned int cpunum, struct task_struct *tidle);
int native_cpu_up(unsigned int cpunum, struct task_struct *tidle);
int native_cpu_disable(void);
int common_cpu_die(unsigned int cpu);
void native_cpu_die(unsigned int cpu);
void hlt_play_dead(void);
void native_play_dead(void);
void play_dead_common(void);
void wbinvd_on_cpu(int cpu);
int wbinvd_on_all_cpus(void);
void native_send_call_func_ipi(const struct cpumask *mask);
void native_send_call_func_single_ipi(int cpu);
void x86_idle_thread_init(unsigned int cpu, struct task_struct *idle);
void smp_store_boot_cpu_info(void);
void smp_store_cpu_info(int id);
asmlinkage __visible void smp_reboot_interrupt(void);
__visible void smp_reschedule_interrupt(struct pt_regs *regs);
__visible void smp_call_function_interrupt(struct pt_regs *regs);
__visible void smp_call_function_single_interrupt(struct pt_regs *r);
#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
#define cpu_acpi_id(cpu) per_cpu(x86_cpu_to_acpiid, cpu)
/*
* This function is needed by all SMP systems. It must _always_ be valid
* from the initial startup. We map APIC_BASE very early in page_setup(),
* so this is correct in the x86 case.
*/
#define raw_smp_processor_id() (this_cpu_read(cpu_number))
#ifdef CONFIG_X86_32
extern int safe_smp_processor_id(void);
#else
# define safe_smp_processor_id() smp_processor_id()
#endif
#else /* !CONFIG_SMP */
#define wbinvd_on_cpu(cpu) wbinvd()
static inline int wbinvd_on_all_cpus(void)
{
wbinvd();
return 0;
}
#endif /* CONFIG_SMP */
extern unsigned disabled_cpus;
#ifdef CONFIG_X86_LOCAL_APIC
extern int hard_smp_processor_id(void);
#else /* CONFIG_X86_LOCAL_APIC */
#define hard_smp_processor_id() 0
#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_DEBUG_NMI_SELFTEST
extern void nmi_selftest(void);
#else
#define nmi_selftest() do { } while (0)
#endif
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_SMP_H */
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
*/
#ifndef _ASM_X86_STACKTRACE_H
#define _ASM_X86_STACKTRACE_H
#include <linux/uaccess.h>
#include <linux/ptrace.h>
#include <asm/switch_to.h>
enum stack_type {
STACK_TYPE_UNKNOWN,
STACK_TYPE_TASK,
STACK_TYPE_IRQ,
STACK_TYPE_SOFTIRQ,
STACK_TYPE_ENTRY,
STACK_TYPE_EXCEPTION,
STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
};
struct stack_info {
enum stack_type type;
unsigned long *begin, *end, *next_sp;
};
bool in_task_stack(unsigned long *stack, struct task_struct *task,
struct stack_info *info);
bool in_entry_stack(unsigned long *stack, struct stack_info *info);
int get_stack_info(unsigned long *stack, struct task_struct *task,
struct stack_info *info, unsigned long *visit_mask);
const char *stack_type_name(enum stack_type type);
static inline bool on_stack(struct stack_info *info, void *addr, size_t len)
{
void *begin = info->begin;
void *end = info->end;
return (info->type != STACK_TYPE_UNKNOWN &&
addr >= begin && addr < end &&
addr + len > begin && addr + len <= end);
}
#ifdef CONFIG_X86_32
#define STACKSLOTS_PER_LINE 8
#else
#define STACKSLOTS_PER_LINE 4
#endif
#ifdef CONFIG_FRAME_POINTER
static inline unsigned long *
get_frame_pointer(struct task_struct *task, struct pt_regs *regs)
{
if (regs)
return (unsigned long *)regs->bp;
if (task == current)
return __builtin_frame_address(0);
return &((struct inactive_task_frame *)task->thread.sp)->bp;
}
#else
static inline unsigned long *
get_frame_pointer(struct task_struct *task, struct pt_regs *regs)
{
return NULL;
}
#endif /* CONFIG_FRAME_POINTER */
static inline unsigned long *
get_stack_pointer(struct task_struct *task, struct pt_regs *regs)
{
if (regs)
return (unsigned long *)kernel_stack_pointer(regs); /*covered*/
if (task == current)
return __builtin_frame_address(0);
return (unsigned long *)task->thread.sp;
}
void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, char *log_lvl);
/* The form of the top of the frame on the stack */
struct stack_frame {
struct stack_frame *next_frame;
unsigned long return_address;
};
struct stack_frame_ia32 {
u32 next_frame;
u32 return_address;
};
static inline unsigned long caller_frame_pointer(void)
{
struct stack_frame *frame;
frame = __builtin_frame_address(0);
#ifdef CONFIG_FRAME_POINTER
frame = frame->next_frame;
#endif
return (unsigned long)frame;
}
void show_opcodes(struct pt_regs *regs, const char *loglvl);
void show_ip(struct pt_regs *regs, const char *loglvl);
#endif /* _ASM_X86_STACKTRACE_H */
/*
* Access to user system call parameters and results
*
* Copyright (C) 2008-2009 Red Hat, Inc. All rights reserved.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
* of the GNU General Public License v.2.
*
* See asm-generic/syscall.h for descriptions of what we must do here.
*/
#ifndef _ASM_X86_SYSCALL_H
#define _ASM_X86_SYSCALL_H
#include <uapi/linux/audit.h>
#include <linux/sched.h>
#include <linux/err.h>
#include <asm/asm-offsets.h> /* For NR_syscalls */
#include <asm/thread_info.h> /* for TS_COMPAT */
#include <asm/unistd.h>
#ifdef CONFIG_X86_64
typedef asmlinkage long (*sys_call_ptr_t)(const struct pt_regs *);
#else
typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long,
unsigned long, unsigned long,
unsigned long, unsigned long);
#endif /* CONFIG_X86_64 */
extern const sys_call_ptr_t sys_call_table[];
#if defined(CONFIG_X86_32)
#define ia32_sys_call_table sys_call_table
#define __NR_syscall_compat_max __NR_syscall_max
#define IA32_NR_syscalls NR_syscalls
#endif
#if defined(CONFIG_IA32_EMULATION)
extern const sys_call_ptr_t ia32_sys_call_table[];
#endif
/*
* Only the low 32 bits of orig_ax are meaningful, so we return int.
* This importantly ignores the high bits on 64-bit, so comparisons
* sign-extend the low 32 bits.
*/
static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs)
{
return regs->orig_ax;
}
static inline void syscall_rollback(struct task_struct *task,
struct pt_regs *regs)
{
regs->ax = regs->orig_ax;
}
static inline long syscall_get_error(struct task_struct *task,
struct pt_regs *regs)
{
unsigned long error = regs->ax;
#ifdef CONFIG_IA32_EMULATION
/*
* TS_COMPAT is set for 32-bit syscall entries and then
* remains set until we return to user mode.
*/
if (task->thread_info.status & (TS_COMPAT|TS_I386_REGS_POKED))
/*
* Sign-extend the value so (int)-EFOO becomes (long)-EFOO
* and will match correctly in comparisons.
*/
error = (long) (int) error;
#endif
return IS_ERR_VALUE(error) ? error : 0;
}
static inline long syscall_get_return_value(struct task_struct *task,
struct pt_regs *regs)
{
return regs->ax;
}
static inline void syscall_set_return_value(struct task_struct *task,
struct pt_regs *regs,
int error, long val)
{
regs->ax = (long) error ?: val;
}
#ifdef CONFIG_X86_32
static inline void syscall_get_arguments(struct task_struct *task,
struct pt_regs *regs,
unsigned int i, unsigned int n,
unsigned long *args)
{
BUG_ON(i + n > 6);
memcpy(args, ®s->bx + i, n * sizeof(args[0]));
}
static inline void syscall_set_arguments(struct task_struct *task,
struct pt_regs *regs,
unsigned int i, unsigned int n,
const unsigned long *args)
{
BUG_ON(i + n > 6);
memcpy(®s->bx + i, args, n * sizeof(args[0]));
}
static inline int syscall_get_arch(void)
{
return AUDIT_ARCH_I386;
}
#else /* CONFIG_X86_64 */
static inline void syscall_get_arguments(struct task_struct *task,
struct pt_regs *regs,
unsigned int i, unsigned int n,
unsigned long *args)
{
# ifdef CONFIG_IA32_EMULATION
if (task->thread_info.status & TS_COMPAT)
switch (i) {
case 0:
if (!n--) break;
*args++ = regs->bx;
case 1:
if (!n--) break;
*args++ = regs->cx;
case 2:
if (!n--) break;
*args++ = regs->dx;
case 3:
if (!n--) break;
*args++ = regs->si;
case 4:
if (!n--) break;
*args++ = regs->di;
case 5:
if (!n--) break;
*args++ = regs->bp;
case 6:
if (!n--) break;
default:
BUG();
break;
}
else
# endif
switch (i) {
case 0:
if (!n--) break;
*args++ = regs->di; /*covered*/
case 1:
if (!n--) break;
*args++ = regs->si;
case 2:
if (!n--) break;
*args++ = regs->dx;
case 3:
if (!n--) break;
*args++ = regs->r10;
case 4:
if (!n--) break;
*args++ = regs->r8;
case 5:
if (!n--) break;
*args++ = regs->r9;
case 6:
if (!n--) break;
default:
BUG();
break;
}
}
static inline void syscall_set_arguments(struct task_struct *task,
struct pt_regs *regs,
unsigned int i, unsigned int n,
const unsigned long *args)
{
# ifdef CONFIG_IA32_EMULATION
if (task->thread_info.status & TS_COMPAT)
switch (i) {
case 0:
if (!n--) break;
regs->bx = *args++;
case 1:
if (!n--) break;
regs->cx = *args++;
case 2:
if (!n--) break;
regs->dx = *args++;
case 3:
if (!n--) break;
regs->si = *args++;
case 4:
if (!n--) break;
regs->di = *args++;
case 5:
if (!n--) break;
regs->bp = *args++;
case 6:
if (!n--) break;
default:
BUG();
break;
}
else
# endif
switch (i) {
case 0:
if (!n--) break;
regs->di = *args++;
case 1:
if (!n--) break;
regs->si = *args++;
case 2:
if (!n--) break;
regs->dx = *args++;
case 3:
if (!n--) break;
regs->r10 = *args++;
case 4:
if (!n--) break;
regs->r8 = *args++;
case 5:
if (!n--) break;
regs->r9 = *args++;
case 6:
if (!n--) break;
default:
BUG();
break;
}
}
static inline int syscall_get_arch(void)
{
/* x32 tasks should be considered AUDIT_ARCH_X86_64. */
return in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
}
#endif /* CONFIG_X86_32 */
#endif /* _ASM_X86_SYSCALL_H */
/* SPDX-License-Identifier: GPL-2.0 */
/* thread_info.h: low-level thread information
*
* Copyright (C) 2002 David Howells (dhowells@redhat.com)
* - Incorporating suggestions made by Linus Torvalds and Dave Miller
*/
#ifndef _ASM_X86_THREAD_INFO_H
#define _ASM_X86_THREAD_INFO_H
#include <linux/compiler.h>
#include <asm/page.h>
#include <asm/percpu.h>
#include <asm/types.h>
/*
* TOP_OF_KERNEL_STACK_PADDING is a number of unused bytes that we
* reserve at the top of the kernel stack. We do it because of a nasty
* 32-bit corner case. On x86_32, the hardware stack frame is
* variable-length. Except for vm86 mode, struct pt_regs assumes a
* maximum-length frame. If we enter from CPL 0, the top 8 bytes of
* pt_regs don't actually exist. Ordinarily this doesn't matter, but it
* does in at least one case:
*
* If we take an NMI early enough in SYSENTER, then we can end up with
* pt_regs that extends above sp0. On the way out, in the espfix code,
* we can read the saved SS value, but that value will be above sp0.
* Without this offset, that can result in a page fault. (We are
* careful that, in this case, the value we read doesn't matter.)
*
* In vm86 mode, the hardware frame is much longer still, so add 16
* bytes to make room for the real-mode segments.
*
* x86_64 has a fixed-length stack frame.
*/
#ifdef CONFIG_X86_32
# ifdef CONFIG_VM86
# define TOP_OF_KERNEL_STACK_PADDING 16
# else
# define TOP_OF_KERNEL_STACK_PADDING 8
# endif
#else
# define TOP_OF_KERNEL_STACK_PADDING 0
#endif
/*
* low level task data that entry.S needs immediate access to
* - this struct should fit entirely inside of one cache line
* - this struct shares the supervisor stack pages
*/
#ifndef __ASSEMBLY__
struct task_struct;
#include <asm/cpufeature.h>
#include <linux/atomic.h>
struct thread_info {
unsigned long flags; /* low level flags */
u32 status; /* thread synchronous flags */
};
#define INIT_THREAD_INFO(tsk) \
{ \
.flags = 0, \
}
#else /* !__ASSEMBLY__ */
#include <asm/asm-offsets.h>
#endif
/*
* thread information flags
* - these are process state flags that various assembly files
* may need to access
*/
#define TIF_SYSCALL_TRACE 0 /* syscall trace active */
#define TIF_NOTIFY_RESUME 1 /* callback before returning to user */
#define TIF_SIGPENDING 2 /* signal pending */
#define TIF_NEED_RESCHED 3 /* rescheduling necessary */
#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/
#define TIF_SSBD 5 /* Speculative store bypass disable */
#define TIF_SYSCALL_EMU 6 /* syscall emulation active */
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SECCOMP 8 /* secure computing */
#define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */
#define TIF_SPEC_FORCE_UPDATE 10 /* Force speculation MSR update in context switch */
#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
#define TIF_UPROBE 12 /* breakpointed or singlestepping */
#define TIF_PATCH_PENDING 13 /* pending live patching update */
#define TIF_NOCPUID 15 /* CPUID is not accessible in userland */
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
#define TIF_IA32 17 /* IA32 compatibility process */
#define TIF_NOHZ 19 /* in adaptive nohz mode */
#define TIF_MEMDIE 20 /* is terminating due to OOM killer */
#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */
#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */
#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */
#define TIF_ADDR32 29 /* 32-bit address space on 64 bits */
#define TIF_X32 30 /* 32-bit native x86-64 binary */
#define TIF_FSCHECK 31 /* Check FS is USER_DS on return */
#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
#define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP)
#define _TIF_SSBD (1 << TIF_SSBD)
#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
#define _TIF_SPEC_IB (1 << TIF_SPEC_IB)
#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE)
#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
#define _TIF_UPROBE (1 << TIF_UPROBE)
#define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING)
#define _TIF_NOCPUID (1 << TIF_NOCPUID)
#define _TIF_NOTSC (1 << TIF_NOTSC)
#define _TIF_IA32 (1 << TIF_IA32)
#define _TIF_NOHZ (1 << TIF_NOHZ)
#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG)
#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
#define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP)
#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
#define _TIF_ADDR32 (1 << TIF_ADDR32)
#define _TIF_X32 (1 << TIF_X32)
#define _TIF_FSCHECK (1 << TIF_FSCHECK)
/*
* work to do in syscall_trace_enter(). Also includes TIF_NOHZ for
* enter_from_user_mode()
*/
#define _TIF_WORK_SYSCALL_ENTRY \
(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \
_TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
_TIF_NOHZ)
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW_BASE \
(_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP| \
_TIF_SSBD | _TIF_SPEC_FORCE_UPDATE)
/*
* Avoid calls to __switch_to_xtra() on UP as STIBP is not evaluated.
*/
#ifdef CONFIG_SMP
# define _TIF_WORK_CTXSW (_TIF_WORK_CTXSW_BASE | _TIF_SPEC_IB)
#else
# define _TIF_WORK_CTXSW (_TIF_WORK_CTXSW_BASE)
#endif
#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
#define STACK_WARN (THREAD_SIZE/8)
/*
* macros/functions for gaining access to the thread information structure
*
* preempt_count needs to be 1 initially, until the scheduler is functional.
*/
#ifndef __ASSEMBLY__
/*
* Walks up the stack frames to make sure that the specified object is
* entirely contained by a single stack frame.
*
* Returns:
* GOOD_FRAME if within a frame
* BAD_STACK if placed across a frame boundary (or outside stack)
* NOT_STACK unable to determine (no frame pointers, etc)
*/
static inline int arch_within_stack_frames(const void * const stack,
const void * const stackend,
const void *obj, unsigned long len)
{
#if defined(CONFIG_FRAME_POINTER)
const void *frame = NULL;
const void *oldframe;
oldframe = __builtin_frame_address(1); /*covered*/
if (oldframe)
frame = __builtin_frame_address(2); /*covered*/
/*
* low ----------------------------------------------> high
* [saved bp][saved ip][args][local vars][saved bp][saved ip]
* ^----------------^
* allow copies only within here
*/
while (stack <= frame && frame < stackend) { /*covered*/
/*
* If obj + len extends past the last frame, this
* check won't pass and the next frame will be 0,
* causing us to bail out and correctly report
* the copy as invalid.
*/
if (obj + len <= frame) /*covered*/
return obj >= oldframe + 2 * sizeof(void *) ? /*covered*/
GOOD_FRAME : BAD_STACK;
oldframe = frame;
frame = *(const void * const *)frame; /*covered*/
}
return BAD_STACK;
#else
return NOT_STACK;
#endif
}
#else /* !__ASSEMBLY__ */
#ifdef CONFIG_X86_64
# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
#endif
#endif
#ifdef CONFIG_COMPAT
#define TS_I386_REGS_POKED 0x0004 /* regs poked by 32-bit ptracer */
#endif
#ifndef __ASSEMBLY__
#ifdef CONFIG_X86_32
#define in_ia32_syscall() true
#else
#define in_ia32_syscall() (IS_ENABLED(CONFIG_IA32_EMULATION) && \
current_thread_info()->status & TS_COMPAT)
#endif
/*
* Force syscall return via IRET by making it look as if there was
* some work pending. IRET is our most capable (but slowest) syscall
* return path, which is able to restore modified SS, CS and certain
* EFLAGS values that other (fast) syscall return instructions
* are not able to restore properly.
*/
#define force_iret() set_thread_flag(TIF_NOTIFY_RESUME)
extern void arch_task_cache_init(void);
extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
extern void arch_release_task_struct(struct task_struct *tsk);
extern void arch_setup_new_exec(void);
#define arch_setup_new_exec arch_setup_new_exec
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_THREAD_INFO_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_TLBFLUSH_H
#define _ASM_X86_TLBFLUSH_H
#include <linux/mm.h>
#include <linux/sched.h>
#include <asm/processor.h>
#include <asm/cpufeature.h>
#include <asm/special_insns.h>
#include <asm/smp.h>
#include <asm/invpcid.h>
#include <asm/pti.h>
#include <asm/processor-flags.h>
/*
* The x86 feature is called PCID (Process Context IDentifier). It is similar
* to what is traditionally called ASID on the RISC processors.
*
* We don't use the traditional ASID implementation, where each process/mm gets
* its own ASID and flush/restart when we run out of ASID space.
*
* Instead we have a small per-cpu array of ASIDs and cache the last few mm's
* that came by on this CPU, allowing cheaper switch_mm between processes on
* this CPU.
*
* We end up with different spaces for different things. To avoid confusion we
* use different names for each of them:
*
* ASID - [0, TLB_NR_DYN_ASIDS-1]
* the canonical identifier for an mm
*
* kPCID - [1, TLB_NR_DYN_ASIDS]
* the value we write into the PCID part of CR3; corresponds to the
* ASID+1, because PCID 0 is special.
*
* uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
* for KPTI each mm has two address spaces and thus needs two
* PCID values, but we can still do with a single ASID denomination
* for each mm. Corresponds to kPCID + 2048.
*
*/
/* There are 12 bits of space for ASIDS in CR3 */
#define CR3_HW_ASID_BITS 12
/*
* When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
* user/kernel switches
*/
#ifdef CONFIG_PAGE_TABLE_ISOLATION
# define PTI_CONSUMED_PCID_BITS 1
#else
# define PTI_CONSUMED_PCID_BITS 0
#endif
#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
/*
* ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
* for them being zero-based. Another -1 is because PCID 0 is reserved for
* use by non-PCID-aware users.
*/
#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
/*
* 6 because 6 should be plenty and struct tlb_state will fit in two cache
* lines.
*/
#define TLB_NR_DYN_ASIDS 6
/*
* Given @asid, compute kPCID
*/
static inline u16 kern_pcid(u16 asid)
{
VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
* Make sure that the dynamic ASID space does not confict with the
* bit we are using to switch between user and kernel ASIDs.
*/
BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT));
/*
* The ASID being passed in here should have respected the
* MAX_ASID_AVAILABLE and thus never have the switch bit set.
*/
VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT));
#endif
/*
* The dynamically-assigned ASIDs that get passed in are small
* (<TLB_NR_DYN_ASIDS). They never have the high switch bit set,
* so do not bother to clear it.
*
* If PCID is on, ASID-aware code paths put the ASID+1 into the
* PCID bits. This serves two purposes. It prevents a nasty
* situation in which PCID-unaware code saves CR3, loads some other
* value (with PCID == 0), and then restores CR3, thus corrupting
* the TLB for ASID 0 if the saved ASID was nonzero. It also means
* that any bugs involving loading a PCID-enabled CR3 with
* CR4.PCIDE off will trigger deterministically.
*/
return asid + 1;
}
/*
* Given @asid, compute uPCID
*/
static inline u16 user_pcid(u16 asid)
{
u16 ret = kern_pcid(asid);
#ifdef CONFIG_PAGE_TABLE_ISOLATION
ret |= 1 << X86_CR3_PTI_PCID_USER_BIT;
#endif
return ret;
}
struct pgd_t;
static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
{
if (static_cpu_has(X86_FEATURE_PCID)) {
return __sme_pa(pgd) | kern_pcid(asid);
} else {
VM_WARN_ON_ONCE(asid != 0);
return __sme_pa(pgd);
}
}
static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
{
VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
/*
* Use boot_cpu_has() instead of this_cpu_has() as this function
* might be called during early boot. This should work even after
* boot because all CPU's the have same capabilities:
*/
VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));
return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
}
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
#define __flush_tlb() __native_flush_tlb()
#define __flush_tlb_global() __native_flush_tlb_global()
#define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
#endif
struct tlb_context {
u64 ctx_id;
u64 tlb_gen;
};
struct tlb_state {
/*
* cpu_tlbstate.loaded_mm should match CR3 whenever interrupts
* are on. This means that it may not match current->active_mm,
* which will contain the previous user mm when we're in lazy TLB
* mode even if we've already switched back to swapper_pg_dir.
*
* During switch_mm_irqs_off(), loaded_mm will be set to
* LOADED_MM_SWITCHING during the brief interrupts-off window
* when CR3 and loaded_mm would otherwise be inconsistent. This
* is for nmi_uaccess_okay()'s benefit.
*/
struct mm_struct *loaded_mm;
#define LOADED_MM_SWITCHING ((struct mm_struct *)1)
/* Last user mm for optimizing IBPB */
union {
struct mm_struct *last_user_mm;
unsigned long last_user_mm_ibpb;
};
u16 loaded_mm_asid;
u16 next_asid;
/*
* We can be in one of several states:
*
* - Actively using an mm. Our CPU's bit will be set in
* mm_cpumask(loaded_mm) and is_lazy == false;
*
* - Not using a real mm. loaded_mm == &init_mm. Our CPU's bit
* will not be set in mm_cpumask(&init_mm) and is_lazy == false.
*
* - Lazily using a real mm. loaded_mm != &init_mm, our bit
* is set in mm_cpumask(loaded_mm), but is_lazy == true.
* We're heuristically guessing that the CR3 load we
* skipped more than makes up for the overhead added by
* lazy mode.
*/
bool is_lazy;
/*
* If set we changed the page tables in such a way that we
* needed an invalidation of all contexts (aka. PCIDs / ASIDs).
* This tells us to go invalidate all the non-loaded ctxs[]
* on the next context switch.
*
* The current ctx was kept up-to-date as it ran and does not
* need to be invalidated.
*/
bool invalidate_other;
/*
* Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
* the corresponding user PCID needs a flush next time we
* switch to it; see SWITCH_TO_USER_CR3.
*/
unsigned short user_pcid_flush_mask;
/*
* Access to this CR4 shadow and to H/W CR4 is protected by
* disabling interrupts when modifying either one.
*/
unsigned long cr4;
/*
* This is a list of all contexts that might exist in the TLB.
* There is one per ASID that we use, and the ASID (what the
* CPU calls PCID) is the index into ctxts.
*
* For each context, ctx_id indicates which mm the TLB's user
* entries came from. As an invariant, the TLB will never
* contain entries that are out-of-date as when that mm reached
* the tlb_gen in the list.
*
* To be clear, this means that it's legal for the TLB code to
* flush the TLB without updating tlb_gen. This can happen
* (for now, at least) due to paravirt remote flushes.
*
* NB: context 0 is a bit special, since it's also used by
* various bits of init code. This is fine -- code that
* isn't aware of PCID will end up harmlessly flushing
* context 0.
*/
struct tlb_context ctxs[TLB_NR_DYN_ASIDS];
};
DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
/*
* Blindly accessing user memory from NMI context can be dangerous
* if we're in the middle of switching the current user task or
* switching the loaded mm. It can also be dangerous if we
* interrupted some kernel code that was temporarily using a
* different mm.
*/
static inline bool nmi_uaccess_okay(void)
{
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); /*covered*/
struct mm_struct *current_mm = current->mm;
VM_WARN_ON_ONCE(!loaded_mm); /*covered*/
/*
* The condition we want to check is
* current_mm->pgd == __va(read_cr3_pa()). This may be slow, though,
* if we're running in a VM with shadow paging, and nmi_uaccess_okay()
* is supposed to be reasonably fast.
*
* Instead, we check the almost equivalent but somewhat conservative
* condition below, and we rely on the fact that switch_mm_irqs_off()
* sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3.
*/
if (loaded_mm != current_mm) /*covered*/
return false;
VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa())); /*covered*/
return true;
}
/* Initialize cr4 shadow for this CPU. */
static inline void cr4_init_shadow(void)
{
this_cpu_write(cpu_tlbstate.cr4, __read_cr4());
}
static inline void __cr4_set(unsigned long cr4)
{
lockdep_assert_irqs_disabled();
this_cpu_write(cpu_tlbstate.cr4, cr4);
__write_cr4(cr4);
}
/* Set in this cpu's CR4. */
static inline void cr4_set_bits(unsigned long mask)
{
unsigned long cr4, flags;
local_irq_save(flags);
cr4 = this_cpu_read(cpu_tlbstate.cr4);
if ((cr4 | mask) != cr4)
__cr4_set(cr4 | mask);
local_irq_restore(flags);
}
/* Clear in this cpu's CR4. */
static inline void cr4_clear_bits(unsigned long mask)
{
unsigned long cr4, flags;
local_irq_save(flags);
cr4 = this_cpu_read(cpu_tlbstate.cr4);
if ((cr4 & ~mask) != cr4)
__cr4_set(cr4 & ~mask);
local_irq_restore(flags);
}
static inline void cr4_toggle_bits_irqsoff(unsigned long mask)
{
unsigned long cr4;
cr4 = this_cpu_read(cpu_tlbstate.cr4);
__cr4_set(cr4 ^ mask);
}
/* Read the CR4 shadow. */
static inline unsigned long cr4_read_shadow(void)
{
return this_cpu_read(cpu_tlbstate.cr4);
}
/*
* Mark all other ASIDs as invalid, preserves the current.
*/
static inline void invalidate_other_asid(void)
{
this_cpu_write(cpu_tlbstate.invalidate_other, true);
}
/*
* Save some of cr4 feature set we're using (e.g. Pentium 4MB
* enable and PPro Global page enable), so that any CPU's that boot
* up after us can get the correct flags. This should only be used
* during boot on the boot cpu.
*/
extern unsigned long mmu_cr4_features;
extern u32 *trampoline_cr4_features;
static inline void cr4_set_bits_and_update_boot(unsigned long mask)
{
mmu_cr4_features |= mask;
if (trampoline_cr4_features)
*trampoline_cr4_features = mmu_cr4_features;
cr4_set_bits(mask);
}
extern void initialize_tlbstate_and_flush(void);
/*
* Given an ASID, flush the corresponding user ASID. We can delay this
* until the next time we switch to it.
*
* See SWITCH_TO_USER_CR3.
*/
static inline void invalidate_user_asid(u16 asid)
{
/* There is no user ASID if address space separation is off */
if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
return;
/*
* We only have a single ASID if PCID is off and the CR3
* write will have flushed it.
*/
if (!cpu_feature_enabled(X86_FEATURE_PCID))
return;
if (!static_cpu_has(X86_FEATURE_PTI))
return;
__set_bit(kern_pcid(asid),
(unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
}
/*
* flush the entire current user mapping
*/
static inline void __native_flush_tlb(void)
{
/*
* Preemption or interrupts must be disabled to protect the access
* to the per CPU variable and to prevent being preempted between
* read_cr3() and write_cr3().
*/
WARN_ON_ONCE(preemptible());
invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
/* If current->mm == NULL then the read_cr3() "borrows" an mm */
native_write_cr3(__native_read_cr3());
}
/*
* flush everything
*/
static inline void __native_flush_tlb_global(void)
{
unsigned long cr4, flags;
if (static_cpu_has(X86_FEATURE_INVPCID)) {
/*
* Using INVPCID is considerably faster than a pair of writes
* to CR4 sandwiched inside an IRQ flag save/restore.
*
* Note, this works with CR4.PCIDE=0 or 1.
*/
invpcid_flush_all();
return;
}
/*
* Read-modify-write to CR4 - protect it from preemption and
* from interrupts. (Use the raw variant because this code can
* be called from deep inside debugging code.)
*/
raw_local_irq_save(flags);
cr4 = this_cpu_read(cpu_tlbstate.cr4);
/* toggle PGE */
native_write_cr4(cr4 ^ X86_CR4_PGE);
/* write old PGE again and flush TLBs */
native_write_cr4(cr4);
raw_local_irq_restore(flags);
}
/*
* flush one page in the user mapping
*/
static inline void __native_flush_tlb_one_user(unsigned long addr)
{
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
if (!static_cpu_has(X86_FEATURE_PTI))
return;
/*
* Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
* Just use invalidate_user_asid() in case we are called early.
*/
if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
invalidate_user_asid(loaded_mm_asid);
else
invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
}
/*
* flush everything
*/
static inline void __flush_tlb_all(void)
{
/*
* This is to catch users with enabled preemption and the PGE feature
* and don't trigger the warning in __native_flush_tlb().
*/
VM_WARN_ON_ONCE(preemptible());
if (boot_cpu_has(X86_FEATURE_PGE)) {
__flush_tlb_global();
} else {
/*
* !PGE -> !PCID (setup_pcid()), thus every flush is total.
*/
__flush_tlb();
}
}
/*
* flush one page in the kernel mapping
*/
static inline void __flush_tlb_one_kernel(unsigned long addr)
{
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
/*
* If PTI is off, then __flush_tlb_one_user() is just INVLPG or its
* paravirt equivalent. Even with PCID, this is sufficient: we only
* use PCID if we also use global PTEs for the kernel mapping, and
* INVLPG flushes global translations across all address spaces.
*
* If PTI is on, then the kernel is mapped with non-global PTEs, and
* __flush_tlb_one_user() will flush the given address for the current
* kernel address space and for its usermode counterpart, but it does
* not flush it for other address spaces.
*/
__flush_tlb_one_user(addr); /*covered*/
if (!static_cpu_has(X86_FEATURE_PTI))
return;
/*
* See above. We need to propagate the flush to all other address
* spaces. In principle, we only need to propagate it to kernelmode
* address spaces, but the extra bookkeeping we would need is not
* worth it.
*/
invalidate_other_asid();
}
#define TLB_FLUSH_ALL -1UL
/*
* TLB flushing:
*
* - flush_tlb_all() flushes all processes TLBs
* - flush_tlb_mm(mm) flushes the specified mm context TLB's
* - flush_tlb_page(vma, vmaddr) flushes one page
* - flush_tlb_range(vma, start, end) flushes a range of pages
* - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
* - flush_tlb_others(cpumask, info) flushes TLBs on other cpus
*
* ..but the i386 has somewhat limited tlb flushing capabilities,
* and page-granular flushes are available only on i486 and up.
*/
struct flush_tlb_info {
/*
* We support several kinds of flushes.
*
* - Fully flush a single mm. .mm will be set, .end will be
* TLB_FLUSH_ALL, and .new_tlb_gen will be the tlb_gen to
* which the IPI sender is trying to catch us up.
*
* - Partially flush a single mm. .mm will be set, .start and
* .end will indicate the range, and .new_tlb_gen will be set
* such that the changes between generation .new_tlb_gen-1 and
* .new_tlb_gen are entirely contained in the indicated range.
*
* - Fully flush all mms whose tlb_gens have been updated. .mm
* will be NULL, .end will be TLB_FLUSH_ALL, and .new_tlb_gen
* will be zero.
*/
struct mm_struct *mm;
unsigned long start;
unsigned long end;
u64 new_tlb_gen;
unsigned int stride_shift;
bool freed_tables;
};
#define local_flush_tlb() __flush_tlb()
#define flush_tlb_mm(mm) \
flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true)
#define flush_tlb_range(vma, start, end) \
flush_tlb_mm_range((vma)->vm_mm, start, end, \
((vma)->vm_flags & VM_HUGETLB) \
? huge_page_shift(hstate_vma(vma)) \
: PAGE_SHIFT, false)
extern void flush_tlb_all(void);
extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned int stride_shift,
bool freed_tables);
extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
{
flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false);
}
void native_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info);
static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
{
/*
* Bump the generation count. This also serves as a full barrier
* that synchronizes with switch_mm(): callers are required to order
* their read of mm_cpumask after their writes to the paging
* structures.
*/
return atomic64_inc_return(&mm->context.tlb_gen);
}
static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
struct mm_struct *mm)
{
inc_mm_tlb_gen(mm);
cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
}
extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
#ifndef CONFIG_PARAVIRT
#define flush_tlb_others(mask, info) \
native_flush_tlb_others(mask, info)
#define paravirt_tlb_remove_table(tlb, page) \
tlb_remove_page(tlb, (void *)(page))
#endif
#endif /* _ASM_X86_TLBFLUSH_H */
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM exceptions
#if !defined(_TRACE_PAGE_FAULT_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PAGE_FAULT_H
#include <linux/tracepoint.h>
#include <asm/trace/common.h>
extern int trace_pagefault_reg(void);
extern void trace_pagefault_unreg(void);
DECLARE_EVENT_CLASS(x86_exceptions,
TP_PROTO(unsigned long address, struct pt_regs *regs,
unsigned long error_code),
TP_ARGS(address, regs, error_code),
TP_STRUCT__entry(
__field( unsigned long, address )
__field( unsigned long, ip )
__field( unsigned long, error_code )
),
TP_fast_assign(
__entry->address = address;
__entry->ip = regs->ip;
__entry->error_code = error_code;
),
TP_printk("address=%pf ip=%pf error_code=0x%lx",
(void *)__entry->address, (void *)__entry->ip,
__entry->error_code) );
#define DEFINE_PAGE_FAULT_EVENT(name) \
DEFINE_EVENT_FN(x86_exceptions, name, \
TP_PROTO(unsigned long address, struct pt_regs *regs, \
unsigned long error_code), \
TP_ARGS(address, regs, error_code), \
trace_pagefault_reg, trace_pagefault_unreg);
DEFINE_PAGE_FAULT_EVENT(page_fault_user);
DEFINE_PAGE_FAULT_EVENT(page_fault_kernel); /*covered*/
#undef TRACE_INCLUDE_PATH
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_PATH .
#define TRACE_INCLUDE_FILE exceptions
#endif /* _TRACE_PAGE_FAULT_H */
/* This part must be outside protection */
#include <trace/define_trace.h>
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM exceptions
#if !defined(_TRACE_PAGE_FAULT_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PAGE_FAULT_H
#include <linux/tracepoint.h>
#include <asm/trace/common.h>
extern int trace_pagefault_reg(void);
extern void trace_pagefault_unreg(void);
DECLARE_EVENT_CLASS(x86_exceptions, /*covered*/
TP_PROTO(unsigned long address, struct pt_regs *regs,
unsigned long error_code),
TP_ARGS(address, regs, error_code),
TP_STRUCT__entry(
__field( unsigned long, address )
__field( unsigned long, ip )
__field( unsigned long, error_code )
),
TP_fast_assign(
__entry->address = address;
__entry->ip = regs->ip;
__entry->error_code = error_code;
),
TP_printk("address=%pf ip=%pf error_code=0x%lx",
(void *)__entry->address, (void *)__entry->ip,
__entry->error_code) );
#define DEFINE_PAGE_FAULT_EVENT(name) \
DEFINE_EVENT_FN(x86_exceptions, name, \
TP_PROTO(unsigned long address, struct pt_regs *regs, \
unsigned long error_code), \
TP_ARGS(address, regs, error_code), \
trace_pagefault_reg, trace_pagefault_unreg);
DEFINE_PAGE_FAULT_EVENT(page_fault_user);
DEFINE_PAGE_FAULT_EVENT(page_fault_kernel);
#undef TRACE_INCLUDE_PATH
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_PATH .
#define TRACE_INCLUDE_FILE exceptions
#endif /* _TRACE_PAGE_FAULT_H */
/* This part must be outside protection */
#include <trace/define_trace.h>
/* SPDX-License-Identifier: GPL-2.0 */
/*
* x86 TSC related functions
*/
#ifndef _ASM_X86_TSC_H
#define _ASM_X86_TSC_H
#include <asm/processor.h>
#define NS_SCALE 10 /* 2^10, carefully chosen */
#define US_SCALE 32 /* 2^32, arbitralrily chosen */
/*
* Standard way to access the cycle counter.
*/
typedef unsigned long long cycles_t;
extern unsigned int cpu_khz;
extern unsigned int tsc_khz;
extern void disable_TSC(void);
static inline cycles_t get_cycles(void)
{
#ifndef CONFIG_X86_TSC
if (!boot_cpu_has(X86_FEATURE_TSC))
return 0;
#endif
return rdtsc(); /*covered*/
}
extern struct system_counterval_t convert_art_to_tsc(u64 art);
extern struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns);
extern void tsc_early_init(void);
extern void tsc_init(void);
extern unsigned long calibrate_delay_is_known(void);
extern void mark_tsc_unstable(char *reason);
extern int unsynchronized_tsc(void);
extern int check_tsc_unstable(void);
extern void mark_tsc_async_resets(char *reason);
extern unsigned long native_calibrate_cpu_early(void);
extern unsigned long native_calibrate_tsc(void);
extern unsigned long long native_sched_clock_from_tsc(u64 tsc);
extern int tsc_clocksource_reliable;
#ifdef CONFIG_X86_TSC
extern bool tsc_async_resets;
#else
# define tsc_async_resets false
#endif
/*
* Boot-time check whether the TSCs are synchronized across
* all CPUs/cores:
*/
#ifdef CONFIG_X86_TSC
extern bool tsc_store_and_check_tsc_adjust(bool bootcpu);
extern void tsc_verify_tsc_adjust(bool resume);
extern void check_tsc_sync_source(int cpu);
extern void check_tsc_sync_target(void);
#else
static inline bool tsc_store_and_check_tsc_adjust(bool bootcpu) { return false; }
static inline void tsc_verify_tsc_adjust(bool resume) { }
static inline void check_tsc_sync_source(int cpu) { }
static inline void check_tsc_sync_target(void) { }
#endif
extern int notsc_setup(char *);
extern void tsc_save_sched_clock_state(void);
extern void tsc_restore_sched_clock_state(void);
unsigned long cpu_khz_from_msr(void);
#endif /* _ASM_X86_TSC_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_UACCESS_H
#define _ASM_X86_UACCESS_H
/*
* User space memory access functions
*/
#include <linux/compiler.h>
#include <linux/kasan-checks.h>
#include <linux/string.h>
#include <asm/asm.h>
#include <asm/page.h>
#include <asm/smap.h>
#include <asm/extable.h>
/*
* The fs value determines whether argument validity checking should be
* performed or not. If get_fs() == USER_DS, checking is performed, with
* get_fs() == KERNEL_DS, checking is bypassed.
*
* For historical reasons, these macros are grossly misnamed.
*/
#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) })
#define KERNEL_DS MAKE_MM_SEG(-1UL)
#define USER_DS MAKE_MM_SEG(TASK_SIZE_MAX)
#define get_ds() (KERNEL_DS)
#define get_fs() (current->thread.addr_limit)
static inline void set_fs(mm_segment_t fs)
{
current->thread.addr_limit = fs;
/* On user-mode return, check fs is correct */
set_thread_flag(TIF_FSCHECK);
}
#define segment_eq(a, b) ((a).seg == (b).seg)
#define user_addr_max() (current->thread.addr_limit.seg)
#define __addr_ok(addr) \
((unsigned long __force)(addr) < user_addr_max())
/*
* Test whether a block of memory is a valid user space address.
* Returns 0 if the range is valid, nonzero otherwise.
*/
static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, unsigned long limit)
{
/*
* If we have used "sizeof()" for the size,
* we know it won't overflow the limit (but
* it might overflow the 'addr', so it's
* important to subtract the size from the
* limit, not add it to the address).
*/
if (__builtin_constant_p(size))
return unlikely(addr > limit - size);
/* Arbitrary sizes? Be careful about overflow */
addr += size;
if (unlikely(addr < size))
return true;
return unlikely(addr > limit);
}
#define __range_not_ok(addr, size, limit) \
({ \
__chk_user_ptr(addr); \
__chk_range_not_ok((unsigned long __force)(addr), size, limit); \
})
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
# define WARN_ON_IN_IRQ() WARN_ON_ONCE(!in_task())
#else
# define WARN_ON_IN_IRQ()
#endif
/**
* access_ok: - Checks if a user space pointer is valid
* @addr: User space pointer to start of block to check
* @size: Size of block to check
*
* Context: User context only. This function may sleep if pagefaults are
* enabled.
*
* Checks if a pointer to a block of memory in user space is valid.
*
* Returns true (nonzero) if the memory block may be valid, false (zero)
* if it is definitely invalid.
*
* Note that, depending on architecture, this function probably just
* checks that the pointer is in the user space range - after calling
* this function, memory access functions may still return -EFAULT.
*/
#define access_ok(addr, size) \
({ \
WARN_ON_IN_IRQ(); \
likely(!__range_not_ok(addr, size, user_addr_max())); \
})
/*
* These are the main single-value transfer routines. They automatically
* use the right size if we just have the right pointer type.
*
* This gets kind of ugly. We want to return _two_ values in "get_user()"
* and yet we don't want to do any pointers, because that is too much
* of a performance impact. Thus we have a few rather ugly macros here,
* and hide all the ugliness from the user.
*
* The "__xxx" versions of the user access functions are versions that
* do not verify the address space, that must have been done previously
* with a separate "access_ok()" call (this is used when we do multiple
* accesses to the same area of user memory).
*/
extern int __get_user_1(void);
extern int __get_user_2(void);
extern int __get_user_4(void);
extern int __get_user_8(void);
extern int __get_user_bad(void);
#define __uaccess_begin() stac()
#define __uaccess_end() clac()
#define __uaccess_begin_nospec() \
({ \
stac(); \
barrier_nospec(); \
})
/*
* This is a type: either unsigned long, if the argument fits into
* that type, or otherwise unsigned long long.
*/
#define __inttype(x) \
__typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
/**
* get_user: - Get a simple variable from user space.
* @x: Variable to store result.
* @ptr: Source address, in user space.
*
* Context: User context only. This function may sleep if pagefaults are
* enabled.
*
* This macro copies a single simple variable from user space to kernel
* space. It supports simple types like char and int, but not larger
* data types like structures or arrays.
*
* @ptr must have pointer-to-simple-variable type, and the result of
* dereferencing @ptr must be assignable to @x without a cast.
*
* Returns zero on success, or -EFAULT on error.
* On error, the variable @x is set to zero.
*/
/*
* Careful: we have to cast the result to the type of the pointer
* for sign reasons.
*
* The use of _ASM_DX as the register specifier is a bit of a
* simplification, as gcc only cares about it as the starting point
* and not size: for a 64-bit value it will use %ecx:%edx on 32 bits
* (%ecx being the next register in gcc's x86 register sequence), and
* %rdx on 64 bits.
*
* Clang/LLVM cares about the size of the register, but still wants
* the base register for something that ends up being a pair.
*/
#define get_user(x, ptr) \
({ \
int __ret_gu; \
register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \
__chk_user_ptr(ptr); \
might_fault(); \
asm volatile("call __get_user_%P4" \
: "=a" (__ret_gu), "=r" (__val_gu), \
ASM_CALL_CONSTRAINT \
: "0" (ptr), "i" (sizeof(*(ptr)))); \
(x) = (__force __typeof__(*(ptr))) __val_gu; \
__builtin_expect(__ret_gu, 0); \
})
#define __put_user_x(size, x, ptr, __ret_pu) \
asm volatile("call __put_user_" #size : "=a" (__ret_pu) \
: "0" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx")
#ifdef CONFIG_X86_32
#define __put_user_goto_u64(x, addr, label) \
asm_volatile_goto("\n" \
"1: movl %%eax,0(%1)\n" \
"2: movl %%edx,4(%1)\n" \
_ASM_EXTABLE_UA(1b, %l2) \
_ASM_EXTABLE_UA(2b, %l2) \
: : "A" (x), "r" (addr) \
: : label)
#define __put_user_asm_ex_u64(x, addr) \
asm volatile("\n" \
"1: movl %%eax,0(%1)\n" \
"2: movl %%edx,4(%1)\n" \
"3:" \
_ASM_EXTABLE_EX(1b, 2b) \
_ASM_EXTABLE_EX(2b, 3b) \
: : "A" (x), "r" (addr))
#define __put_user_x8(x, ptr, __ret_pu) \
asm volatile("call __put_user_8" : "=a" (__ret_pu) \
: "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx")
#else
#define __put_user_goto_u64(x, ptr, label) \
__put_user_goto(x, ptr, "q", "", "er", label)
#define __put_user_asm_ex_u64(x, addr) \
__put_user_asm_ex(x, addr, "q", "", "er")
#define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu)
#endif
extern void __put_user_bad(void);
/*
* Strange magic calling convention: pointer in %ecx,
* value in %eax(:%edx), return value in %eax. clobbers %rbx
*/
extern void __put_user_1(void);
extern void __put_user_2(void);
extern void __put_user_4(void);
extern void __put_user_8(void);
/**
* put_user: - Write a simple value into user space.
* @x: Value to copy to user space.
* @ptr: Destination address, in user space.
*
* Context: User context only. This function may sleep if pagefaults are
* enabled.
*
* This macro copies a single simple value from kernel space to user
* space. It supports simple types like char and int, but not larger
* data types like structures or arrays.
*
* @ptr must have pointer-to-simple-variable type, and @x must be assignable
* to the result of dereferencing @ptr.
*
* Returns zero on success, or -EFAULT on error.
*/
#define put_user(x, ptr) \
({ \
int __ret_pu; \
__typeof__(*(ptr)) __pu_val; \
__chk_user_ptr(ptr); \
might_fault(); \
__pu_val = x; \
switch (sizeof(*(ptr))) { \
case 1: \
__put_user_x(1, __pu_val, ptr, __ret_pu); \
break; \
case 2: \
__put_user_x(2, __pu_val, ptr, __ret_pu); \
break; \
case 4: \
__put_user_x(4, __pu_val, ptr, __ret_pu); \
break; \
case 8: \
__put_user_x8(__pu_val, ptr, __ret_pu); \
break; \
default: \
__put_user_x(X, __pu_val, ptr, __ret_pu); \
break; \
} \
__builtin_expect(__ret_pu, 0); \
})
#define __put_user_size(x, ptr, size, label) \
do { \
__chk_user_ptr(ptr); \
switch (size) { \
case 1: \
__put_user_goto(x, ptr, "b", "b", "iq", label); \
break; \
case 2: \
__put_user_goto(x, ptr, "w", "w", "ir", label); \
break; \
case 4: \
__put_user_goto(x, ptr, "l", "k", "ir", label); \
break; \
case 8: \
__put_user_goto_u64((__typeof__(*ptr))(x), ptr, label); \
break; \
default: \
__put_user_bad(); \
} \
} while (0)
/*
* This doesn't do __uaccess_begin/end - the exception handling
* around it must do that.
*/
#define __put_user_size_ex(x, ptr, size) \
do { \
__chk_user_ptr(ptr); \
switch (size) { \
case 1: \
__put_user_asm_ex(x, ptr, "b", "b", "iq"); \
break; \
case 2: \
__put_user_asm_ex(x, ptr, "w", "w", "ir"); \
break; \
case 4: \
__put_user_asm_ex(x, ptr, "l", "k", "ir"); \
break; \
case 8: \
__put_user_asm_ex_u64((__typeof__(*ptr))(x), ptr); \
break; \
default: \
__put_user_bad(); \
} \
} while (0)
#ifdef CONFIG_X86_32
#define __get_user_asm_u64(x, ptr, retval, errret) \
({ \
__typeof__(ptr) __ptr = (ptr); \
asm volatile("\n" \
"1: movl %2,%%eax\n" \
"2: movl %3,%%edx\n" \
"3:\n" \
".section .fixup,\"ax\"\n" \
"4: mov %4,%0\n" \
" xorl %%eax,%%eax\n" \
" xorl %%edx,%%edx\n" \
" jmp 3b\n" \
".previous\n" \
_ASM_EXTABLE_UA(1b, 4b) \
_ASM_EXTABLE_UA(2b, 4b) \
: "=r" (retval), "=&A"(x) \
: "m" (__m(__ptr)), "m" __m(((u32 __user *)(__ptr)) + 1), \
"i" (errret), "0" (retval)); \
})
#define __get_user_asm_ex_u64(x, ptr) (x) = __get_user_bad()
#else
#define __get_user_asm_u64(x, ptr, retval, errret) \
__get_user_asm(x, ptr, retval, "q", "", "=r", errret)
#define __get_user_asm_ex_u64(x, ptr) \
__get_user_asm_ex(x, ptr, "q", "", "=r")
#endif
#define __get_user_size(x, ptr, size, retval, errret) \
do { \
retval = 0; \
__chk_user_ptr(ptr); \
switch (size) { \
case 1: \
__get_user_asm(x, ptr, retval, "b", "b", "=q", errret); \
break; \
case 2: \
__get_user_asm(x, ptr, retval, "w", "w", "=r", errret); \
break; \
case 4: \
__get_user_asm(x, ptr, retval, "l", "k", "=r", errret); \
break; \
case 8: \
__get_user_asm_u64(x, ptr, retval, errret); \
break; \
default: \
(x) = __get_user_bad(); \
} \
} while (0)
#define __get_user_asm(x, addr, err, itype, rtype, ltype, errret) \
asm volatile("\n" \
"1: mov"itype" %2,%"rtype"1\n" \
"2:\n" \
".section .fixup,\"ax\"\n" \
"3: mov %3,%0\n" \
" xor"itype" %"rtype"1,%"rtype"1\n" \
" jmp 2b\n" \
".previous\n" \
_ASM_EXTABLE_UA(1b, 3b) \
: "=r" (err), ltype(x) \
: "m" (__m(addr)), "i" (errret), "0" (err))
#define __get_user_asm_nozero(x, addr, err, itype, rtype, ltype, errret) \
asm volatile("\n" \
"1: mov"itype" %2,%"rtype"1\n" \
"2:\n" \
".section .fixup,\"ax\"\n" \
"3: mov %3,%0\n" \
" jmp 2b\n" \
".previous\n" \
_ASM_EXTABLE_UA(1b, 3b) \
: "=r" (err), ltype(x) \
: "m" (__m(addr)), "i" (errret), "0" (err))
/*
* This doesn't do __uaccess_begin/end - the exception handling
* around it must do that.
*/
#define __get_user_size_ex(x, ptr, size) \
do { \
__chk_user_ptr(ptr); \
switch (size) { \
case 1: \
__get_user_asm_ex(x, ptr, "b", "b", "=q"); \
break; \
case 2: \
__get_user_asm_ex(x, ptr, "w", "w", "=r"); \
break; \
case 4: \
__get_user_asm_ex(x, ptr, "l", "k", "=r"); \
break; \
case 8: \
__get_user_asm_ex_u64(x, ptr); \
break; \
default: \
(x) = __get_user_bad(); \
} \
} while (0)
#define __get_user_asm_ex(x, addr, itype, rtype, ltype) \
asm volatile("1: mov"itype" %1,%"rtype"0\n" \
"2:\n" \
".section .fixup,\"ax\"\n" \
"3:xor"itype" %"rtype"0,%"rtype"0\n" \
" jmp 2b\n" \
".previous\n" \
_ASM_EXTABLE_EX(1b, 3b) \
: ltype(x) : "m" (__m(addr)))
#define __put_user_nocheck(x, ptr, size) \
({ \
__label__ __pu_label; \
int __pu_err = -EFAULT; \
__uaccess_begin(); \
__put_user_size((x), (ptr), (size), __pu_label); \
__pu_err = 0; \
__pu_label: \
__uaccess_end(); \
__builtin_expect(__pu_err, 0); \
})
#define __get_user_nocheck(x, ptr, size) \
({ \
int __gu_err; \
__inttype(*(ptr)) __gu_val; \
__uaccess_begin_nospec(); \
__get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \
__uaccess_end(); \
(x) = (__force __typeof__(*(ptr)))__gu_val; \
__builtin_expect(__gu_err, 0); \
})
/* FIXME: this hack is definitely wrong -AK */
struct __large_struct { unsigned long buf[100]; };
#define __m(x) (*(struct __large_struct __user *)(x))
/*
* Tell gcc we read from memory instead of writing: this is because
* we do not write to any memory gcc knows about, so there are no
* aliasing issues.
*/
#define __put_user_goto(x, addr, itype, rtype, ltype, label) \
asm_volatile_goto("\n" \
"1: mov"itype" %"rtype"0,%1\n" \
_ASM_EXTABLE_UA(1b, %l2) \
: : ltype(x), "m" (__m(addr)) \
: : label)
#define __put_user_failed(x, addr, itype, rtype, ltype, errret) \
({ __label__ __puflab; \
int __pufret = errret; \
__put_user_goto(x,addr,itype,rtype,ltype,__puflab); \
__pufret = 0; \
__puflab: __pufret; })
#define __put_user_asm(x, addr, retval, itype, rtype, ltype, errret) do { \
retval = __put_user_failed(x, addr, itype, rtype, ltype, errret); \
} while (0)
#define __put_user_asm_ex(x, addr, itype, rtype, ltype) \
asm volatile("1: mov"itype" %"rtype"0,%1\n" \
"2:\n" \
_ASM_EXTABLE_EX(1b, 2b) \
: : ltype(x), "m" (__m(addr)))
/*
* uaccess_try and catch
*/
#define uaccess_try do { \
current->thread.uaccess_err = 0; \
__uaccess_begin(); \
barrier();
#define uaccess_try_nospec do { \
current->thread.uaccess_err = 0; \
__uaccess_begin_nospec(); \
#define uaccess_catch(err) \
__uaccess_end(); \
(err) |= (current->thread.uaccess_err ? -EFAULT : 0); \
} while (0)
/**
* __get_user: - Get a simple variable from user space, with less checking.
* @x: Variable to store result.
* @ptr: Source address, in user space.
*
* Context: User context only. This function may sleep if pagefaults are
* enabled.
*
* This macro copies a single simple variable from user space to kernel
* space. It supports simple types like char and int, but not larger
* data types like structures or arrays.
*
* @ptr must have pointer-to-simple-variable type, and the result of
* dereferencing @ptr must be assignable to @x without a cast.
*
* Caller must check the pointer with access_ok() before calling this
* function.
*
* Returns zero on success, or -EFAULT on error.
* On error, the variable @x is set to zero.
*/
#define __get_user(x, ptr) \
__get_user_nocheck((x), (ptr), sizeof(*(ptr)))
/**
* __put_user: - Write a simple value into user space, with less checking.
* @x: Value to copy to user space.
* @ptr: Destination address, in user space.
*
* Context: User context only. This function may sleep if pagefaults are
* enabled.
*
* This macro copies a single simple value from kernel space to user
* space. It supports simple types like char and int, but not larger
* data types like structures or arrays.
*
* @ptr must have pointer-to-simple-variable type, and @x must be assignable
* to the result of dereferencing @ptr.
*
* Caller must check the pointer with access_ok() before calling this
* function.
*
* Returns zero on success, or -EFAULT on error.
*/
#define __put_user(x, ptr) \
__put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)))
/*
* {get|put}_user_try and catch
*
* get_user_try {
* get_user_ex(...);
* } get_user_catch(err)
*/
#define get_user_try uaccess_try_nospec
#define get_user_catch(err) uaccess_catch(err)
#define get_user_ex(x, ptr) do { \
unsigned long __gue_val; \
__get_user_size_ex((__gue_val), (ptr), (sizeof(*(ptr)))); \
(x) = (__force __typeof__(*(ptr)))__gue_val; \
} while (0)
#define put_user_try uaccess_try
#define put_user_catch(err) uaccess_catch(err)
#define put_user_ex(x, ptr) \
__put_user_size_ex((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)))
extern unsigned long
copy_from_user_nmi(void *to, const void __user *from, unsigned long n);
extern __must_check long
strncpy_from_user(char *dst, const char __user *src, long count);
extern __must_check long strnlen_user(const char __user *str, long n);
unsigned long __must_check clear_user(void __user *mem, unsigned long len);
unsigned long __must_check __clear_user(void __user *mem, unsigned long len);
extern void __cmpxchg_wrong_size(void)
__compiletime_error("Bad argument size for cmpxchg");
#define __user_atomic_cmpxchg_inatomic(uval, ptr, old, new, size) \
({ \
int __ret = 0; \
__typeof__(ptr) __uval = (uval); \
__typeof__(*(ptr)) __old = (old); \
__typeof__(*(ptr)) __new = (new); \
__uaccess_begin_nospec(); \
switch (size) { \
case 1: \
{ \
asm volatile("\n" \
"1:\t" LOCK_PREFIX "cmpxchgb %4, %2\n" \
"2:\n" \
"\t.section .fixup, \"ax\"\n" \
"3:\tmov %3, %0\n" \
"\tjmp 2b\n" \
"\t.previous\n" \
_ASM_EXTABLE_UA(1b, 3b) \
: "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \
: "i" (-EFAULT), "q" (__new), "1" (__old) \
: "memory" \
); \
break; \
} \
case 2: \
{ \
asm volatile("\n" \
"1:\t" LOCK_PREFIX "cmpxchgw %4, %2\n" \
"2:\n" \
"\t.section .fixup, \"ax\"\n" \
"3:\tmov %3, %0\n" \
"\tjmp 2b\n" \
"\t.previous\n" \
_ASM_EXTABLE_UA(1b, 3b) \
: "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \
: "i" (-EFAULT), "r" (__new), "1" (__old) \
: "memory" \
); \
break; \
} \
case 4: \
{ \
asm volatile("\n" \
"1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" \
"2:\n" \
"\t.section .fixup, \"ax\"\n" \
"3:\tmov %3, %0\n" \
"\tjmp 2b\n" \
"\t.previous\n" \
_ASM_EXTABLE_UA(1b, 3b) \
: "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \
: "i" (-EFAULT), "r" (__new), "1" (__old) \
: "memory" \
); \
break; \
} \
case 8: \
{ \
if (!IS_ENABLED(CONFIG_X86_64)) \
__cmpxchg_wrong_size(); \
\
asm volatile("\n" \
"1:\t" LOCK_PREFIX "cmpxchgq %4, %2\n" \
"2:\n" \
"\t.section .fixup, \"ax\"\n" \
"3:\tmov %3, %0\n" \
"\tjmp 2b\n" \
"\t.previous\n" \
_ASM_EXTABLE_UA(1b, 3b) \
: "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \
: "i" (-EFAULT), "r" (__new), "1" (__old) \
: "memory" \
); \
break; \
} \
default: \
__cmpxchg_wrong_size(); \
} \
__uaccess_end(); \
*__uval = __old; \
__ret; \
})
#define user_atomic_cmpxchg_inatomic(uval, ptr, old, new) \
({ \
access_ok((ptr), sizeof(*(ptr))) ? \
__user_atomic_cmpxchg_inatomic((uval), (ptr), \
(old), (new), sizeof(*(ptr))) : \
-EFAULT; \
})
/*
* movsl can be slow when source and dest are not both 8-byte aligned
*/
#ifdef CONFIG_X86_INTEL_USERCOPY
extern struct movsl_mask {
int mask;
} ____cacheline_aligned_in_smp movsl_mask;
#endif
#define ARCH_HAS_NOCACHE_UACCESS 1
#ifdef CONFIG_X86_32
# include <asm/uaccess_32.h>
#else
# include <asm/uaccess_64.h>
#endif
/*
* We rely on the nested NMI work to allow atomic faults from the NMI path; the
* nested NMI paths are careful to preserve CR2.
*
* Caller must use pagefault_enable/disable, or run in interrupt context,
* and also do a uaccess_ok() check
*/
#define __copy_from_user_nmi __copy_from_user_inatomic
/*
* The "unsafe" user accesses aren't really "unsafe", but the naming
* is a big fat warning: you have to not only do the access_ok()
* checking before using them, but you have to surround them with the
* user_access_begin/end() pair.
*/
static __must_check inline bool user_access_begin(const void __user *ptr, size_t len)
{
if (unlikely(!access_ok(ptr,len))) /*covered*/
return 0;
__uaccess_begin_nospec(); /*covered*/
return 1;
}
#define user_access_begin(a,b) user_access_begin(a,b)
#define user_access_end() __uaccess_end()
#define unsafe_put_user(x, ptr, label) \
__put_user_size((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)), label)
#define unsafe_get_user(x, ptr, err_label) \
do { \
int __gu_err; \
__inttype(*(ptr)) __gu_val; \
__get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err, -EFAULT); \
(x) = (__force __typeof__(*(ptr)))__gu_val; \
if (unlikely(__gu_err)) goto err_label; \
} while (0)
#endif /* _ASM_X86_UACCESS_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_UACCESS_64_H
#define _ASM_X86_UACCESS_64_H
/*
* User space memory access functions
*/
#include <linux/compiler.h>
#include <linux/lockdep.h>
#include <linux/kasan-checks.h>
#include <asm/alternative.h>
#include <asm/cpufeatures.h>
#include <asm/page.h>
/*
* Copy To/From Userspace
*/
/* Handles exceptions in both to and from, but doesn't do access_ok */
__must_check unsigned long
copy_user_enhanced_fast_string(void *to, const void *from, unsigned len);
__must_check unsigned long
copy_user_generic_string(void *to, const void *from, unsigned len);
__must_check unsigned long
copy_user_generic_unrolled(void *to, const void *from, unsigned len);
static __always_inline __must_check unsigned long
copy_user_generic(void *to, const void *from, unsigned len)
{
unsigned ret;
/*
* If CPU has ERMS feature, use copy_user_enhanced_fast_string.
* Otherwise, if CPU has rep_good feature, use copy_user_generic_string.
* Otherwise, use copy_user_generic_unrolled.
*/
alternative_call_2(copy_user_generic_unrolled,
copy_user_generic_string,
X86_FEATURE_REP_GOOD,
copy_user_enhanced_fast_string,
X86_FEATURE_ERMS,
ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from),
"=d" (len)),
"1" (to), "2" (from), "3" (len)
: "memory", "rcx", "r8", "r9", "r10", "r11");
return ret;
}
static __always_inline __must_check unsigned long
copy_to_user_mcsafe(void *to, const void *from, unsigned len)
{
unsigned long ret;
__uaccess_begin();
/*
* Note, __memcpy_mcsafe() is explicitly used since it can
* handle exceptions / faults. memcpy_mcsafe() may fall back to
* memcpy() which lacks this handling.
*/
ret = __memcpy_mcsafe(to, from, len);
__uaccess_end();
return ret;
}
static __always_inline __must_check unsigned long
raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
{
int ret = 0;
if (!__builtin_constant_p(size))
return copy_user_generic(dst, (__force void *)src, size); /*covered*/
switch (size) {
case 1:
__uaccess_begin_nospec();
__get_user_asm_nozero(*(u8 *)dst, (u8 __user *)src,
ret, "b", "b", "=q", 1);
__uaccess_end();
return ret;
case 2:
__uaccess_begin_nospec();
__get_user_asm_nozero(*(u16 *)dst, (u16 __user *)src,
ret, "w", "w", "=r", 2);
__uaccess_end();
return ret;
case 4:
__uaccess_begin_nospec();
__get_user_asm_nozero(*(u32 *)dst, (u32 __user *)src,
ret, "l", "k", "=r", 4);
__uaccess_end();
return ret;
case 8:
__uaccess_begin_nospec();
__get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
ret, "q", "", "=r", 8);
__uaccess_end();
return ret;
case 10:
__uaccess_begin_nospec();
__get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
ret, "q", "", "=r", 10);
if (likely(!ret))
__get_user_asm_nozero(*(u16 *)(8 + (char *)dst),
(u16 __user *)(8 + (char __user *)src),
ret, "w", "w", "=r", 2);
__uaccess_end();
return ret;
case 16:
__uaccess_begin_nospec();
__get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
ret, "q", "", "=r", 16);
if (likely(!ret))
__get_user_asm_nozero(*(u64 *)(8 + (char *)dst),
(u64 __user *)(8 + (char __user *)src),
ret, "q", "", "=r", 8);
__uaccess_end();
return ret;
default:
return copy_user_generic(dst, (__force void *)src, size);
}
}
static __always_inline __must_check unsigned long
raw_copy_to_user(void __user *dst, const void *src, unsigned long size)
{
int ret = 0;
if (!__builtin_constant_p(size))
return copy_user_generic((__force void *)dst, src, size);
switch (size) {
case 1:
__uaccess_begin();
__put_user_asm(*(u8 *)src, (u8 __user *)dst,
ret, "b", "b", "iq", 1);
__uaccess_end();
return ret;
case 2:
__uaccess_begin();
__put_user_asm(*(u16 *)src, (u16 __user *)dst,
ret, "w", "w", "ir", 2);
__uaccess_end();
return ret;
case 4:
__uaccess_begin();
__put_user_asm(*(u32 *)src, (u32 __user *)dst,
ret, "l", "k", "ir", 4);
__uaccess_end();
return ret;
case 8:
__uaccess_begin();
__put_user_asm(*(u64 *)src, (u64 __user *)dst,
ret, "q", "", "er", 8);
__uaccess_end();
return ret;
case 10:
__uaccess_begin();
__put_user_asm(*(u64 *)src, (u64 __user *)dst,
ret, "q", "", "er", 10);
if (likely(!ret)) {
asm("":::"memory");
__put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst,
ret, "w", "w", "ir", 2);
}
__uaccess_end();
return ret;
case 16:
__uaccess_begin();
__put_user_asm(*(u64 *)src, (u64 __user *)dst,
ret, "q", "", "er", 16);
if (likely(!ret)) {
asm("":::"memory");
__put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst,
ret, "q", "", "er", 8);
}
__uaccess_end();
return ret;
default:
return copy_user_generic((__force void *)dst, src, size);
}
}
static __always_inline __must_check
unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigned long size)
{
return copy_user_generic((__force void *)dst,
(__force void *)src, size);
}
extern long __copy_user_nocache(void *dst, const void __user *src,
unsigned size, int zerorest);
extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size);
extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
size_t len);
static inline int
__copy_from_user_inatomic_nocache(void *dst, const void __user *src,
unsigned size)
{
kasan_check_write(dst, size);
return __copy_user_nocache(dst, src, size, 0);
}
static inline int
__copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
{
kasan_check_write(dst, size);
return __copy_user_flushcache(dst, src, size);
}
unsigned long
copy_user_handle_tail(char *to, char *from, unsigned len);
unsigned long
mcsafe_handle_tail(char *to, char *from, unsigned len);
#endif /* _ASM_X86_UACCESS_64_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_UNWIND_H
#define _ASM_X86_UNWIND_H
#include <linux/sched.h>
#include <linux/ftrace.h>
#include <asm/ptrace.h>
#include <asm/stacktrace.h>
#define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip))
#define IRET_FRAME_SIZE (sizeof(struct pt_regs) - IRET_FRAME_OFFSET)
struct unwind_state {
struct stack_info stack_info;
unsigned long stack_mask;
struct task_struct *task;
int graph_idx;
bool error;
#if defined(CONFIG_UNWINDER_ORC)
bool signal, full_regs;
unsigned long sp, bp, ip;
struct pt_regs *regs;
#elif defined(CONFIG_UNWINDER_FRAME_POINTER)
bool got_irq;
unsigned long *bp, *orig_sp, ip;
struct pt_regs *regs;
#else
unsigned long *sp;
#endif
};
void __unwind_start(struct unwind_state *state, struct task_struct *task,
struct pt_regs *regs, unsigned long *first_frame);
bool unwind_next_frame(struct unwind_state *state);
unsigned long unwind_get_return_address(struct unwind_state *state);
unsigned long *unwind_get_return_address_ptr(struct unwind_state *state);
static inline bool unwind_done(struct unwind_state *state)
{
return state->stack_info.type == STACK_TYPE_UNKNOWN;
}
static inline bool unwind_error(struct unwind_state *state)
{
return state->error;
}
static inline
void unwind_start(struct unwind_state *state, struct task_struct *task,
struct pt_regs *regs, unsigned long *first_frame)
{
first_frame = first_frame ? : get_stack_pointer(task, regs); /*covered*/
__unwind_start(state, task, regs, first_frame); /*covered*/
}
#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
/*
* If 'partial' returns true, only the iret frame registers are valid.
*/
static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state,
bool *partial)
{
if (unwind_done(state))
return NULL;
if (partial) {
#ifdef CONFIG_UNWINDER_ORC
*partial = !state->full_regs;
#else
*partial = false;
#endif
}
return state->regs;
}
#else
static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state,
bool *partial)
{
return NULL;
}
#endif
#ifdef CONFIG_UNWINDER_ORC
void unwind_init(void);
void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
void *orc, size_t orc_size);
#else
static inline void unwind_init(void) {}
static inline
void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
void *orc, size_t orc_size) {}
#endif
/*
* This disables KASAN checking when reading a value from another task's stack,
* since the other task could be running on another CPU and could have poisoned
* the stack in the meantime.
*/
#define READ_ONCE_TASK_STACK(task, x) \
({ \
unsigned long val; \
if (task == current) \
val = READ_ONCE(x); \
else \
val = READ_ONCE_NOCHECK(x); \
val; \
})
static inline bool task_on_another_cpu(struct task_struct *task)
{
#ifdef CONFIG_SMP
return task != current && task->on_cpu;
#else
return false;
#endif
}
#endif /* _ASM_X86_UNWIND_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_WORD_AT_A_TIME_H
#define _ASM_WORD_AT_A_TIME_H
#include <linux/kernel.h>
/*
* This is largely generic for little-endian machines, but the
* optimal byte mask counting is probably going to be something
* that is architecture-specific. If you have a reliably fast
* bit count instruction, that might be better than the multiply
* and shift, for example.
*/
struct word_at_a_time {
const unsigned long one_bits, high_bits;
};
#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) }
#ifdef CONFIG_64BIT
/*
* Jan Achrenius on G+: microoptimized version of
* the simpler "(mask & ONEBYTES) * ONEBYTES >> 56"
* that works for the bytemasks without having to
* mask them first.
*/
static inline long count_masked_bytes(unsigned long mask)
{
return mask*0x0001020304050608ul >> 56;
}
#else /* 32-bit case */
/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */
static inline long count_masked_bytes(long mask)
{
/* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */
long a = (0x0ff0001+mask) >> 23;
/* Fix the 1 for 00 case */
return a & mask;
}
#endif
/* Return nonzero if it has a zero */
static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c)
{
unsigned long mask = ((a - c->one_bits) & ~a) & c->high_bits; /*covered*/
*bits = mask;
return mask;
}
static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits, const struct word_at_a_time *c)
{
return bits;
}
static inline unsigned long create_zero_mask(unsigned long bits)
{
bits = (bits - 1) & ~bits; /*covered*/
return bits >> 7;
}
/* The mask we created is directly usable as a bytemask */
#define zero_bytemask(mask) (mask)
static inline unsigned long find_zero(unsigned long mask)
{
return count_masked_bytes(mask);
}
/*
* Load an unaligned word from kernel space.
*
* In the (very unlikely) case of the word being a page-crosser
* and the next page not being mapped, take the exception and
* return zeroes in the non-existing part.
*/
static inline unsigned long load_unaligned_zeropad(const void *addr)
{
unsigned long ret, dummy;
asm( /*covered*/
"1:\tmov %2,%0\n"
"2:\n"
".section .fixup,\"ax\"\n"
"3:\t"
"lea %2,%1\n\t"
"and %3,%1\n\t"
"mov (%1),%0\n\t"
"leal %2,%%ecx\n\t"
"andl %4,%%ecx\n\t"
"shll $3,%%ecx\n\t"
"shr %%cl,%0\n\t"
"jmp 2b\n"
".previous\n"
_ASM_EXTABLE(1b, 3b)
:"=&r" (ret),"=&c" (dummy)
:"m" (*(unsigned long *)addr),
"i" (-sizeof(unsigned long)),
"i" (sizeof(unsigned long)-1));
return ret;
}
#endif /* _ASM_WORD_AT_A_TIME_H */
/*
* Copyright (C) 1994 Linus Torvalds
*
* Pentium III FXSR, SSE support
* General FPU state handling cleanups
* Gareth Hughes <gareth@valinux.com>, May 2000
*/
#include <asm/fpu/internal.h>
#include <asm/fpu/regset.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/types.h>
#include <asm/traps.h>
#include <asm/irq_regs.h>
#include <linux/hardirq.h>
#include <linux/pkeys.h>
#define CREATE_TRACE_POINTS
#include <asm/trace/fpu.h>
/*
* Represents the initial FPU state. It's mostly (but not completely) zeroes,
* depending on the FPU hardware format:
*/
union fpregs_state init_fpstate __read_mostly;
/*
* Track whether the kernel is using the FPU state
* currently.
*
* This flag is used:
*
* - by IRQ context code to potentially use the FPU
* if it's unused.
*
* - to debug kernel_fpu_begin()/end() correctness
*/
static DEFINE_PER_CPU(bool, in_kernel_fpu);
/*
* Track which context is using the FPU on the CPU:
*/
DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
static void kernel_fpu_disable(void)
{
WARN_ON_FPU(this_cpu_read(in_kernel_fpu)); /*covered*/
this_cpu_write(in_kernel_fpu, true); /*covered*/
}
static void kernel_fpu_enable(void)
{
WARN_ON_FPU(!this_cpu_read(in_kernel_fpu)); /*covered*/
this_cpu_write(in_kernel_fpu, false); /*covered*/
}
static bool kernel_fpu_disabled(void)
{
return this_cpu_read(in_kernel_fpu); /*covered*/
}
static bool interrupted_kernel_fpu_idle(void)
{
return !kernel_fpu_disabled(); /*covered*/
}
/*
* Were we in user mode (or vm86 mode) when we were
* interrupted?
*
* Doing kernel_fpu_begin/end() is ok if we are running
* in an interrupt context from user mode - we'll just
* save the FPU state as required.
*/
static bool interrupted_user_mode(void)
{
struct pt_regs *regs = get_irq_regs(); /*covered*/
return regs && user_mode(regs);
}
/*
* Can we use the FPU in kernel mode with the
* whole "kernel_fpu_begin/end()" sequence?
*
* It's always ok in process context (ie "not interrupt")
* but it is sometimes ok even from an irq.
*/
bool irq_fpu_usable(void)
{
return !in_interrupt() || /*covered*/
interrupted_user_mode() || /*covered*/
interrupted_kernel_fpu_idle(); /*covered*/
}
EXPORT_SYMBOL(irq_fpu_usable);
static void __kernel_fpu_begin(void)
{
struct fpu *fpu = ¤t->thread.fpu;
WARN_ON_FPU(!irq_fpu_usable()); /*covered*/
kernel_fpu_disable(); /*covered*/
if (fpu->initialized) {
/*
* Ignore return value -- we don't care if reg state
* is clobbered.
*/
copy_fpregs_to_fpstate(fpu); /*covered*/
} else {
__cpu_invalidate_fpregs_state();
}
}
static void __kernel_fpu_end(void)
{
struct fpu *fpu = ¤t->thread.fpu; /*covered*/
if (fpu->initialized)
copy_kernel_to_fpregs(&fpu->state); /*covered*/
kernel_fpu_enable(); /*covered*/
}
void kernel_fpu_begin(void)
{
preempt_disable(); /*covered*/
__kernel_fpu_begin(); /*covered*/
} /*covered*/
EXPORT_SYMBOL_GPL(kernel_fpu_begin);
void kernel_fpu_end(void)
{
__kernel_fpu_end(); /*covered*/
preempt_enable(); /*covered*/
} /*covered*/
EXPORT_SYMBOL_GPL(kernel_fpu_end);
/*
* Save the FPU state (mark it for reload if necessary):
*
* This only ever gets called for the current task.
*/
void fpu__save(struct fpu *fpu)
{
WARN_ON_FPU(fpu != ¤t->thread.fpu);
preempt_disable();
trace_x86_fpu_before_save(fpu);
if (fpu->initialized) {
if (!copy_fpregs_to_fpstate(fpu)) {
copy_kernel_to_fpregs(&fpu->state);
}
}
trace_x86_fpu_after_save(fpu);
preempt_enable();
}
EXPORT_SYMBOL_GPL(fpu__save);
/*
* Legacy x87 fpstate state init:
*/
static inline void fpstate_init_fstate(struct fregs_state *fp)
{
fp->cwd = 0xffff037fu;
fp->swd = 0xffff0000u;
fp->twd = 0xffffffffu;
fp->fos = 0xffff0000u;
}
void fpstate_init(union fpregs_state *state)
{
if (!static_cpu_has(X86_FEATURE_FPU)) {
fpstate_init_soft(&state->soft);
return;
}
memset(state, 0, fpu_kernel_xstate_size);
if (static_cpu_has(X86_FEATURE_XSAVES))
fpstate_init_xstate(&state->xsave);
if (static_cpu_has(X86_FEATURE_FXSR))
fpstate_init_fxstate(&state->fxsave);
else
fpstate_init_fstate(&state->fsave);
}
EXPORT_SYMBOL_GPL(fpstate_init);
int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
{
dst_fpu->last_cpu = -1;
if (!src_fpu->initialized || !static_cpu_has(X86_FEATURE_FPU))
return 0;
WARN_ON_FPU(src_fpu != ¤t->thread.fpu);
/*
* Don't let 'init optimized' areas of the XSAVE area
* leak into the child task:
*/
memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size);
/*
* Save current FPU registers directly into the child
* FPU context, without any memory-to-memory copying.
*
* ( The function 'fails' in the FNSAVE case, which destroys
* register contents so we have to copy them back. )
*/
if (!copy_fpregs_to_fpstate(dst_fpu)) {
memcpy(&src_fpu->state, &dst_fpu->state, fpu_kernel_xstate_size);
copy_kernel_to_fpregs(&src_fpu->state);
}
trace_x86_fpu_copy_src(src_fpu);
trace_x86_fpu_copy_dst(dst_fpu);
return 0;
}
/*
* Activate the current task's in-memory FPU context,
* if it has not been used before:
*/
void fpu__initialize(struct fpu *fpu)
{
WARN_ON_FPU(fpu != ¤t->thread.fpu);
if (!fpu->initialized) {
fpstate_init(&fpu->state);
trace_x86_fpu_init_state(fpu);
trace_x86_fpu_activate_state(fpu);
/* Safe to do for the current task: */
fpu->initialized = 1;
}
}
EXPORT_SYMBOL_GPL(fpu__initialize);
/*
* This function must be called before we read a task's fpstate.
*
* There's two cases where this gets called:
*
* - for the current task (when coredumping), in which case we have
* to save the latest FPU registers into the fpstate,
*
* - or it's called for stopped tasks (ptrace), in which case the
* registers were already saved by the context-switch code when
* the task scheduled out - we only have to initialize the registers
* if they've never been initialized.
*
* If the task has used the FPU before then save it.
*/
void fpu__prepare_read(struct fpu *fpu)
{
if (fpu == ¤t->thread.fpu) {
fpu__save(fpu);
} else {
if (!fpu->initialized) {
fpstate_init(&fpu->state);
trace_x86_fpu_init_state(fpu);
trace_x86_fpu_activate_state(fpu);
/* Safe to do for current and for stopped child tasks: */
fpu->initialized = 1;
}
}
}
/*
* This function must be called before we write a task's fpstate.
*
* If the task has used the FPU before then invalidate any cached FPU registers.
* If the task has not used the FPU before then initialize its fpstate.
*
* After this function call, after registers in the fpstate are
* modified and the child task has woken up, the child task will
* restore the modified FPU state from the modified context. If we
* didn't clear its cached status here then the cached in-registers
* state pending on its former CPU could be restored, corrupting
* the modifications.
*/
void fpu__prepare_write(struct fpu *fpu)
{
/*
* Only stopped child tasks can be used to modify the FPU
* state in the fpstate buffer:
*/
WARN_ON_FPU(fpu == ¤t->thread.fpu);
if (fpu->initialized) {
/* Invalidate any cached state: */
__fpu_invalidate_fpregs_state(fpu);
} else {
fpstate_init(&fpu->state);
trace_x86_fpu_init_state(fpu);
trace_x86_fpu_activate_state(fpu);
/* Safe to do for stopped child tasks: */
fpu->initialized = 1;
}
}
/*
* 'fpu__restore()' is called to copy FPU registers from
* the FPU fpstate to the live hw registers and to activate
* access to the hardware registers, so that FPU instructions
* can be used afterwards.
*
* Must be called with kernel preemption disabled (for example
* with local interrupts disabled, as it is in the case of
* do_device_not_available()).
*/
void fpu__restore(struct fpu *fpu)
{
fpu__initialize(fpu);
/* Avoid __kernel_fpu_begin() right after fpregs_activate() */
kernel_fpu_disable();
trace_x86_fpu_before_restore(fpu);
fpregs_activate(fpu);
copy_kernel_to_fpregs(&fpu->state);
trace_x86_fpu_after_restore(fpu);
kernel_fpu_enable();
}
EXPORT_SYMBOL_GPL(fpu__restore);
/*
* Drops current FPU state: deactivates the fpregs and
* the fpstate. NOTE: it still leaves previous contents
* in the fpregs in the eager-FPU case.
*
* This function can be used in cases where we know that
* a state-restore is coming: either an explicit one,
* or a reschedule.
*/
void fpu__drop(struct fpu *fpu)
{
preempt_disable();
if (fpu == ¤t->thread.fpu) {
if (fpu->initialized) {
/* Ignore delayed exceptions from user space */
asm volatile("1: fwait\n"
"2:\n"
_ASM_EXTABLE(1b, 2b));
fpregs_deactivate(fpu);
}
}
fpu->initialized = 0;
trace_x86_fpu_dropped(fpu);
preempt_enable();
}
/*
* Clear FPU registers by setting them up from
* the init fpstate:
*/
static inline void copy_init_fpstate_to_fpregs(void)
{
if (use_xsave())
copy_kernel_to_xregs(&init_fpstate.xsave, -1);
else if (static_cpu_has(X86_FEATURE_FXSR))
copy_kernel_to_fxregs(&init_fpstate.fxsave);
else
copy_kernel_to_fregs(&init_fpstate.fsave);
if (boot_cpu_has(X86_FEATURE_OSPKE))
copy_init_pkru_to_fpregs();
}
/*
* Clear the FPU state back to init state.
*
* Called by sys_execve(), by the signal handler code and by various
* error paths.
*/
void fpu__clear(struct fpu *fpu)
{
WARN_ON_FPU(fpu != ¤t->thread.fpu); /* Almost certainly an anomaly */
fpu__drop(fpu);
/*
* Make sure fpstate is cleared and initialized.
*/
if (static_cpu_has(X86_FEATURE_FPU)) {
preempt_disable();
fpu__initialize(fpu);
user_fpu_begin();
copy_init_fpstate_to_fpregs();
preempt_enable();
}
}
/*
* x87 math exception handling:
*/
int fpu__exception_code(struct fpu *fpu, int trap_nr)
{
int err;
if (trap_nr == X86_TRAP_MF) {
unsigned short cwd, swd;
/*
* (~cwd & swd) will mask out exceptions that are not set to unmasked
* status. 0x3f is the exception bits in these regs, 0x200 is the
* C1 reg you need in case of a stack fault, 0x040 is the stack
* fault bit. We should only be taking one exception at a time,
* so if this combination doesn't produce any single exception,
* then we have a bad program that isn't synchronizing its FPU usage
* and it will suffer the consequences since we won't be able to
* fully reproduce the context of the exception.
*/
if (boot_cpu_has(X86_FEATURE_FXSR)) {
cwd = fpu->state.fxsave.cwd;
swd = fpu->state.fxsave.swd;
} else {
cwd = (unsigned short)fpu->state.fsave.cwd;
swd = (unsigned short)fpu->state.fsave.swd;
}
err = swd & ~cwd;
} else {
/*
* The SIMD FPU exceptions are handled a little differently, as there
* is only a single status/control register. Thus, to determine which
* unmasked exception was caught we must mask the exception mask bits
* at 0x1f80, and then use these to mask the exception bits at 0x3f.
*/
unsigned short mxcsr = MXCSR_DEFAULT;
if (boot_cpu_has(X86_FEATURE_XMM))
mxcsr = fpu->state.fxsave.mxcsr;
err = ~(mxcsr >> 7) & mxcsr;
}
if (err & 0x001) { /* Invalid op */
/*
* swd & 0x240 == 0x040: Stack Underflow
* swd & 0x240 == 0x240: Stack Overflow
* User must clear the SF bit (0x40) if set
*/
return FPE_FLTINV;
} else if (err & 0x004) { /* Divide by Zero */
return FPE_FLTDIV;
} else if (err & 0x008) { /* Overflow */
return FPE_FLTOVF;
} else if (err & 0x012) { /* Denormal, Underflow */
return FPE_FLTUND;
} else if (err & 0x020) { /* Precision */
return FPE_FLTRES;
}
/*
* If we're using IRQ 13, or supposedly even some trap
* X86_TRAP_MF implementations, it's possible
* we get a spurious trap, which is not an error.
*/
return 0;
}
/*
* Kernel-based Virtual Machine driver for Linux
*
* derived from drivers/kvm/kvm_main.c
*
* Copyright (C) 2006 Qumranet, Inc.
* Copyright (C) 2008 Qumranet, Inc.
* Copyright IBM Corporation, 2008
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* Authors:
* Avi Kivity <avi@qumranet.com>
* Yaniv Kamay <yaniv@qumranet.com>
* Amit Shah <amit.shah@qumranet.com>
* Ben-Ami Yassour <benami@il.ibm.com>
*
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
*
*/
#include <linux/kvm_host.h>
#include "irq.h"
#include "mmu.h"
#include "i8254.h"
#include "tss.h"
#include "kvm_cache_regs.h"
#include "x86.h"
#include "cpuid.h"
#include "pmu.h"
#include "hyperv.h"
#include <linux/clocksource.h>
#include <linux/interrupt.h>
#include <linux/kvm.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
#include <linux/export.h>
#include <linux/moduleparam.h>
#include <linux/mman.h>
#include <linux/highmem.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/cpufreq.h>
#include <linux/user-return-notifier.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <linux/uaccess.h>
#include <linux/hash.h>
#include <linux/pci.h>
#include <linux/timekeeper_internal.h>
#include <linux/pvclock_gtod.h>
#include <linux/kvm_irqfd.h>
#include <linux/irqbypass.h>
#include <linux/sched/stat.h>
#include <linux/mem_encrypt.h>
#include <trace/events/kvm.h>
#include <asm/debugreg.h>
#include <asm/msr.h>
#include <asm/desc.h>
#include <asm/mce.h>
#include <linux/kernel_stat.h>
#include <asm/fpu/internal.h> /* Ugh! */
#include <asm/pvclock.h>
#include <asm/div64.h>
#include <asm/irq_remapping.h>
#include <asm/mshyperv.h>
#include <asm/hypervisor.h>
#include <asm/intel_pt.h>
#define CREATE_TRACE_POINTS
#include "trace.h"
#define MAX_IO_MSRS 256
#define KVM_MAX_MCE_BANKS 32
u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
#define emul_to_vcpu(ctxt) \
container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
/* EFER defaults:
* - enable syscall per default because its emulated by KVM
* - enable LME and LMA per default on 64 bit KVM
*/
#ifdef CONFIG_X86_64
static
u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
#else
static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
#endif
#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);
static void enter_smm(struct kvm_vcpu *vcpu);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
static void store_regs(struct kvm_vcpu *vcpu);
static int sync_regs(struct kvm_vcpu *vcpu);
struct kvm_x86_ops *kvm_x86_ops __read_mostly;
EXPORT_SYMBOL_GPL(kvm_x86_ops);
static bool __read_mostly ignore_msrs = 0;
module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
static bool __read_mostly report_ignored_msrs = true;
module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
unsigned int min_timer_period_us = 200;
module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
static bool __read_mostly kvmclock_periodic_sync = true;
module_param(kvmclock_periodic_sync, bool, S_IRUGO);
bool __read_mostly kvm_has_tsc_control;
EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
u32 __read_mostly kvm_max_guest_tsc_khz;
EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits;
EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
u64 __read_mostly kvm_max_tsc_scaling_ratio;
EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
u64 __read_mostly kvm_default_tsc_scaling_ratio;
EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
static u32 __read_mostly tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
/* lapic timer advance (tscdeadline mode only) in nanoseconds */
unsigned int __read_mostly lapic_timer_advance_ns = 1000;
module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
EXPORT_SYMBOL_GPL(lapic_timer_advance_ns);
static bool __read_mostly vector_hashing = true;
module_param(vector_hashing, bool, S_IRUGO);
bool __read_mostly enable_vmware_backdoor = false;
module_param(enable_vmware_backdoor, bool, S_IRUGO);
EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
static bool __read_mostly force_emulation_prefix = false;
module_param(force_emulation_prefix, bool, S_IRUGO);
#define KVM_NR_SHARED_MSRS 16
struct kvm_shared_msrs_global {
int nr;
u32 msrs[KVM_NR_SHARED_MSRS];
};
struct kvm_shared_msrs {
struct user_return_notifier urn;
bool registered;
struct kvm_shared_msr_values {
u64 host;
u64 curr;
} values[KVM_NR_SHARED_MSRS];
};
static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
static struct kvm_shared_msrs __percpu *shared_msrs;
struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "pf_fixed", VCPU_STAT(pf_fixed) },
{ "pf_guest", VCPU_STAT(pf_guest) },
{ "tlb_flush", VCPU_STAT(tlb_flush) },
{ "invlpg", VCPU_STAT(invlpg) },
{ "exits", VCPU_STAT(exits) },
{ "io_exits", VCPU_STAT(io_exits) },
{ "mmio_exits", VCPU_STAT(mmio_exits) },
{ "signal_exits", VCPU_STAT(signal_exits) },
{ "irq_window", VCPU_STAT(irq_window_exits) },
{ "nmi_window", VCPU_STAT(nmi_window_exits) },
{ "halt_exits", VCPU_STAT(halt_exits) },
{ "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
{ "hypercalls", VCPU_STAT(hypercalls) },
{ "request_irq", VCPU_STAT(request_irq_exits) },
{ "irq_exits", VCPU_STAT(irq_exits) },
{ "host_state_reload", VCPU_STAT(host_state_reload) },
{ "fpu_reload", VCPU_STAT(fpu_reload) },
{ "insn_emulation", VCPU_STAT(insn_emulation) },
{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
{ "irq_injections", VCPU_STAT(irq_injections) },
{ "nmi_injections", VCPU_STAT(nmi_injections) },
{ "req_event", VCPU_STAT(req_event) },
{ "l1d_flush", VCPU_STAT(l1d_flush) },
{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
{ "mmu_flooded", VM_STAT(mmu_flooded) },
{ "mmu_recycled", VM_STAT(mmu_recycled) },
{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
{ "mmu_unsync", VM_STAT(mmu_unsync) },
{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
{ "largepages", VM_STAT(lpages) },
{ "max_mmu_page_hash_collisions",
VM_STAT(max_mmu_page_hash_collisions) },
{ NULL }
};
u64 __read_mostly host_xcr0;
struct kmem_cache *x86_fpu_cache;
EXPORT_SYMBOL_GPL(x86_fpu_cache);
static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
{
int i;
for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
vcpu->arch.apf.gfns[i] = ~0;
}
static void kvm_on_user_return(struct user_return_notifier *urn)
{
unsigned slot;
struct kvm_shared_msrs *locals
= container_of(urn, struct kvm_shared_msrs, urn);
struct kvm_shared_msr_values *values;
unsigned long flags;
/*
* Disabling irqs at this point since the following code could be
* interrupted and executed through kvm_arch_hardware_disable()
*/
local_irq_save(flags);
if (locals->registered) {
locals->registered = false;
user_return_notifier_unregister(urn);
}
local_irq_restore(flags);
for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
values = &locals->values[slot];
if (values->host != values->curr) {
wrmsrl(shared_msrs_global.msrs[slot], values->host);
values->curr = values->host;
}
}
}
static void shared_msr_update(unsigned slot, u32 msr)
{
u64 value;
unsigned int cpu = smp_processor_id();
struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
/* only read, and nobody should modify it at this time,
* so don't need lock */
if (slot >= shared_msrs_global.nr) {
printk(KERN_ERR "kvm: invalid MSR slot!");
return;
}
rdmsrl_safe(msr, &value);
smsr->values[slot].host = value;
smsr->values[slot].curr = value;
}
void kvm_define_shared_msr(unsigned slot, u32 msr)
{
BUG_ON(slot >= KVM_NR_SHARED_MSRS);
shared_msrs_global.msrs[slot] = msr;
if (slot >= shared_msrs_global.nr)
shared_msrs_global.nr = slot + 1;
}
EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
static void kvm_shared_msr_cpu_online(void)
{
unsigned i;
for (i = 0; i < shared_msrs_global.nr; ++i)
shared_msr_update(i, shared_msrs_global.msrs[i]);
}
int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
{
unsigned int cpu = smp_processor_id();
struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
int err;
if (((value ^ smsr->values[slot].curr) & mask) == 0)
return 0;
smsr->values[slot].curr = value;
err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
if (err)
return 1;
if (!smsr->registered) {
smsr->urn.on_user_return = kvm_on_user_return;
user_return_notifier_register(&smsr->urn);
smsr->registered = true;
}
return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
static void drop_user_return_notifiers(void)
{
unsigned int cpu = smp_processor_id();
struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
if (smsr->registered)
kvm_on_user_return(&smsr->urn);
}
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
return vcpu->arch.apic_base;
}
EXPORT_SYMBOL_GPL(kvm_get_apic_base);
enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
{
return kvm_apic_mode(kvm_get_apic_base(vcpu));
}
EXPORT_SYMBOL_GPL(kvm_get_apic_mode);
int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff |
(guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
return 1;
if (!msr_info->host_initiated) {
if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC)
return 1;
if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC)
return 1;
}
kvm_lapic_set_base(vcpu, msr_info->data);
return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);
asmlinkage __visible void kvm_spurious_fault(void)
{
/* Fault while not rebooting. We want the trace. */
BUG();
}
EXPORT_SYMBOL_GPL(kvm_spurious_fault);
#define EXCPT_BENIGN 0
#define EXCPT_CONTRIBUTORY 1
#define EXCPT_PF 2
static int exception_class(int vector)
{
switch (vector) {
case PF_VECTOR:
return EXCPT_PF;
case DE_VECTOR:
case TS_VECTOR:
case NP_VECTOR:
case SS_VECTOR:
case GP_VECTOR:
return EXCPT_CONTRIBUTORY;
default:
break;
}
return EXCPT_BENIGN;
}
#define EXCPT_FAULT 0
#define EXCPT_TRAP 1
#define EXCPT_ABORT 2
#define EXCPT_INTERRUPT 3
static int exception_type(int vector)
{
unsigned int mask;
if (WARN_ON(vector > 31 || vector == NMI_VECTOR))
return EXCPT_INTERRUPT;
mask = 1 << vector;
/* #DB is trap, as instruction watchpoints are handled elsewhere */
if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
return EXCPT_TRAP;
if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
return EXCPT_ABORT;
/* Reserved exceptions will result in fault */
return EXCPT_FAULT;
}
void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
{
unsigned nr = vcpu->arch.exception.nr;
bool has_payload = vcpu->arch.exception.has_payload;
unsigned long payload = vcpu->arch.exception.payload;
if (!has_payload)
return;
switch (nr) {
case DB_VECTOR:
/*
* "Certain debug exceptions may clear bit 0-3. The
* remaining contents of the DR6 register are never
* cleared by the processor".
*/
vcpu->arch.dr6 &= ~DR_TRAP_BITS;
/*
* DR6.RTM is set by all #DB exceptions that don't clear it.
*/
vcpu->arch.dr6 |= DR6_RTM;
vcpu->arch.dr6 |= payload;
/*
* Bit 16 should be set in the payload whenever the #DB
* exception should clear DR6.RTM. This makes the payload
* compatible with the pending debug exceptions under VMX.
* Though not currently documented in the SDM, this also
* makes the payload compatible with the exit qualification
* for #DB exceptions under VMX.
*/
vcpu->arch.dr6 ^= payload & DR6_RTM;
break;
case PF_VECTOR:
vcpu->arch.cr2 = payload;
break;
}
vcpu->arch.exception.has_payload = false;
vcpu->arch.exception.payload = 0;
}
EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
unsigned nr, bool has_error, u32 error_code,
bool has_payload, unsigned long payload, bool reinject)
{
u32 prev_nr;
int class1, class2;
kvm_make_request(KVM_REQ_EVENT, vcpu);
if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
queue:
if (has_error && !is_protmode(vcpu))
has_error = false;
if (reinject) {
/*
* On vmentry, vcpu->arch.exception.pending is only
* true if an event injection was blocked by
* nested_run_pending. In that case, however,
* vcpu_enter_guest requests an immediate exit,
* and the guest shouldn't proceed far enough to
* need reinjection.
*/
WARN_ON_ONCE(vcpu->arch.exception.pending);
vcpu->arch.exception.injected = true;
if (WARN_ON_ONCE(has_payload)) {
/*
* A reinjected event has already
* delivered its payload.
*/
has_payload = false;
payload = 0;
}
} else {
vcpu->arch.exception.pending = true;
vcpu->arch.exception.injected = false;
}
vcpu->arch.exception.has_error_code = has_error;
vcpu->arch.exception.nr = nr;
vcpu->arch.exception.error_code = error_code;
vcpu->arch.exception.has_payload = has_payload;
vcpu->arch.exception.payload = payload;
/*
* In guest mode, payload delivery should be deferred,
* so that the L1 hypervisor can intercept #PF before
* CR2 is modified (or intercept #DB before DR6 is
* modified under nVMX). However, for ABI
* compatibility with KVM_GET_VCPU_EVENTS and
* KVM_SET_VCPU_EVENTS, we can't delay payload
* delivery unless userspace has enabled this
* functionality via the per-VM capability,
* KVM_CAP_EXCEPTION_PAYLOAD.
*/
if (!vcpu->kvm->arch.exception_payload_enabled ||
!is_guest_mode(vcpu))
kvm_deliver_exception_payload(vcpu);
return;
}
/* to check exception */
prev_nr = vcpu->arch.exception.nr;
if (prev_nr == DF_VECTOR) {
/* triple fault -> shutdown */
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return;
}
class1 = exception_class(prev_nr);
class2 = exception_class(nr);
if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
|| (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
/*
* Generate double fault per SDM Table 5-5. Set
* exception.pending = true so that the double fault
* can trigger a nested vmexit.
*/
vcpu->arch.exception.pending = true;
vcpu->arch.exception.injected = false;
vcpu->arch.exception.has_error_code = true;
vcpu->arch.exception.nr = DF_VECTOR;
vcpu->arch.exception.error_code = 0;
vcpu->arch.exception.has_payload = false;
vcpu->arch.exception.payload = 0;
} else
/* replace previous exception with a new one in a hope
that instruction re-execution will regenerate lost
exception */
goto queue;
}
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception);
void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true);
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception);
static void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
unsigned long payload)
{
kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false);
}
static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
u32 error_code, unsigned long payload)
{
kvm_multiple_exception(vcpu, nr, true, error_code,
true, payload, false);
}
int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
{
if (err)
kvm_inject_gp(vcpu, 0);
else
return kvm_skip_emulated_instruction(vcpu);
return 1;
}
EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
++vcpu->stat.pf_guest;
vcpu->arch.exception.nested_apf =
is_guest_mode(vcpu) && fault->async_page_fault;
if (vcpu->arch.exception.nested_apf) {
vcpu->arch.apf.nested_apf_token = fault->address;
kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
} else {
kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
fault->address);
}
}
EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
else
vcpu->arch.mmu->inject_page_fault(vcpu, fault);
return fault->nested_page_fault;
}
void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
atomic_inc(&vcpu->arch.nmi_queued);
kvm_make_request(KVM_REQ_NMI, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_inject_nmi);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true);
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
/*
* Checks if cpl <= required_cpl; if true, return true. Otherwise queue
* a #GP and return false.
*/
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
{
if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
return true;
kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
return false;
}
EXPORT_SYMBOL_GPL(kvm_require_cpl);
bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
{
if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE))
return true;
kvm_queue_exception(vcpu, UD_VECTOR);
return false;
}
EXPORT_SYMBOL_GPL(kvm_require_dr);
/*
* This function will be used to read from the physical memory of the currently
* running guest. The difference to kvm_vcpu_read_guest_page is that this function
* can read from guest physical or from the guest's guest physical memory.
*/
int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gfn_t ngfn, void *data, int offset, int len,
u32 access)
{
struct x86_exception exception;
gfn_t real_gfn;
gpa_t ngpa;
ngpa = gfn_to_gpa(ngfn);
real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception);
if (real_gfn == UNMAPPED_GVA)
return -EFAULT;
real_gfn = gpa_to_gfn(real_gfn);
return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
void *data, int offset, int len, u32 access)
{
return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
data, offset, len, access);
}
/*
* Load the pae pdptrs. Return true is they are all valid.
*/
int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
{
gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
int i;
int ret;
u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
offset * sizeof(u64), sizeof(pdpte),
PFERR_USER_MASK|PFERR_WRITE_MASK);
if (ret < 0) {
ret = 0;
goto out;
}
for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
if ((pdpte[i] & PT_PRESENT_MASK) &&
(pdpte[i] &
vcpu->arch.mmu->guest_rsvd_check.rsvd_bits_mask[0][2])) {
ret = 0;
goto out;
}
}
ret = 1;
memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
__set_bit(VCPU_EXREG_PDPTR,
(unsigned long *)&vcpu->arch.regs_avail);
__set_bit(VCPU_EXREG_PDPTR,
(unsigned long *)&vcpu->arch.regs_dirty);
out:
return ret;
}
EXPORT_SYMBOL_GPL(load_pdptrs);
bool pdptrs_changed(struct kvm_vcpu *vcpu)
{
u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
bool changed = true;
int offset;
gfn_t gfn;
int r;
if (is_long_mode(vcpu) || !is_pae(vcpu) || !is_paging(vcpu))
return false;
if (!test_bit(VCPU_EXREG_PDPTR,
(unsigned long *)&vcpu->arch.regs_avail))
return true;
gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1);
r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
PFERR_USER_MASK | PFERR_WRITE_MASK);
if (r < 0)
goto out;
changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
out:
return changed;
}
EXPORT_SYMBOL_GPL(pdptrs_changed);
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
unsigned long old_cr0 = kvm_read_cr0(vcpu);
unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
cr0 |= X86_CR0_ET;
#ifdef CONFIG_X86_64
if (cr0 & 0xffffffff00000000UL)
return 1;
#endif
cr0 &= ~CR0_RESERVED_BITS;
if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
return 1;
if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
return 1;
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
if ((vcpu->arch.efer & EFER_LME)) {
int cs_db, cs_l;
if (!is_pae(vcpu))
return 1;
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
if (cs_l)
return 1;
} else
#endif
if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
kvm_read_cr3(vcpu)))
return 1;
}
if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
return 1;
kvm_x86_ops->set_cr0(vcpu, cr0);
if ((cr0 ^ old_cr0) & X86_CR0_PG) {
kvm_clear_async_pf_completion_queue(vcpu);
kvm_async_pf_hash_reset(vcpu);
}
if ((cr0 ^ old_cr0) & update_bits)
kvm_mmu_reset_context(vcpu);
if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr0);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
(void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
}
EXPORT_SYMBOL_GPL(kvm_lmsw);
static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
{
if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
!vcpu->guest_xcr0_loaded) {
/* kvm_set_xcr() also depends on this */
if (vcpu->arch.xcr0 != host_xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
vcpu->guest_xcr0_loaded = 1;
}
}
static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
{
if (vcpu->guest_xcr0_loaded) {
if (vcpu->arch.xcr0 != host_xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
vcpu->guest_xcr0_loaded = 0;
}
}
static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
u64 xcr0 = xcr;
u64 old_xcr0 = vcpu->arch.xcr0;
u64 valid_bits;
/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
if (index != XCR_XFEATURE_ENABLED_MASK)
return 1;
if (!(xcr0 & XFEATURE_MASK_FP))
return 1;
if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
return 1;
/*
* Do not allow the guest to set bits that we do not support
* saving. However, xcr0 bit 0 is always set, even if the
* emulated CPU does not support XSAVE (see fx_init).
*/
valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
if (xcr0 & ~valid_bits)
return 1;
if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
(!(xcr0 & XFEATURE_MASK_BNDCSR)))
return 1;
if (xcr0 & XFEATURE_MASK_AVX512) {
if (!(xcr0 & XFEATURE_MASK_YMM))
return 1;
if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
return 1;
}
vcpu->arch.xcr0 = xcr0;
if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
kvm_update_cpuid(vcpu);
return 0;
}
int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
if (kvm_x86_ops->get_cpl(vcpu) != 0 ||
__kvm_set_xcr(vcpu, index, xcr)) {
kvm_inject_gp(vcpu, 0);
return 1;
}
return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_xcr);
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
unsigned long old_cr4 = kvm_read_cr4(vcpu);
unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
if (cr4 & CR4_RESERVED_BITS)
return 1;
if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
return 1;
if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP))
return 1;
if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP))
return 1;
if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE))
return 1;
if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE))
return 1;
if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57))
return 1;
if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP))
return 1;
if (is_long_mode(vcpu)) {
if (!(cr4 & X86_CR4_PAE))
return 1;
} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
&& ((cr4 ^ old_cr4) & pdptr_bits)
&& !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
kvm_read_cr3(vcpu)))
return 1;
if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
return 1;
/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
return 1;
}
if (kvm_x86_ops->set_cr4(vcpu, cr4))
return 1;
if (((cr4 ^ old_cr4) & pdptr_bits) ||
(!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
kvm_mmu_reset_context(vcpu);
if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
kvm_update_cpuid(vcpu);
return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);
int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
bool skip_tlb_flush = false;
#ifdef CONFIG_X86_64
bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
if (pcid_enabled) {
skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
cr3 &= ~X86_CR3_PCID_NOFLUSH;
}
#endif
if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
if (!skip_tlb_flush) {
kvm_mmu_sync_roots(vcpu);
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
return 0;
}
if (is_long_mode(vcpu) &&
(cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)))
return 1;
else if (is_pae(vcpu) && is_paging(vcpu) &&
!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
return 1;
kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush);
vcpu->arch.cr3 = cr3;
__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);
int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
if (cr8 & CR8_RESERVED_BITS)
return 1;
if (lapic_in_kernel(vcpu))
kvm_lapic_set_tpr(vcpu, cr8);
else
vcpu->arch.cr8 = cr8;
return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr8);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
{
if (lapic_in_kernel(vcpu))
return kvm_lapic_get_cr8(vcpu);
else
return vcpu->arch.cr8;
}
EXPORT_SYMBOL_GPL(kvm_get_cr8);
static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
{
int i;
if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
for (i = 0; i < KVM_NR_DB_REGS; i++)
vcpu->arch.eff_db[i] = vcpu->arch.db[i];
vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
}
}
static void kvm_update_dr6(struct kvm_vcpu *vcpu)
{
if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6);
}
static void kvm_update_dr7(struct kvm_vcpu *vcpu)
{
unsigned long dr7;
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
dr7 = vcpu->arch.guest_debug_dr7;
else
dr7 = vcpu->arch.dr7;
kvm_x86_ops->set_dr7(vcpu, dr7);
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
if (dr7 & DR7_BP_EN_MASK)
vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
}
static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
{
u64 fixed = DR6_FIXED_1;
if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
fixed |= DR6_RTM;
return fixed;
}
static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
switch (dr) {
case 0 ... 3:
vcpu->arch.db[dr] = val;
if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
vcpu->arch.eff_db[dr] = val;
break;
case 4:
/* fall through */
case 6:
if (val & 0xffffffff00000000ULL)
return -1; /* #GP */
vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
kvm_update_dr6(vcpu);
break;
case 5:
/* fall through */
default: /* 7 */
if (val & 0xffffffff00000000ULL)
return -1; /* #GP */
vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
kvm_update_dr7(vcpu);
break;
}
return 0;
}
int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
if (__kvm_set_dr(vcpu, dr, val)) {
kvm_inject_gp(vcpu, 0);
return 1;
}
return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_dr);
int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
{
switch (dr) {
case 0 ... 3:
*val = vcpu->arch.db[dr];
break;
case 4:
/* fall through */
case 6:
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
*val = vcpu->arch.dr6;
else
*val = kvm_x86_ops->get_dr6(vcpu);
break;
case 5:
/* fall through */
default: /* 7 */
*val = vcpu->arch.dr7;
break;
}
return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_dr);
bool kvm_rdpmc(struct kvm_vcpu *vcpu)
{
u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
u64 data;
int err;
err = kvm_pmu_rdpmc(vcpu, ecx, &data);
if (err)
return err;
kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
return err;
}
EXPORT_SYMBOL_GPL(kvm_rdpmc);
/*
* List of msr numbers which we expose to userspace through KVM_GET_MSRS
* and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
*
* This list is modified at module load time to reflect the
* capabilities of the host cpu. This capabilities test skips MSRs that are
* kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs
* may depend on host virtualization features rather than host cpu features.
*/
static u32 msrs_to_save[] = {
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
MSR_STAR,
#ifdef CONFIG_X86_64
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES,
MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
};
static unsigned num_msrs_to_save;
static u32 emulated_msrs[] = {
MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
HV_X64_MSR_RESET,
HV_X64_MSR_VP_INDEX,
HV_X64_MSR_VP_RUNTIME,
HV_X64_MSR_SCONTROL,
HV_X64_MSR_STIMER0_CONFIG,
HV_X64_MSR_VP_ASSIST_PAGE,
HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
HV_X64_MSR_TSC_EMULATION_STATUS,
MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
MSR_KVM_PV_EOI_EN,
MSR_IA32_TSC_ADJUST,
MSR_IA32_TSCDEADLINE,
MSR_IA32_MISC_ENABLE,
MSR_IA32_MCG_STATUS,
MSR_IA32_MCG_CTL,
MSR_IA32_MCG_EXT_CTL,
MSR_IA32_SMBASE,
MSR_SMI_COUNT,
MSR_PLATFORM_INFO,
MSR_MISC_FEATURES_ENABLES,
MSR_AMD64_VIRT_SPEC_CTRL,
};
static unsigned num_emulated_msrs;
/*
* List of msr numbers which are used to expose MSR-based features that
* can be used by a hypervisor to validate requested CPU features.
*/
static u32 msr_based_features[] = {
MSR_IA32_VMX_BASIC,
MSR_IA32_VMX_TRUE_PINBASED_CTLS,
MSR_IA32_VMX_PINBASED_CTLS,
MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
MSR_IA32_VMX_PROCBASED_CTLS,
MSR_IA32_VMX_TRUE_EXIT_CTLS,
MSR_IA32_VMX_EXIT_CTLS,
MSR_IA32_VMX_TRUE_ENTRY_CTLS,
MSR_IA32_VMX_ENTRY_CTLS,
MSR_IA32_VMX_MISC,
MSR_IA32_VMX_CR0_FIXED0,
MSR_IA32_VMX_CR0_FIXED1,
MSR_IA32_VMX_CR4_FIXED0,
MSR_IA32_VMX_CR4_FIXED1,
MSR_IA32_VMX_VMCS_ENUM,
MSR_IA32_VMX_PROCBASED_CTLS2,
MSR_IA32_VMX_EPT_VPID_CAP,
MSR_IA32_VMX_VMFUNC,
MSR_F10H_DECFG,
MSR_IA32_UCODE_REV,
MSR_IA32_ARCH_CAPABILITIES,
};
static unsigned int num_msr_based_features;
u64 kvm_get_arch_capabilities(void)
{
u64 data;
rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
/*
* If we're doing cache flushes (either "always" or "cond")
* we will do one whenever the guest does a vmlaunch/vmresume.
* If an outer hypervisor is doing the cache flush for us
* (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that
* capability to the guest too, and if EPT is disabled we're not
* vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will
* require a nested hypervisor to do a flush of its own.
*/
if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
return data;
}
EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities);
static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
{
switch (msr->index) {
case MSR_IA32_ARCH_CAPABILITIES:
msr->data = kvm_get_arch_capabilities();
break;
case MSR_IA32_UCODE_REV:
rdmsrl_safe(msr->index, &msr->data);
break;
default:
if (kvm_x86_ops->get_msr_feature(msr))
return 1;
}
return 0;
}
static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
struct kvm_msr_entry msr;
int r;
msr.index = index;
r = kvm_get_msr_feature(&msr);
if (r)
return r;
*data = msr.data;
return 0;
}
bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
{
if (efer & efer_reserved_bits)
return false;
if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
return false;
if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
return false;
return true;
}
EXPORT_SYMBOL_GPL(kvm_valid_efer);
static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
u64 old_efer = vcpu->arch.efer;
if (!kvm_valid_efer(vcpu, efer))
return 1;
if (is_paging(vcpu)
&& (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
return 1;
efer &= ~EFER_LMA;
efer |= vcpu->arch.efer & EFER_LMA;
kvm_x86_ops->set_efer(vcpu, efer);
/* Update reserved bits */
if ((efer ^ old_efer) & EFER_NX)
kvm_mmu_reset_context(vcpu);
return 0;
}
void kvm_enable_efer_bits(u64 mask)
{
efer_reserved_bits &= ~mask;
}
EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
/*
* Writes msr value into into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
switch (msr->index) {
case MSR_FS_BASE:
case MSR_GS_BASE:
case MSR_KERNEL_GS_BASE:
case MSR_CSTAR:
case MSR_LSTAR:
if (is_noncanonical_address(msr->data, vcpu))
return 1;
break;
case MSR_IA32_SYSENTER_EIP:
case MSR_IA32_SYSENTER_ESP:
/*
* IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
* non-canonical address is written on Intel but not on
* AMD (which ignores the top 32-bits, because it does
* not implement 64-bit SYSENTER).
*
* 64-bit code should hence be able to write a non-canonical
* value on AMD. Making the address canonical ensures that
* vmentry does not fail on Intel after writing a non-canonical
* value, and that something deterministic happens if the guest
* invokes 64-bit SYSENTER.
*/
msr->data = get_canonical(msr->data, vcpu_virt_addr_bits(vcpu));
}
return kvm_x86_ops->set_msr(vcpu, msr);
}
EXPORT_SYMBOL_GPL(kvm_set_msr);
/*
* Adapt set_msr() to msr_io()'s calling convention
*/
static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
struct msr_data msr;
int r;
msr.index = index;
msr.host_initiated = true;
r = kvm_get_msr(vcpu, &msr);
if (r)
return r;
*data = msr.data;
return 0;
}
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
struct msr_data msr;
msr.data = *data;
msr.index = index;
msr.host_initiated = true;
return kvm_set_msr(vcpu, &msr);
}
#ifdef CONFIG_X86_64
struct pvclock_gtod_data {
seqcount_t seq;
struct { /* extract of a clocksource struct */
int vclock_mode;
u64 cycle_last;
u64 mask;
u32 mult;
u32 shift;
} clock;
u64 boot_ns;
u64 nsec_base;
u64 wall_time_sec;
};
static struct pvclock_gtod_data pvclock_gtod_data;
static void update_pvclock_gtod(struct timekeeper *tk)
{
struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
u64 boot_ns;
boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
write_seqcount_begin(&vdata->seq);
/* copy pvclock gtod data */
vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
vdata->clock.cycle_last = tk->tkr_mono.cycle_last;
vdata->clock.mask = tk->tkr_mono.mask;
vdata->clock.mult = tk->tkr_mono.mult;
vdata->clock.shift = tk->tkr_mono.shift;
vdata->boot_ns = boot_ns;
vdata->nsec_base = tk->tkr_mono.xtime_nsec;
vdata->wall_time_sec = tk->xtime_sec;
write_seqcount_end(&vdata->seq);
}
#endif
void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
{
/*
* Note: KVM_REQ_PENDING_TIMER is implicitly checked in
* vcpu_enter_guest. This function is only called from
* the physical CPU that is running vcpu.
*/
kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
}
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
int version;
int r;
struct pvclock_wall_clock wc;
struct timespec64 boot;
if (!wall_clock)
return;
r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
if (r)
return;
if (version & 1)
++version; /* first time write, random junk */
++version;
if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
return;
/*
* The guest calculates current wall clock time by adding
* system time (updated by kvm_guest_time_update below) to the
* wall clock specified here. guest system time equals host
* system time for us, thus we must fill in host boot time here.
*/
getboottime64(&boot);
if (kvm->arch.kvmclock_offset) {
struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
boot = timespec64_sub(boot, ts);
}
wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
wc.nsec = boot.tv_nsec;
wc.version = version;
kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
version++;
kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
}
static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
{
do_shl32_div32(dividend, divisor);
return dividend;
}
static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
s8 *pshift, u32 *pmultiplier)
{
uint64_t scaled64;
int32_t shift = 0;
uint64_t tps64;
uint32_t tps32;
tps64 = base_hz;
scaled64 = scaled_hz;
while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
tps64 >>= 1;
shift--;
}
tps32 = (uint32_t)tps64;
while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
scaled64 >>= 1;
else
tps32 <<= 1;
shift++;
}
*pshift = shift;
*pmultiplier = div_frac(scaled64, tps32);
pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n",
__func__, base_hz, scaled_hz, shift, *pmultiplier);
}
#ifdef CONFIG_X86_64
static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
#endif
static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
static unsigned long max_tsc_khz;
static u32 adjust_tsc_khz(u32 khz, s32 ppm)
{
u64 v = (u64)khz * (1000000 + ppm);
do_div(v, 1000000);
return v;
}
static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
{
u64 ratio;
/* Guest TSC same frequency as host TSC? */
if (!scale) {
vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
return 0;
}
/* TSC scaling supported? */
if (!kvm_has_tsc_control) {
if (user_tsc_khz > tsc_khz) {
vcpu->arch.tsc_catchup = 1;
vcpu->arch.tsc_always_catchup = 1;
return 0;
} else {
WARN(1, "user requested TSC rate below hardware speed\n");
return -1;
}
}
/* TSC scaling required - calculate ratio */
ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits,
user_tsc_khz, tsc_khz);
if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
WARN_ONCE(1, "Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
user_tsc_khz);
return -1;
}
vcpu->arch.tsc_scaling_ratio = ratio;
return 0;
}
static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
{
u32 thresh_lo, thresh_hi;
int use_scaling = 0;
/* tsc_khz can be zero if TSC calibration fails */
if (user_tsc_khz == 0) {
/* set tsc_scaling_ratio to a safe value */
vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
return -1;
}
/* Compute a scale to convert nanoseconds in TSC cycles */
kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
&vcpu->arch.virtual_tsc_shift,
&vcpu->arch.virtual_tsc_mult);
vcpu->arch.virtual_tsc_khz = user_tsc_khz;
/*
* Compute the variation in TSC rate which is acceptable
* within the range of tolerance and decide if the
* rate being applied is within that bounds of the hardware
* rate. If so, no scaling or compensation need be done.
*/
thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
use_scaling = 1;
}
return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
}
static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
{
u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
vcpu->arch.virtual_tsc_mult,
vcpu->arch.virtual_tsc_shift);
tsc += vcpu->arch.this_tsc_write;
return tsc;
}
static inline int gtod_is_based_on_tsc(int mode)
{
return mode == VCLOCK_TSC || mode == VCLOCK_HVCLOCK;
}
static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
bool vcpus_matched;
struct kvm_arch *ka = &vcpu->kvm->arch;
struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
atomic_read(&vcpu->kvm->online_vcpus));
/*
* Once the masterclock is enabled, always perform request in
* order to update it.
*
* In order to enable masterclock, the host clocksource must be TSC
* and the vcpus need to have matched TSCs. When that happens,
* perform request to enable masterclock.
*/
if (ka->use_master_clock ||
(gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched))
kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
atomic_read(&vcpu->kvm->online_vcpus),
ka->use_master_clock, gtod->clock.vclock_mode);
#endif
}
static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
{
u64 curr_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
}
/*
* Multiply tsc by a fixed point number represented by ratio.
*
* The most significant 64-N bits (mult) of ratio represent the
* integral part of the fixed point number; the remaining N bits
* (frac) represent the fractional part, ie. ratio represents a fixed
* point number (mult + frac * 2^(-N)).
*
* N equals to kvm_tsc_scaling_ratio_frac_bits.
*/
static inline u64 __scale_tsc(u64 ratio, u64 tsc)
{
return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
}
u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
{
u64 _tsc = tsc;
u64 ratio = vcpu->arch.tsc_scaling_ratio;
if (ratio != kvm_default_tsc_scaling_ratio)
_tsc = __scale_tsc(ratio, tsc);
return _tsc;
}
EXPORT_SYMBOL_GPL(kvm_scale_tsc);
static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
{
u64 tsc;
tsc = kvm_scale_tsc(vcpu, rdtsc());
return target_tsc - tsc;
}
u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
{
u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
return tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
}
EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
vcpu->arch.tsc_offset = kvm_x86_ops->write_l1_tsc_offset(vcpu, offset);
}
static inline bool kvm_check_tsc_unstable(void)
{
#ifdef CONFIG_X86_64
/*
* TSC is marked unstable when we're running on Hyper-V,
* 'TSC page' clocksource is good.
*/
if (pvclock_gtod_data.clock.vclock_mode == VCLOCK_HVCLOCK)
return false;
#endif
return check_tsc_unstable();
}
void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
struct kvm *kvm = vcpu->kvm;
u64 offset, ns, elapsed;
unsigned long flags;
bool matched;
bool already_matched;
u64 data = msr->data;
bool synchronizing = false;
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
offset = kvm_compute_tsc_offset(vcpu, data);
ns = ktime_get_boot_ns();
elapsed = ns - kvm->arch.last_tsc_nsec;
if (vcpu->arch.virtual_tsc_khz) {
if (data == 0 && msr->host_initiated) {
/*
* detection of vcpu initialization -- need to sync
* with other vCPUs. This particularly helps to keep
* kvm_clock stable after CPU hotplug
*/
synchronizing = true;
} else {
u64 tsc_exp = kvm->arch.last_tsc_write +
nsec_to_cycles(vcpu, elapsed);
u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
/*
* Special case: TSC write with a small delta (1 second)
* of virtual cycle time against real time is
* interpreted as an attempt to synchronize the CPU.
*/
synchronizing = data < tsc_exp + tsc_hz &&
data + tsc_hz > tsc_exp;
}
}
/*
* For a reliable TSC, we can match TSC offsets, and for an unstable
* TSC, we add elapsed time in this computation. We could let the
* compensation code attempt to catch up if we fall behind, but
* it's better to try to match offsets from the beginning.
*/
if (synchronizing &&
vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
if (!kvm_check_tsc_unstable()) {
offset = kvm->arch.cur_tsc_offset;
pr_debug("kvm: matched tsc offset for %llu\n", data);
} else {
u64 delta = nsec_to_cycles(vcpu, elapsed);
data += delta;
offset = kvm_compute_tsc_offset(vcpu, data);
pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
}
matched = true;
already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
} else {
/*
* We split periods of matched TSC writes into generations.
* For each generation, we track the original measured
* nanosecond time, offset, and write, so if TSCs are in
* sync, we can match exact offset, and if not, we can match
* exact software computation in compute_guest_tsc()
*
* These values are tracked in kvm->arch.cur_xxx variables.
*/
kvm->arch.cur_tsc_generation++;
kvm->arch.cur_tsc_nsec = ns;
kvm->arch.cur_tsc_write = data;
kvm->arch.cur_tsc_offset = offset;
matched = false;
pr_debug("kvm: new tsc generation %llu, clock %llu\n",
kvm->arch.cur_tsc_generation, data);
}
/*
* We also track th most recent recorded KHZ, write and time to
* allow the matching interval to be extended at each write.
*/
kvm->arch.last_tsc_nsec = ns;
kvm->arch.last_tsc_write = data;
kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
vcpu->arch.last_guest_tsc = data;
/* Keep track of which generation this VCPU has synchronized to */
vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST))
update_ia32_tsc_adjust_msr(vcpu, offset);
kvm_vcpu_write_tsc_offset(vcpu, offset);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
if (!matched) {
kvm->arch.nr_vcpus_matched_tsc = 0;
} else if (!already_matched) {
kvm->arch.nr_vcpus_matched_tsc++;
}
kvm_track_tsc_matching(vcpu);
spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
}
EXPORT_SYMBOL_GPL(kvm_write_tsc);
static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
s64 adjustment)
{
u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
}
static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
{
if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
WARN_ON(adjustment < 0);
adjustment = kvm_scale_tsc(vcpu, (u64) adjustment);
adjust_tsc_offset_guest(vcpu, adjustment);
}
#ifdef CONFIG_X86_64
static u64 read_tsc(void)
{
u64 ret = (u64)rdtsc_ordered();
u64 last = pvclock_gtod_data.clock.cycle_last;
if (likely(ret >= last))
return ret;
/*
* GCC likes to generate cmov here, but this branch is extremely
* predictable (it's just a function of time and the likely is
* very likely) and there's a data dependence, so force GCC
* to generate a branch instead. I don't barrier() because
* we don't actually need a barrier, and if this function
* ever gets inlined it will generate worse code.
*/
asm volatile ("");
return last;
}
static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
{
long v;
struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
u64 tsc_pg_val;
switch (gtod->clock.vclock_mode) {
case VCLOCK_HVCLOCK:
tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
tsc_timestamp);
if (tsc_pg_val != U64_MAX) {
/* TSC page valid */
*mode = VCLOCK_HVCLOCK;
v = (tsc_pg_val - gtod->clock.cycle_last) &
gtod->clock.mask;
} else {
/* TSC page invalid */
*mode = VCLOCK_NONE;
}
break;
case VCLOCK_TSC:
*mode = VCLOCK_TSC;
*tsc_timestamp = read_tsc();
v = (*tsc_timestamp - gtod->clock.cycle_last) &
gtod->clock.mask;
break;
default:
*mode = VCLOCK_NONE;
}
if (*mode == VCLOCK_NONE)
*tsc_timestamp = v = 0;
return v * gtod->clock.mult;
}
static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp)
{
struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
unsigned long seq;
int mode;
u64 ns;
do {
seq = read_seqcount_begin(>od->seq);
ns = gtod->nsec_base;
ns += vgettsc(tsc_timestamp, &mode);
ns >>= gtod->clock.shift;
ns += gtod->boot_ns;
} while (unlikely(read_seqcount_retry(>od->seq, seq)));
*t = ns;
return mode;
}
static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
{
struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
unsigned long seq;
int mode;
u64 ns;
do {
seq = read_seqcount_begin(>od->seq);
ts->tv_sec = gtod->wall_time_sec;
ns = gtod->nsec_base;
ns += vgettsc(tsc_timestamp, &mode);
ns >>= gtod->clock.shift;
} while (unlikely(read_seqcount_retry(>od->seq, seq)));
ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
ts->tv_nsec = ns;
return mode;
}
/* returns true if host is using TSC based clocksource */
static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
{
/* checked again under seqlock below */
if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
return false;
return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns,
tsc_timestamp));
}
/* returns true if host is using TSC based clocksource */
static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
u64 *tsc_timestamp)
{
/* checked again under seqlock below */
if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
return false;
return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp));
}
#endif
/*
*
* Assuming a stable TSC across physical CPUS, and a stable TSC
* across virtual CPUs, the following condition is possible.
* Each numbered line represents an event visible to both
* CPUs at the next numbered event.
*
* "timespecX" represents host monotonic time. "tscX" represents
* RDTSC value.
*
* VCPU0 on CPU0 | VCPU1 on CPU1
*
* 1. read timespec0,tsc0
* 2. | timespec1 = timespec0 + N
* | tsc1 = tsc0 + M
* 3. transition to guest | transition to guest
* 4. ret0 = timespec0 + (rdtsc - tsc0) |
* 5. | ret1 = timespec1 + (rdtsc - tsc1)
* | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
*
* Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
*
* - ret0 < ret1
* - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
* ...
* - 0 < N - M => M < N
*
* That is, when timespec0 != timespec1, M < N. Unfortunately that is not
* always the case (the difference between two distinct xtime instances
* might be smaller then the difference between corresponding TSC reads,
* when updating guest vcpus pvclock areas).
*
* To avoid that problem, do not allow visibility of distinct
* system_timestamp/tsc_timestamp values simultaneously: use a master
* copy of host monotonic time values. Update that master copy
* in lockstep.
*
* Rely on synchronization of host TSCs and guest TSCs for monotonicity.
*
*/
static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
{
#ifdef CONFIG_X86_64
struct kvm_arch *ka = &kvm->arch;
int vclock_mode;
bool host_tsc_clocksource, vcpus_matched;
vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
atomic_read(&kvm->online_vcpus));
/*
* If the host uses TSC clock, then passthrough TSC as stable
* to the guest.
*/
host_tsc_clocksource = kvm_get_time_and_clockread(
&ka->master_kernel_ns,
&ka->master_cycle_now);
ka->use_master_clock = host_tsc_clocksource && vcpus_matched
&& !ka->backwards_tsc_observed
&& !ka->boot_vcpu_runs_old_kvmclock;
if (ka->use_master_clock)
atomic_set(&kvm_guest_has_master_clock, 1);
vclock_mode = pvclock_gtod_data.clock.vclock_mode;
trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
vcpus_matched);
#endif
}
void kvm_make_mclock_inprogress_request(struct kvm *kvm)
{
kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
}
static void kvm_gen_update_masterclock(struct kvm *kvm)
{
#ifdef CONFIG_X86_64
int i;
struct kvm_vcpu *vcpu;
struct kvm_arch *ka = &kvm->arch;
spin_lock(&ka->pvclock_gtod_sync_lock);
kvm_make_mclock_inprogress_request(kvm);
/* no guest entries from this point */
pvclock_update_vm_gtod_copy(kvm);
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
/* guest entries allowed */
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
spin_unlock(&ka->pvclock_gtod_sync_lock);
#endif
}
u64 get_kvmclock_ns(struct kvm *kvm)
{
struct kvm_arch *ka = &kvm->arch;
struct pvclock_vcpu_time_info hv_clock;
u64 ret;
spin_lock(&ka->pvclock_gtod_sync_lock);
if (!ka->use_master_clock) {
spin_unlock(&ka->pvclock_gtod_sync_lock);
return ktime_get_boot_ns() + ka->kvmclock_offset;
}
hv_clock.tsc_timestamp = ka->master_cycle_now;
hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
spin_unlock(&ka->pvclock_gtod_sync_lock);
/* both __this_cpu_read() and rdtsc() should be on the same cpu */
get_cpu();
if (__this_cpu_read(cpu_tsc_khz)) {
kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
&hv_clock.tsc_shift,
&hv_clock.tsc_to_system_mul);
ret = __pvclock_read_cycles(&hv_clock, rdtsc());
} else
ret = ktime_get_boot_ns() + ka->kvmclock_offset;
put_cpu();
return ret;
}
static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
{
struct kvm_vcpu_arch *vcpu = &v->arch;
struct pvclock_vcpu_time_info guest_hv_clock;
if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
&guest_hv_clock, sizeof(guest_hv_clock))))
return;
/* This VCPU is paused, but it's legal for a guest to read another
* VCPU's kvmclock, so we really have to follow the specification where
* it says that version is odd if data is being modified, and even after
* it is consistent.
*
* Version field updates must be kept separate. This is because
* kvm_write_guest_cached might use a "rep movs" instruction, and
* writes within a string instruction are weakly ordered. So there
* are three writes overall.
*
* As a small optimization, only write the version field in the first
* and third write. The vcpu->pv_time cache is still valid, because the
* version field is the first in the struct.
*/
BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
if (guest_hv_clock.version & 1)
++guest_hv_clock.version; /* first time write, random junk */
vcpu->hv_clock.version = guest_hv_clock.version + 1;
kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
&vcpu->hv_clock,
sizeof(vcpu->hv_clock.version));
smp_wmb();
/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
if (vcpu->pvclock_set_guest_stopped_request) {
vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
vcpu->pvclock_set_guest_stopped_request = false;
}
trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
&vcpu->hv_clock,
sizeof(vcpu->hv_clock));
smp_wmb();
vcpu->hv_clock.version++;
kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
&vcpu->hv_clock,
sizeof(vcpu->hv_clock.version));
}
static int kvm_guest_time_update(struct kvm_vcpu *v)
{
unsigned long flags, tgt_tsc_khz;
struct kvm_vcpu_arch *vcpu = &v->arch;
struct kvm_arch *ka = &v->kvm->arch;
s64 kernel_ns;
u64 tsc_timestamp, host_tsc;
u8 pvclock_flags;
bool use_master_clock;
kernel_ns = 0;
host_tsc = 0;
/*
* If the host uses TSC clock, then passthrough TSC as stable
* to the guest.
*/
spin_lock(&ka->pvclock_gtod_sync_lock);
use_master_clock = ka->use_master_clock;
if (use_master_clock) {
host_tsc = ka->master_cycle_now;
kernel_ns = ka->master_kernel_ns;
}
spin_unlock(&ka->pvclock_gtod_sync_lock);
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
if (unlikely(tgt_tsc_khz == 0)) {
local_irq_restore(flags);
kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
return 1;
}
if (!use_master_clock) {
host_tsc = rdtsc();
kernel_ns = ktime_get_boot_ns();
}
tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
/*
* We may have to catch up the TSC to match elapsed wall clock
* time for two reasons, even if kvmclock is used.
* 1) CPU could have been running below the maximum TSC rate
* 2) Broken TSC compensation resets the base at each VCPU
* entry to avoid unknown leaps of TSC even when running
* again on the same CPU. This may cause apparent elapsed
* time to disappear, and the guest to stand still or run
* very slowly.
*/
if (vcpu->tsc_catchup) {
u64 tsc = compute_guest_tsc(v, kernel_ns);
if (tsc > tsc_timestamp) {
adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
tsc_timestamp = tsc;
}
}
local_irq_restore(flags);
/* With all the info we got, fill in the values */
if (kvm_has_tsc_control)
tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
&vcpu->hv_clock.tsc_shift,
&vcpu->hv_clock.tsc_to_system_mul);
vcpu->hw_tsc_khz = tgt_tsc_khz;
}
vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
vcpu->last_guest_tsc = tsc_timestamp;
/* If the host uses TSC clocksource, then it is stable */
pvclock_flags = 0;
if (use_master_clock)
pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
vcpu->hv_clock.flags = pvclock_flags;
if (vcpu->pv_time_enabled)
kvm_setup_pvclock_page(v);
if (v == kvm_get_vcpu(v->kvm, 0))
kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
return 0;
}
/*
* kvmclock updates which are isolated to a given vcpu, such as
* vcpu->cpu migration, should not allow system_timestamp from
* the rest of the vcpus to remain static. Otherwise ntp frequency
* correction applies to one vcpu's system_timestamp but not
* the others.
*
* So in those cases, request a kvmclock update for all vcpus.
* We need to rate-limit these requests though, as they can
* considerably slow guests that have a large number of vcpus.
* The time for a remote vcpu to update its kvmclock is bound
* by the delay we use to rate-limit the updates.
*/
#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
static void kvmclock_update_fn(struct work_struct *work)
{
int i;
struct delayed_work *dwork = to_delayed_work(work);
struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
kvmclock_update_work);
struct kvm *kvm = container_of(ka, struct kvm, arch);
struct kvm_vcpu *vcpu;
kvm_for_each_vcpu(i, vcpu, kvm) {
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
kvm_vcpu_kick(vcpu);
}
}
static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
{
struct kvm *kvm = v->kvm;
kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
schedule_delayed_work(&kvm->arch.kvmclock_update_work,
KVMCLOCK_UPDATE_DELAY);
}
#define KVMCLOCK_SYNC_PERIOD (300 * HZ)
static void kvmclock_sync_fn(struct work_struct *work)
{
struct delayed_work *dwork = to_delayed_work(work);
struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
kvmclock_sync_work);
struct kvm *kvm = container_of(ka, struct kvm, arch);
if (!kvmclock_periodic_sync)
return;
schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
KVMCLOCK_SYNC_PERIOD);
}
static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
u64 mcg_cap = vcpu->arch.mcg_cap;
unsigned bank_num = mcg_cap & 0xff;
u32 msr = msr_info->index;
u64 data = msr_info->data;
switch (msr) {
case MSR_IA32_MCG_STATUS:
vcpu->arch.mcg_status = data;
break;
case MSR_IA32_MCG_CTL:
if (!(mcg_cap & MCG_CTL_P) &&
(data || !msr_info->host_initiated))
return 1;
if (data != 0 && data != ~(u64)0)
return 1;
vcpu->arch.mcg_ctl = data;
break;
default:
if (msr >= MSR_IA32_MC0_CTL &&
msr < MSR_IA32_MCx_CTL(bank_num)) {
u32 offset = msr - MSR_IA32_MC0_CTL;
/* only 0 or all 1s can be written to IA32_MCi_CTL
* some Linux kernels though clear bit 10 in bank 4 to
* workaround a BIOS/GART TBL issue on AMD K8s, ignore
* this to avoid an uncatched #GP in the guest
*/
if ((offset & 0x3) == 0 &&
data != 0 && (data | (1 << 10)) != ~(u64)0)
return -1;
if (!msr_info->host_initiated &&
(offset & 0x3) == 1 && data != 0)
return -1;
vcpu->arch.mce_banks[offset] = data;
break;
}
return 1;
}
return 0;
}
static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
{
struct kvm *kvm = vcpu->kvm;
int lm = is_long_mode(vcpu);
u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
: (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
: kvm->arch.xen_hvm_config.blob_size_32;
u32 page_num = data & ~PAGE_MASK;
u64 page_addr = data & PAGE_MASK;
u8 *page;
int r;
r = -E2BIG;
if (page_num >= blob_size)
goto out;
r = -ENOMEM;
page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
if (IS_ERR(page)) {
r = PTR_ERR(page);
goto out;
}
if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE))
goto out_free;
r = 0;
out_free:
kfree(page);
out:
return r;
}
static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
{
gpa_t gpa = data & ~0x3f;
/* Bits 3:5 are reserved, Should be zero */
if (data & 0x38)
return 1;
vcpu->arch.apf.msr_val = data;
if (!(data & KVM_ASYNC_PF_ENABLED)) {
kvm_clear_async_pf_completion_queue(vcpu);
kvm_async_pf_hash_reset(vcpu);
return 0;
}
if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
sizeof(u32)))
return 1;
vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
kvm_async_pf_wakeup_all(vcpu);
return 0;
}
static void kvmclock_reset(struct kvm_vcpu *vcpu)
{
vcpu->arch.pv_time_enabled = false;
}
static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
{
++vcpu->stat.tlb_flush;
kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa);
}
static void record_steal_time(struct kvm_vcpu *vcpu)
{
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
return;
/*
* Doing a TLB flush here, on the guest's behalf, can avoid
* expensive IPIs.
*/
if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB)
kvm_vcpu_flush_tlb(vcpu, false);
if (vcpu->arch.st.steal.version & 1)
vcpu->arch.st.steal.version += 1; /* first time write, random junk */
vcpu->arch.st.steal.version += 1;
kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
smp_wmb();
vcpu->arch.st.steal.steal += current->sched_info.run_delay -
vcpu->arch.st.last_steal;
vcpu->arch.st.last_steal = current->sched_info.run_delay;
kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
smp_wmb();
vcpu->arch.st.steal.version += 1;
kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
}
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
bool pr = false;
u32 msr = msr_info->index;
u64 data = msr_info->data;
switch (msr) {
case MSR_AMD64_NB_CFG:
case MSR_IA32_UCODE_WRITE:
case MSR_VM_HSAVE_PA:
case MSR_AMD64_PATCH_LOADER:
case MSR_AMD64_BU_CFG2:
case MSR_AMD64_DC_CFG:
case MSR_F15H_EX_CFG:
break;
case MSR_IA32_UCODE_REV:
if (msr_info->host_initiated)
vcpu->arch.microcode_version = data;
break;
case MSR_EFER:
return set_efer(vcpu, data);
case MSR_K7_HWCR:
data &= ~(u64)0x40; /* ignore flush filter disable */
data &= ~(u64)0x100; /* ignore ignne emulation enable */
data &= ~(u64)0x8; /* ignore TLB cache disable */
data &= ~(u64)0x40000; /* ignore Mc status write enable */
if (data != 0) {
vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
data);
return 1;
}
break;
case MSR_FAM10H_MMIO_CONF_BASE:
if (data != 0) {
vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
"0x%llx\n", data);
return 1;
}
break;
case MSR_IA32_DEBUGCTLMSR:
if (!data) {
/* We support the non-activated case already */
break;
} else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
/* Values other than LBR and BTF are vendor-specific,
thus reserved and should throw a #GP */
return 1;
}
vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
__func__, data);
break;
case 0x200 ... 0x2ff:
return kvm_mtrr_set_msr(vcpu, msr, data);
case MSR_IA32_APICBASE:
return kvm_set_apic_base(vcpu, msr_info);
case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
return kvm_x2apic_msr_write(vcpu, msr, data);
case MSR_IA32_TSCDEADLINE:
kvm_set_lapic_tscdeadline_msr(vcpu, data);
break;
case MSR_IA32_TSC_ADJUST:
if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
if (!msr_info->host_initiated) {
s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
adjust_tsc_offset_guest(vcpu, adj);
}
vcpu->arch.ia32_tsc_adjust_msr = data;
}
break;
case MSR_IA32_MISC_ENABLE:
vcpu->arch.ia32_misc_enable_msr = data;
break;
case MSR_IA32_SMBASE:
if (!msr_info->host_initiated)
return 1;
vcpu->arch.smbase = data;
break;
case MSR_IA32_TSC:
kvm_write_tsc(vcpu, msr_info);
break;
case MSR_SMI_COUNT:
if (!msr_info->host_initiated)
return 1;
vcpu->arch.smi_count = data;
break;
case MSR_KVM_WALL_CLOCK_NEW:
case MSR_KVM_WALL_CLOCK:
vcpu->kvm->arch.wall_clock = data;
kvm_write_wall_clock(vcpu->kvm, data);
break;
case MSR_KVM_SYSTEM_TIME_NEW:
case MSR_KVM_SYSTEM_TIME: {
struct kvm_arch *ka = &vcpu->kvm->arch;
kvmclock_reset(vcpu);
if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
if (ka->boot_vcpu_runs_old_kvmclock != tmp)
kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
ka->boot_vcpu_runs_old_kvmclock = tmp;
}
vcpu->arch.time = data;
kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
/* we verify if the enable bit is set... */
if (!(data & 1))
break;
if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
&vcpu->arch.pv_time, data & ~1ULL,
sizeof(struct pvclock_vcpu_time_info)))
vcpu->arch.pv_time_enabled = false;
else
vcpu->arch.pv_time_enabled = true;
break;
}
case MSR_KVM_ASYNC_PF_EN:
if (kvm_pv_enable_async_pf(vcpu, data))
return 1;
break;
case MSR_KVM_STEAL_TIME:
if (unlikely(!sched_info_on()))
return 1;
if (data & KVM_STEAL_RESERVED_MASK)
return 1;
if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
data & KVM_STEAL_VALID_BITS,
sizeof(struct kvm_steal_time)))
return 1;
vcpu->arch.st.msr_val = data;
if (!(data & KVM_MSR_ENABLED))
break;
kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
break;
case MSR_KVM_PV_EOI_EN:
if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8)))
return 1;
break;
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
return set_msr_mce(vcpu, msr_info);
case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
pr = true; /* fall through */
case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
if (kvm_pmu_is_valid_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr_info);
if (pr || data != 0)
vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
"0x%x data 0x%llx\n", msr, data);
break;
case MSR_K7_CLK_CTL:
/*
* Ignore all writes to this no longer documented MSR.
* Writes are only relevant for old K7 processors,
* all pre-dating SVM, but a recommended workaround from
* AMD for these chips. It is possible to specify the
* affected processor models on the command line, hence
* the need to ignore the workaround.
*/
break;
case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
case HV_X64_MSR_CRASH_CTL:
case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
case HV_X64_MSR_TSC_EMULATION_CONTROL:
case HV_X64_MSR_TSC_EMULATION_STATUS:
return kvm_hv_set_msr_common(vcpu, msr, data,
msr_info->host_initiated);
case MSR_IA32_BBL_CR_CTL3:
/* Drop writes to this legacy MSR -- see rdmsr
* counterpart for further detail.
*/
if (report_ignored_msrs)
vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n",
msr, data);
break;
case MSR_AMD64_OSVW_ID_LENGTH:
if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
return 1;
vcpu->arch.osvw.length = data;
break;
case MSR_AMD64_OSVW_STATUS:
if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
return 1;
vcpu->arch.osvw.status = data;
break;
case MSR_PLATFORM_INFO:
if (!msr_info->host_initiated ||
(!(data & MSR_PLATFORM_INFO_CPUID_FAULT) &&
cpuid_fault_enabled(vcpu)))
return 1;
vcpu->arch.msr_platform_info = data;
break;
case MSR_MISC_FEATURES_ENABLES:
if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT ||
(data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
!supports_cpuid_fault(vcpu)))
return 1;
vcpu->arch.msr_misc_features_enables = data;
break;
default:
if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
return xen_hvm_config(vcpu, data);
if (kvm_pmu_is_valid_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr_info);
if (!ignore_msrs) {
vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n",
msr, data);
return 1;
} else {
if (report_ignored_msrs)
vcpu_unimpl(vcpu,
"ignored wrmsr: 0x%x data 0x%llx\n",
msr, data);
break;
}
}
return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_msr_common);
/*
* Reads an msr value (of 'msr_index') into 'pdata'.
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
return kvm_x86_ops->get_msr(vcpu, msr);
}
EXPORT_SYMBOL_GPL(kvm_get_msr);
static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
{
u64 data;
u64 mcg_cap = vcpu->arch.mcg_cap;
unsigned bank_num = mcg_cap & 0xff;
switch (msr) {
case MSR_IA32_P5_MC_ADDR:
case MSR_IA32_P5_MC_TYPE:
data = 0;
break;
case MSR_IA32_MCG_CAP:
data = vcpu->arch.mcg_cap;
break;
case MSR_IA32_MCG_CTL:
if (!(mcg_cap & MCG_CTL_P) && !host)
return 1;
data = vcpu->arch.mcg_ctl;
break;
case MSR_IA32_MCG_STATUS:
data = vcpu->arch.mcg_status;
break;
default:
if (msr >= MSR_IA32_MC0_CTL &&
msr < MSR_IA32_MCx_CTL(bank_num)) {
u32 offset = msr - MSR_IA32_MC0_CTL;
data = vcpu->arch.mce_banks[offset];
break;
}
return 1;
}
*pdata = data;
return 0;
}
int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
switch (msr_info->index) {
case MSR_IA32_PLATFORM_ID:
case MSR_IA32_EBL_CR_POWERON:
case MSR_IA32_DEBUGCTLMSR:
case MSR_IA32_LASTBRANCHFROMIP:
case MSR_IA32_LASTBRANCHTOIP:
case MSR_IA32_LASTINTFROMIP:
case MSR_IA32_LASTINTTOIP:
case MSR_K8_SYSCFG:
case MSR_K8_TSEG_ADDR:
case MSR_K8_TSEG_MASK:
case MSR_K7_HWCR:
case MSR_VM_HSAVE_PA:
case MSR_K8_INT_PENDING_MSG:
case MSR_AMD64_NB_CFG:
case MSR_FAM10H_MMIO_CONF_BASE:
case MSR_AMD64_BU_CFG2:
case MSR_IA32_PERF_CTL:
case MSR_AMD64_DC_CFG:
case MSR_F15H_EX_CFG:
msr_info->data = 0;
break;
case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
msr_info->data = 0;
break;
case MSR_IA32_UCODE_REV:
msr_info->data = vcpu->arch.microcode_version;
break;
case MSR_IA32_TSC:
msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
break;
case MSR_MTRRcap:
case 0x200 ... 0x2ff:
return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
case 0xcd: /* fsb frequency */
msr_info->data = 3;
break;
/*
* MSR_EBC_FREQUENCY_ID
* Conservative value valid for even the basic CPU models.
* Models 0,1: 000 in bits 23:21 indicating a bus speed of
* 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
* and 266MHz for model 3, or 4. Set Core Clock
* Frequency to System Bus Frequency Ratio to 1 (bits
* 31:24) even though these are only valid for CPU
* models > 2, however guests may end up dividing or
* multiplying by zero otherwise.
*/
case MSR_EBC_FREQUENCY_ID:
msr_info->data = 1 << 24;
break;
case MSR_IA32_APICBASE:
msr_info->data = kvm_get_apic_base(vcpu);
break;
case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
break;
case MSR_IA32_TSCDEADLINE:
msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
break;
case MSR_IA32_TSC_ADJUST:
msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
break;
case MSR_IA32_MISC_ENABLE:
msr_info->data = vcpu->arch.ia32_misc_enable_msr;
break;
case MSR_IA32_SMBASE:
if (!msr_info->host_initiated)
return 1;
msr_info->data = vcpu->arch.smbase;
break;
case MSR_SMI_COUNT:
msr_info->data = vcpu->arch.smi_count;
break;
case MSR_IA32_PERF_STATUS:
/* TSC increment by tick */
msr_info->data = 1000ULL;
/* CPU multiplier */
msr_info->data |= (((uint64_t)4ULL) << 40);
break;
case MSR_EFER:
msr_info->data = vcpu->arch.efer;
break;
case MSR_KVM_WALL_CLOCK:
case MSR_KVM_WALL_CLOCK_NEW:
msr_info->data = vcpu->kvm->arch.wall_clock;
break;
case MSR_KVM_SYSTEM_TIME:
case MSR_KVM_SYSTEM_TIME_NEW:
msr_info->data = vcpu->arch.time;
break;
case MSR_KVM_ASYNC_PF_EN:
msr_info->data = vcpu->arch.apf.msr_val;
break;
case MSR_KVM_STEAL_TIME:
msr_info->data = vcpu->arch.st.msr_val;
break;
case MSR_KVM_PV_EOI_EN:
msr_info->data = vcpu->arch.pv_eoi.msr_val;
break;
case MSR_IA32_P5_MC_ADDR:
case MSR_IA32_P5_MC_TYPE:
case MSR_IA32_MCG_CAP:
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
msr_info->host_initiated);
case MSR_K7_CLK_CTL:
/*
* Provide expected ramp-up count for K7. All other
* are set to zero, indicating minimum divisors for
* every field.
*
* This prevents guest kernels on AMD host with CPU
* type 6, model 8 and higher from exploding due to
* the rdmsr failing.
*/
msr_info->data = 0x20000000;
break;
case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
case HV_X64_MSR_CRASH_CTL:
case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
case HV_X64_MSR_TSC_EMULATION_CONTROL:
case HV_X64_MSR_TSC_EMULATION_STATUS:
return kvm_hv_get_msr_common(vcpu,
msr_info->index, &msr_info->data,
msr_info->host_initiated);
break;
case MSR_IA32_BBL_CR_CTL3:
/* This legacy MSR exists but isn't fully documented in current
* silicon. It is however accessed by winxp in very narrow
* scenarios where it sets bit #19, itself documented as
* a "reserved" bit. Best effort attempt to source coherent
* read data here should the balance of the register be
* interpreted by the guest:
*
* L2 cache control register 3: 64GB range, 256KB size,
* enabled, latency 0x1, configured
*/
msr_info->data = 0xbe702111;
break;
case MSR_AMD64_OSVW_ID_LENGTH:
if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
return 1;
msr_info->data = vcpu->arch.osvw.length;
break;
case MSR_AMD64_OSVW_STATUS:
if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
return 1;
msr_info->data = vcpu->arch.osvw.status;
break;
case MSR_PLATFORM_INFO:
if (!msr_info->host_initiated &&
!vcpu->kvm->arch.guest_can_read_msr_platform_info)
return 1;
msr_info->data = vcpu->arch.msr_platform_info;
break;
case MSR_MISC_FEATURES_ENABLES:
msr_info->data = vcpu->arch.msr_misc_features_enables;
break;
default:
if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
if (!ignore_msrs) {
vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n",
msr_info->index);
return 1;
} else {
if (report_ignored_msrs)
vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n",
msr_info->index);
msr_info->data = 0;
}
break;
}
return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_msr_common);
/*
* Read or write a bunch of msrs. All parameters are kernel addresses.
*
* @return number of msrs set successfully.
*/
static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
struct kvm_msr_entry *entries,
int (*do_msr)(struct kvm_vcpu *vcpu,
unsigned index, u64 *data))
{
int i;
for (i = 0; i < msrs->nmsrs; ++i)
if (do_msr(vcpu, entries[i].index, &entries[i].data))
break;
return i;
}
/*
* Read or write a bunch of msrs. Parameters are user addresses.
*
* @return number of msrs set successfully.
*/
static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
int (*do_msr)(struct kvm_vcpu *vcpu,
unsigned index, u64 *data),
int writeback)
{
struct kvm_msrs msrs;
struct kvm_msr_entry *entries;
int r, n;
unsigned size;
r = -EFAULT;
if (copy_from_user(&msrs, user_msrs, sizeof(msrs)))
goto out;
r = -E2BIG;
if (msrs.nmsrs >= MAX_IO_MSRS)
goto out;
size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
entries = memdup_user(user_msrs->entries, size);
if (IS_ERR(entries)) {
r = PTR_ERR(entries);
goto out;
}
r = n = __msr_io(vcpu, &msrs, entries, do_msr);
if (r < 0)
goto out_free;
r = -EFAULT;
if (writeback && copy_to_user(user_msrs->entries, entries, size))
goto out_free;
r = n;
out_free:
kfree(entries);
out:
return r;
}
static inline bool kvm_can_mwait_in_guest(void)
{
return boot_cpu_has(X86_FEATURE_MWAIT) &&
!boot_cpu_has_bug(X86_BUG_MONITOR) &&
boot_cpu_has(X86_FEATURE_ARAT);
}
int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
{
int r = 0;
switch (ext) {
case KVM_CAP_IRQCHIP:
case KVM_CAP_HLT:
case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
case KVM_CAP_SET_TSS_ADDR:
case KVM_CAP_EXT_CPUID:
case KVM_CAP_EXT_EMUL_CPUID:
case KVM_CAP_CLOCKSOURCE:
case KVM_CAP_PIT:
case KVM_CAP_NOP_IO_DELAY:
case KVM_CAP_MP_STATE:
case KVM_CAP_SYNC_MMU:
case KVM_CAP_USER_NMI:
case KVM_CAP_REINJECT_CONTROL:
case KVM_CAP_IRQ_INJECT_STATUS:
case KVM_CAP_IOEVENTFD:
case KVM_CAP_IOEVENTFD_NO_LENGTH:
case KVM_CAP_PIT2:
case KVM_CAP_PIT_STATE2:
case KVM_CAP_SET_IDENTITY_MAP_ADDR:
case KVM_CAP_XEN_HVM:
case KVM_CAP_VCPU_EVENTS:
case KVM_CAP_HYPERV:
case KVM_CAP_HYPERV_VAPIC:
case KVM_CAP_HYPERV_SPIN:
case KVM_CAP_HYPERV_SYNIC:
case KVM_CAP_HYPERV_SYNIC2:
case KVM_CAP_HYPERV_VP_INDEX:
case KVM_CAP_HYPERV_EVENTFD:
case KVM_CAP_HYPERV_TLBFLUSH:
case KVM_CAP_HYPERV_SEND_IPI:
case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
case KVM_CAP_HYPERV_CPUID:
case KVM_CAP_PCI_SEGMENT:
case KVM_CAP_DEBUGREGS:
case KVM_CAP_X86_ROBUST_SINGLESTEP:
case KVM_CAP_XSAVE:
case KVM_CAP_ASYNC_PF:
case KVM_CAP_GET_TSC_KHZ:
case KVM_CAP_KVMCLOCK_CTRL:
case KVM_CAP_READONLY_MEM:
case KVM_CAP_HYPERV_TIME:
case KVM_CAP_IOAPIC_POLARITY_IGNORED:
case KVM_CAP_TSC_DEADLINE_TIMER:
case KVM_CAP_DISABLE_QUIRKS:
case KVM_CAP_SET_BOOT_CPU_ID:
case KVM_CAP_SPLIT_IRQCHIP:
case KVM_CAP_IMMEDIATE_EXIT:
case KVM_CAP_GET_MSR_FEATURES:
case KVM_CAP_MSR_PLATFORM_INFO:
case KVM_CAP_EXCEPTION_PAYLOAD:
r = 1;
break;
case KVM_CAP_SYNC_REGS:
r = KVM_SYNC_X86_VALID_FIELDS;
break;
case KVM_CAP_ADJUST_CLOCK:
r = KVM_CLOCK_TSC_STABLE;
break;
case KVM_CAP_X86_DISABLE_EXITS:
r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE;
if(kvm_can_mwait_in_guest())
r |= KVM_X86_DISABLE_EXITS_MWAIT;
break;
case KVM_CAP_X86_SMM:
/* SMBASE is usually relocated above 1M on modern chipsets,
* and SMM handlers might indeed rely on 4G segment limits,
* so do not report SMM to be available if real mode is
* emulated via vm86 mode. Still, do not go to great lengths
* to avoid userspace's usage of the feature, because it is a
* fringe case that is not enabled except via specific settings
* of the module parameters.
*/
r = kvm_x86_ops->has_emulated_msr(MSR_IA32_SMBASE);
break;
case KVM_CAP_VAPIC:
r = !kvm_x86_ops->cpu_has_accelerated_tpr();
break;
case KVM_CAP_NR_VCPUS:
r = KVM_SOFT_MAX_VCPUS;
break;
case KVM_CAP_MAX_VCPUS:
r = KVM_MAX_VCPUS;
break;
case KVM_CAP_NR_MEMSLOTS:
r = KVM_USER_MEM_SLOTS;
break;
case KVM_CAP_PV_MMU: /* obsolete */
r = 0;
break;
case KVM_CAP_MCE:
r = KVM_MAX_MCE_BANKS;
break;
case KVM_CAP_XCRS:
r = boot_cpu_has(X86_FEATURE_XSAVE);
break;
case KVM_CAP_TSC_CONTROL:
r = kvm_has_tsc_control;
break;
case KVM_CAP_X2APIC_API:
r = KVM_X2APIC_API_VALID_FLAGS;
break;
case KVM_CAP_NESTED_STATE:
r = kvm_x86_ops->get_nested_state ?
kvm_x86_ops->get_nested_state(NULL, 0, 0) : 0;
break;
default:
break;
}
return r;
}
long kvm_arch_dev_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
void __user *argp = (void __user *)arg;
long r;
switch (ioctl) {
case KVM_GET_MSR_INDEX_LIST: {
struct kvm_msr_list __user *user_msr_list = argp;
struct kvm_msr_list msr_list;
unsigned n;
r = -EFAULT;
if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
goto out;
n = msr_list.nmsrs;
msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
goto out;
r = -E2BIG;
if (n < msr_list.nmsrs)
goto out;
r = -EFAULT;
if (copy_to_user(user_msr_list->indices, &msrs_to_save,
num_msrs_to_save * sizeof(u32)))
goto out;
if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
&emulated_msrs,
num_emulated_msrs * sizeof(u32)))
goto out;
r = 0;
break;
}
case KVM_GET_SUPPORTED_CPUID:
case KVM_GET_EMULATED_CPUID: {
struct kvm_cpuid2 __user *cpuid_arg = argp;
struct kvm_cpuid2 cpuid;
r = -EFAULT;
if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out;
r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
ioctl);
if (r)
goto out;
r = -EFAULT;
if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
goto out;
r = 0;
break;
}
case KVM_X86_GET_MCE_CAP_SUPPORTED: {
r = -EFAULT;
if (copy_to_user(argp, &kvm_mce_cap_supported,
sizeof(kvm_mce_cap_supported)))
goto out;
r = 0;
break;
case KVM_GET_MSR_FEATURE_INDEX_LIST: {
struct kvm_msr_list __user *user_msr_list = argp;
struct kvm_msr_list msr_list;
unsigned int n;
r = -EFAULT;
if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
goto out;
n = msr_list.nmsrs;
msr_list.nmsrs = num_msr_based_features;
if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
goto out;
r = -E2BIG;
if (n < msr_list.nmsrs)
goto out;
r = -EFAULT;
if (copy_to_user(user_msr_list->indices, &msr_based_features,
num_msr_based_features * sizeof(u32)))
goto out;
r = 0;
break;
}
case KVM_GET_MSRS:
r = msr_io(NULL, argp, do_get_msr_feature, 1);
break;
}
default:
r = -EINVAL;
}
out:
return r;
}
static void wbinvd_ipi(void *garbage)
{
wbinvd();
}
static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
{
return kvm_arch_has_noncoherent_dma(vcpu->kvm);
}
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
/* Address WBINVD may be executed by guest */
if (need_emulate_wbinvd(vcpu)) {
if (kvm_x86_ops->has_wbinvd_exit())
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
smp_call_function_single(vcpu->cpu,
wbinvd_ipi, NULL, 1);
}
kvm_x86_ops->vcpu_load(vcpu, cpu);
/* Apply any externally detected TSC adjustments (due to suspend) */
if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
vcpu->arch.tsc_offset_adjustment = 0;
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
}
if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
rdtsc() - vcpu->arch.last_host_tsc;
if (tsc_delta < 0)
mark_tsc_unstable("KVM discovered backwards TSC");
if (kvm_check_tsc_unstable()) {
u64 offset = kvm_compute_tsc_offset(vcpu,
vcpu->arch.last_guest_tsc);
kvm_vcpu_write_tsc_offset(vcpu, offset);
vcpu->arch.tsc_catchup = 1;
}
if (kvm_lapic_hv_timer_in_use(vcpu))
kvm_lapic_restart_hv_timer(vcpu);
/*
* On a host with synchronized TSC, there is no need to update
* kvmclock on vcpu->cpu migration
*/
if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
if (vcpu->cpu != cpu)
kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
vcpu->cpu = cpu;
}
kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
}
static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
{
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED;
kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
&vcpu->arch.st.steal.preempted,
offsetof(struct kvm_steal_time, preempted),
sizeof(vcpu->arch.st.steal.preempted));
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
int idx;
if (vcpu->preempted)
vcpu->arch.preempted_in_kernel = !kvm_x86_ops->get_cpl(vcpu);
/*
* Disable page faults because we're in atomic context here.
* kvm_write_guest_offset_cached() would call might_fault()
* that relies on pagefault_disable() to tell if there's a
* bug. NOTE: the write to guest memory may not go through if
* during postcopy live migration or if there's heavy guest
* paging.
*/
pagefault_disable();
/*
* kvm_memslots() will be called by
* kvm_write_guest_offset_cached() so take the srcu lock.
*/
idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_steal_time_set_preempted(vcpu);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
pagefault_enable();
kvm_x86_ops->vcpu_put(vcpu);
vcpu->arch.last_host_tsc = rdtsc();
/*
* If userspace has set any breakpoints or watchpoints, dr6 is restored
* on every vmexit, but if not, we might have a stale dr6 from the
* guest. do_debug expects dr6 to be cleared after it runs, do the same.
*/
set_debugreg(0, 6);
}
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
if (vcpu->arch.apicv_active)
kvm_x86_ops->sync_pir_to_irr(vcpu);
return kvm_apic_get_state(vcpu, s);
}
static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
int r;
r = kvm_apic_set_state(vcpu, s);
if (r)
return r;
update_cr8_intercept(vcpu);
return 0;
}
static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
{
return (!lapic_in_kernel(vcpu) ||
kvm_apic_accept_pic_intr(vcpu));
}
/*
* if userspace requested an interrupt window, check that the
* interrupt window is open.
*
* No need to exit to userspace if we already have an interrupt queued.
*/
static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
{
return kvm_arch_interrupt_allowed(vcpu) &&
!kvm_cpu_has_interrupt(vcpu) &&
!kvm_event_needs_reinjection(vcpu) &&
kvm_cpu_accept_dm_intr(vcpu);
}
static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
struct kvm_interrupt *irq)
{
if (irq->irq >= KVM_NR_INTERRUPTS)
return -EINVAL;
if (!irqchip_in_kernel(vcpu->kvm)) {
kvm_queue_interrupt(vcpu, irq->irq, false);
kvm_make_request(KVM_REQ_EVENT, vcpu);
return 0;
}
/*
* With in-kernel LAPIC, we only use this to inject EXTINT, so
* fail for in-kernel 8259.
*/
if (pic_in_kernel(vcpu->kvm))
return -ENXIO;
if (vcpu->arch.pending_external_vector != -1)
return -EEXIST;
vcpu->arch.pending_external_vector = irq->irq;
kvm_make_request(KVM_REQ_EVENT, vcpu);
return 0;
}
static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
{
kvm_inject_nmi(vcpu);
return 0;
}
static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
{
kvm_make_request(KVM_REQ_SMI, vcpu);
return 0;
}
static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
struct kvm_tpr_access_ctl *tac)
{
if (tac->flags)
return -EINVAL;
vcpu->arch.tpr_access_reporting = !!tac->enabled;
return 0;
}
static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
u64 mcg_cap)
{
int r;
unsigned bank_num = mcg_cap & 0xff, bank;
r = -EINVAL;
if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
goto out;
if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
goto out;
r = 0;
vcpu->arch.mcg_cap = mcg_cap;
/* Init IA32_MCG_CTL to all 1s */
if (mcg_cap & MCG_CTL_P)
vcpu->arch.mcg_ctl = ~(u64)0;
/* Init IA32_MCi_CTL to all 1s */
for (bank = 0; bank < bank_num; bank++)
vcpu->arch.mce_banks[bank*4] = ~(u64)0;
if (kvm_x86_ops->setup_mce)
kvm_x86_ops->setup_mce(vcpu);
out:
return r;
}
static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
struct kvm_x86_mce *mce)
{
u64 mcg_cap = vcpu->arch.mcg_cap;
unsigned bank_num = mcg_cap & 0xff;
u64 *banks = vcpu->arch.mce_banks;
if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
return -EINVAL;
/*
* if IA32_MCG_CTL is not all 1s, the uncorrected error
* reporting is disabled
*/
if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
vcpu->arch.mcg_ctl != ~(u64)0)
return 0;
banks += 4 * mce->bank;
/*
* if IA32_MCi_CTL is not all 1s, the uncorrected error
* reporting is disabled for the bank
*/
if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
return 0;
if (mce->status & MCI_STATUS_UC) {
if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
!kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return 0;
}
if (banks[1] & MCI_STATUS_VAL)
mce->status |= MCI_STATUS_OVER;
banks[2] = mce->addr;
banks[3] = mce->misc;
vcpu->arch.mcg_status = mce->mcg_status;
banks[1] = mce->status;
kvm_queue_exception(vcpu, MC_VECTOR);
} else if (!(banks[1] & MCI_STATUS_VAL)
|| !(banks[1] & MCI_STATUS_UC)) {
if (banks[1] & MCI_STATUS_VAL)
mce->status |= MCI_STATUS_OVER;
banks[2] = mce->addr;
banks[3] = mce->misc;
banks[1] = mce->status;
} else
banks[1] |= MCI_STATUS_OVER;
return 0;
}
static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
{
process_nmi(vcpu);
/*
* The API doesn't provide the instruction length for software
* exceptions, so don't report them. As long as the guest RIP
* isn't advanced, we should expect to encounter the exception
* again.
*/
if (kvm_exception_is_soft(vcpu->arch.exception.nr)) {
events->exception.injected = 0;
events->exception.pending = 0;
} else {
events->exception.injected = vcpu->arch.exception.injected;
events->exception.pending = vcpu->arch.exception.pending;
/*
* For ABI compatibility, deliberately conflate
* pending and injected exceptions when
* KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
*/
if (!vcpu->kvm->arch.exception_payload_enabled)
events->exception.injected |=
vcpu->arch.exception.pending;
}
events->exception.nr = vcpu->arch.exception.nr;
events->exception.has_error_code = vcpu->arch.exception.has_error_code;
events->exception.error_code = vcpu->arch.exception.error_code;
events->exception_has_payload = vcpu->arch.exception.has_payload;
events->exception_payload = vcpu->arch.exception.payload;
events->interrupt.injected =
vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
events->interrupt.nr = vcpu->arch.interrupt.nr;
events->interrupt.soft = 0;
events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
events->nmi.injected = vcpu->arch.nmi_injected;
events->nmi.pending = vcpu->arch.nmi_pending != 0;
events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
events->nmi.pad = 0;
events->sipi_vector = 0; /* never valid when reporting to user space */
events->smi.smm = is_smm(vcpu);
events->smi.pending = vcpu->arch.smi_pending;
events->smi.smm_inside_nmi =
!!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
events->smi.latched_init = kvm_lapic_latched_init(vcpu);
events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
| KVM_VCPUEVENT_VALID_SHADOW
| KVM_VCPUEVENT_VALID_SMM);
if (vcpu->kvm->arch.exception_payload_enabled)
events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
memset(&events->reserved, 0, sizeof(events->reserved));
}
static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags);
static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
{
if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
| KVM_VCPUEVENT_VALID_SIPI_VECTOR
| KVM_VCPUEVENT_VALID_SHADOW
| KVM_VCPUEVENT_VALID_SMM
| KVM_VCPUEVENT_VALID_PAYLOAD))
return -EINVAL;
if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
if (!vcpu->kvm->arch.exception_payload_enabled)
return -EINVAL;
if (events->exception.pending)
events->exception.injected = 0;
else
events->exception_has_payload = 0;
} else {
events->exception.pending = 0;
events->exception_has_payload = 0;
}
if ((events->exception.injected || events->exception.pending) &&
(events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
return -EINVAL;
/* INITs are latched while in SMM */
if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
(events->smi.smm || events->smi.pending) &&
vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
return -EINVAL;
process_nmi(vcpu);
vcpu->arch.exception.injected = events->exception.injected;
vcpu->arch.exception.pending = events->exception.pending;
vcpu->arch.exception.nr = events->exception.nr;
vcpu->arch.exception.has_error_code = events->exception.has_error_code;
vcpu->arch.exception.error_code = events->exception.error_code;
vcpu->arch.exception.has_payload = events->exception_has_payload;
vcpu->arch.exception.payload = events->exception_payload;
vcpu->arch.interrupt.injected = events->interrupt.injected;
vcpu->arch.interrupt.nr = events->interrupt.nr;
vcpu->arch.interrupt.soft = events->interrupt.soft;
if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
kvm_x86_ops->set_interrupt_shadow(vcpu,
events->interrupt.shadow);
vcpu->arch.nmi_injected = events->nmi.injected;
if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
vcpu->arch.nmi_pending = events->nmi.pending;
kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
lapic_in_kernel(vcpu))
vcpu->arch.apic->sipi_vector = events->sipi_vector;
if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
u32 hflags = vcpu->arch.hflags;
if (events->smi.smm)
hflags |= HF_SMM_MASK;
else
hflags &= ~HF_SMM_MASK;
kvm_set_hflags(vcpu, hflags);
vcpu->arch.smi_pending = events->smi.pending;
if (events->smi.smm) {
if (events->smi.smm_inside_nmi)
vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
else
vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
if (lapic_in_kernel(vcpu)) {
if (events->smi.latched_init)
set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
else
clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
}
}
}
kvm_make_request(KVM_REQ_EVENT, vcpu);
return 0;
}
static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
struct kvm_debugregs *dbgregs)
{
unsigned long val;
memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
kvm_get_dr(vcpu, 6, &val);
dbgregs->dr6 = val;
dbgregs->dr7 = vcpu->arch.dr7;
dbgregs->flags = 0;
memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
}
static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
struct kvm_debugregs *dbgregs)
{
if (dbgregs->flags)
return -EINVAL;
if (dbgregs->dr6 & ~0xffffffffull)
return -EINVAL;
if (dbgregs->dr7 & ~0xffffffffull)
return -EINVAL;
memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
kvm_update_dr0123(vcpu);
vcpu->arch.dr6 = dbgregs->dr6;
kvm_update_dr6(vcpu);
vcpu->arch.dr7 = dbgregs->dr7;
kvm_update_dr7(vcpu);
return 0;
}
#define XSTATE_COMPACTION_ENABLED (1ULL << 63)
static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
{
struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
u64 xstate_bv = xsave->header.xfeatures;
u64 valid;
/*
* Copy legacy XSAVE area, to avoid complications with CPUID
* leaves 0 and 1 in the loop below.
*/
memcpy(dest, xsave, XSAVE_HDR_OFFSET);
/* Set XSTATE_BV */
xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE;
*(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv;
/*
* Copy each region from the possibly compacted offset to the
* non-compacted offset.
*/
valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
while (valid) {
u64 feature = valid & -valid;
int index = fls64(feature) - 1;
void *src = get_xsave_addr(xsave, feature);
if (src) {
u32 size, offset, ecx, edx;
cpuid_count(XSTATE_CPUID, index,
&size, &offset, &ecx, &edx);
if (feature == XFEATURE_MASK_PKRU)
memcpy(dest + offset, &vcpu->arch.pkru,
sizeof(vcpu->arch.pkru));
else
memcpy(dest + offset, src, size);
}
valid -= feature;
}
}
static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
{
struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET);
u64 valid;
/*
* Copy legacy XSAVE area, to avoid complications with CPUID
* leaves 0 and 1 in the loop below.
*/
memcpy(xsave, src, XSAVE_HDR_OFFSET);
/* Set XSTATE_BV and possibly XCOMP_BV. */
xsave->header.xfeatures = xstate_bv;
if (boot_cpu_has(X86_FEATURE_XSAVES))
xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED;
/*
* Copy each region from the non-compacted offset to the
* possibly compacted offset.
*/
valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
while (valid) {
u64 feature = valid & -valid;
int index = fls64(feature) - 1;
void *dest = get_xsave_addr(xsave, feature);
if (dest) {
u32 size, offset, ecx, edx;
cpuid_count(XSTATE_CPUID, index,
&size, &offset, &ecx, &edx);
if (feature == XFEATURE_MASK_PKRU)
memcpy(&vcpu->arch.pkru, src + offset,
sizeof(vcpu->arch.pkru));
else
memcpy(dest, src + offset, size);
}
valid -= feature;
}
}
static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
struct kvm_xsave *guest_xsave)
{
if (boot_cpu_has(X86_FEATURE_XSAVE)) {
memset(guest_xsave, 0, sizeof(struct kvm_xsave));
fill_xsave((u8 *) guest_xsave->region, vcpu);
} else {
memcpy(guest_xsave->region,
&vcpu->arch.guest_fpu->state.fxsave,
sizeof(struct fxregs_state));
*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
XFEATURE_MASK_FPSSE;
}
}
#define XSAVE_MXCSR_OFFSET 24
static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
struct kvm_xsave *guest_xsave)
{
u64 xstate_bv =
*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
u32 mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)];
if (boot_cpu_has(X86_FEATURE_XSAVE)) {
/*
* Here we allow setting states that are not present in
* CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility
* with old userspace.
*/
if (xstate_bv & ~kvm_supported_xcr0() ||
mxcsr & ~mxcsr_feature_mask)
return -EINVAL;
load_xsave(vcpu, (u8 *)guest_xsave->region);
} else {
if (xstate_bv & ~XFEATURE_MASK_FPSSE ||
mxcsr & ~mxcsr_feature_mask)
return -EINVAL;
memcpy(&vcpu->arch.guest_fpu->state.fxsave,
guest_xsave->region, sizeof(struct fxregs_state));
}
return 0;
}
static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
struct kvm_xcrs *guest_xcrs)
{
if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
guest_xcrs->nr_xcrs = 0;
return;
}
guest_xcrs->nr_xcrs = 1;
guest_xcrs->flags = 0;
guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
}
static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
struct kvm_xcrs *guest_xcrs)
{
int i, r = 0;
if (!boot_cpu_has(X86_FEATURE_XSAVE))
return -EINVAL;
if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
return -EINVAL;
for (i = 0; i < guest_xcrs->nr_xcrs; i++)
/* Only support XCR0 currently */
if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) {
r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
guest_xcrs->xcrs[i].value);
break;
}
if (r)
r = -EINVAL;
return r;
}
/*
* kvm_set_guest_paused() indicates to the guest kernel that it has been
* stopped by the hypervisor. This function will be called from the host only.
* EINVAL is returned when the host attempts to set the flag for a guest that
* does not support pv clocks.
*/
static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
{
if (!vcpu->arch.pv_time_enabled)
return -EINVAL;
vcpu->arch.pvclock_set_guest_stopped_request = true;
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
return 0;
}
static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
struct kvm_enable_cap *cap)
{
int r;
uint16_t vmcs_version;
void __user *user_ptr;
if (cap->flags)
return -EINVAL;
switch (cap->cap) {
case KVM_CAP_HYPERV_SYNIC2:
if (cap->args[0])
return -EINVAL;
/* fall through */
case KVM_CAP_HYPERV_SYNIC:
if (!irqchip_in_kernel(vcpu->kvm))
return -EINVAL;
return kvm_hv_activate_synic(vcpu, cap->cap ==
KVM_CAP_HYPERV_SYNIC2);
case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
if (!kvm_x86_ops->nested_enable_evmcs)
return -ENOTTY;
r = kvm_x86_ops->nested_enable_evmcs(vcpu, &vmcs_version);
if (!r) {
user_ptr = (void __user *)(uintptr_t)cap->args[0];
if (copy_to_user(user_ptr, &vmcs_version,
sizeof(vmcs_version)))
r = -EFAULT;
}
return r;
default:
return -EINVAL;
}
}
long kvm_arch_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
struct kvm_vcpu *vcpu = filp->private_data;
void __user *argp = (void __user *)arg;
int r;
union {
struct kvm_lapic_state *lapic;
struct kvm_xsave *xsave;
struct kvm_xcrs *xcrs;
void *buffer;
} u;
vcpu_load(vcpu);
u.buffer = NULL;
switch (ioctl) {
case KVM_GET_LAPIC: {
r = -EINVAL;
if (!lapic_in_kernel(vcpu))
goto out;
u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
r = -ENOMEM;
if (!u.lapic)
goto out;
r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
if (r)
goto out;
r = -EFAULT;
if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
goto out;
r = 0;
break;
}
case KVM_SET_LAPIC: {
r = -EINVAL;
if (!lapic_in_kernel(vcpu))
goto out;
u.lapic = memdup_user(argp, sizeof(*u.lapic));
if (IS_ERR(u.lapic)) {
r = PTR_ERR(u.lapic);
goto out_nofree;
}
r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
break;
}
case KVM_INTERRUPT: {
struct kvm_interrupt irq;
r = -EFAULT;
if (copy_from_user(&irq, argp, sizeof(irq)))
goto out;
r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
break;
}
case KVM_NMI: {
r = kvm_vcpu_ioctl_nmi(vcpu);
break;
}
case KVM_SMI: {
r = kvm_vcpu_ioctl_smi(vcpu);
break;
}
case KVM_SET_CPUID: {
struct kvm_cpuid __user *cpuid_arg = argp;
struct kvm_cpuid cpuid;
r = -EFAULT;
if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out;
r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
break;
}
case KVM_SET_CPUID2: {
struct kvm_cpuid2 __user *cpuid_arg = argp;
struct kvm_cpuid2 cpuid;
r = -EFAULT;
if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out;
r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
cpuid_arg->entries);
break;
}
case KVM_GET_CPUID2: {
struct kvm_cpuid2 __user *cpuid_arg = argp;
struct kvm_cpuid2 cpuid;
r = -EFAULT;
if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out;
r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
cpuid_arg->entries);
if (r)
goto out;
r = -EFAULT;
if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
goto out;
r = 0;
break;
}
case KVM_GET_MSRS: {
int idx = srcu_read_lock(&vcpu->kvm->srcu);
r = msr_io(vcpu, argp, do_get_msr, 1);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
break;
}
case KVM_SET_MSRS: {
int idx = srcu_read_lock(&vcpu->kvm->srcu);
r = msr_io(vcpu, argp, do_set_msr, 0);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
break;
}
case KVM_TPR_ACCESS_REPORTING: {
struct kvm_tpr_access_ctl tac;
r = -EFAULT;
if (copy_from_user(&tac, argp, sizeof(tac)))
goto out;
r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
if (r)
goto out;
r = -EFAULT;
if (copy_to_user(argp, &tac, sizeof(tac)))
goto out;
r = 0;
break;
};
case KVM_SET_VAPIC_ADDR: {
struct kvm_vapic_addr va;
int idx;
r = -EINVAL;
if (!lapic_in_kernel(vcpu))
goto out;
r = -EFAULT;
if (copy_from_user(&va, argp, sizeof(va)))
goto out;
idx = srcu_read_lock(&vcpu->kvm->srcu);
r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
break;
}
case KVM_X86_SETUP_MCE: {
u64 mcg_cap;
r = -EFAULT;
if (copy_from_user(&mcg_cap, argp, sizeof(mcg_cap)))
goto out;
r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
break;
}
case KVM_X86_SET_MCE: {
struct kvm_x86_mce mce;
r = -EFAULT;
if (copy_from_user(&mce, argp, sizeof(mce)))
goto out;
r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
break;
}
case KVM_GET_VCPU_EVENTS: {
struct kvm_vcpu_events events;
kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
r = -EFAULT;
if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
break;
r = 0;
break;
}
case KVM_SET_VCPU_EVENTS: {
struct kvm_vcpu_events events;
r = -EFAULT;
if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
break;
r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
break;
}
case KVM_GET_DEBUGREGS: {
struct kvm_debugregs dbgregs;
kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
r = -EFAULT;
if (copy_to_user(argp, &dbgregs,
sizeof(struct kvm_debugregs)))
break;
r = 0;
break;
}
case KVM_SET_DEBUGREGS: {
struct kvm_debugregs dbgregs;
r = -EFAULT;
if (copy_from_user(&dbgregs, argp,
sizeof(struct kvm_debugregs)))
break;
r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
break;
}
case KVM_GET_XSAVE: {
u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
r = -ENOMEM;
if (!u.xsave)
break;
kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
r = -EFAULT;
if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
break;
r = 0;
break;
}
case KVM_SET_XSAVE: {
u.xsave = memdup_user(argp, sizeof(*u.xsave));
if (IS_ERR(u.xsave)) {
r = PTR_ERR(u.xsave);
goto out_nofree;
}
r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
break;
}
case KVM_GET_XCRS: {
u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
r = -ENOMEM;
if (!u.xcrs)
break;
kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
r = -EFAULT;
if (copy_to_user(argp, u.xcrs,
sizeof(struct kvm_xcrs)))
break;
r = 0;
break;
}
case KVM_SET_XCRS: {
u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
if (IS_ERR(u.xcrs)) {
r = PTR_ERR(u.xcrs);
goto out_nofree;
}
r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
break;
}
case KVM_SET_TSC_KHZ: {
u32 user_tsc_khz;
r = -EINVAL;
user_tsc_khz = (u32)arg;
if (user_tsc_khz >= kvm_max_guest_tsc_khz)
goto out;
if (user_tsc_khz == 0)
user_tsc_khz = tsc_khz;
if (!kvm_set_tsc_khz(vcpu, user_tsc_khz))
r = 0;
goto out;
}
case KVM_GET_TSC_KHZ: {
r = vcpu->arch.virtual_tsc_khz;
goto out;
}
case KVM_KVMCLOCK_CTRL: {
r = kvm_set_guest_paused(vcpu);
goto out;
}
case KVM_ENABLE_CAP: {
struct kvm_enable_cap cap;
r = -EFAULT;
if (copy_from_user(&cap, argp, sizeof(cap)))
goto out;
r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
break;
}
case KVM_GET_NESTED_STATE: {
struct kvm_nested_state __user *user_kvm_nested_state = argp;
u32 user_data_size;
r = -EINVAL;
if (!kvm_x86_ops->get_nested_state)
break;
BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
r = -EFAULT;
if (get_user(user_data_size, &user_kvm_nested_state->size))
break;
r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state,
user_data_size);
if (r < 0)
break;
if (r > user_data_size) {
if (put_user(r, &user_kvm_nested_state->size))
r = -EFAULT;
else
r = -E2BIG;
break;
}
r = 0;
break;
}
case KVM_SET_NESTED_STATE: {
struct kvm_nested_state __user *user_kvm_nested_state = argp;
struct kvm_nested_state kvm_state;
r = -EINVAL;
if (!kvm_x86_ops->set_nested_state)
break;
r = -EFAULT;
if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state)))
break;
r = -EINVAL;
if (kvm_state.size < sizeof(kvm_state))
break;
if (kvm_state.flags &
~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE
| KVM_STATE_NESTED_EVMCS))
break;
/* nested_run_pending implies guest_mode. */
if ((kvm_state.flags & KVM_STATE_NESTED_RUN_PENDING)
&& !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE))
break;
r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
break;
}
case KVM_GET_SUPPORTED_HV_CPUID: {
struct kvm_cpuid2 __user *cpuid_arg = argp;
struct kvm_cpuid2 cpuid;
r = -EFAULT;
if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out;
r = kvm_vcpu_ioctl_get_hv_cpuid(vcpu, &cpuid,
cpuid_arg->entries);
if (r)
goto out;
r = -EFAULT;
if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
goto out;
r = 0;
break;
}
default:
r = -EINVAL;
}
out:
kfree(u.buffer);
out_nofree:
vcpu_put(vcpu);
return r;
}
vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
{
return VM_FAULT_SIGBUS;
}
static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
{
int ret;
if (addr > (unsigned int)(-3 * PAGE_SIZE))
return -EINVAL;
ret = kvm_x86_ops->set_tss_addr(kvm, addr);
return ret;
}
static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
u64 ident_addr)
{
return kvm_x86_ops->set_identity_map_addr(kvm, ident_addr);
}
static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
u32 kvm_nr_mmu_pages)
{
if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
return -EINVAL;
mutex_lock(&kvm->slots_lock);
kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
mutex_unlock(&kvm->slots_lock);
return 0;
}
static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
{
return kvm->arch.n_max_mmu_pages;
}
static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
{
struct kvm_pic *pic = kvm->arch.vpic;
int r;
r = 0;
switch (chip->chip_id) {
case KVM_IRQCHIP_PIC_MASTER:
memcpy(&chip->chip.pic, &pic->pics[0],
sizeof(struct kvm_pic_state));
break;
case KVM_IRQCHIP_PIC_SLAVE:
memcpy(&chip->chip.pic, &pic->pics[1],
sizeof(struct kvm_pic_state));
break;
case KVM_IRQCHIP_IOAPIC:
kvm_get_ioapic(kvm, &chip->chip.ioapic);
break;
default:
r = -EINVAL;
break;
}
return r;
}
static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
{
struct kvm_pic *pic = kvm->arch.vpic;
int r;
r = 0;
switch (chip->chip_id) {
case KVM_IRQCHIP_PIC_MASTER:
spin_lock(&pic->lock);
memcpy(&pic->pics[0], &chip->chip.pic,
sizeof(struct kvm_pic_state));
spin_unlock(&pic->lock);
break;
case KVM_IRQCHIP_PIC_SLAVE:
spin_lock(&pic->lock);
memcpy(&pic->pics[1], &chip->chip.pic,
sizeof(struct kvm_pic_state));
spin_unlock(&pic->lock);
break;
case KVM_IRQCHIP_IOAPIC:
kvm_set_ioapic(kvm, &chip->chip.ioapic);
break;
default:
r = -EINVAL;
break;
}
kvm_pic_update_irq(pic);
return r;
}
static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
mutex_lock(&kps->lock);
memcpy(ps, &kps->channels, sizeof(*ps));
mutex_unlock(&kps->lock);
return 0;
}
static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
int i;
struct kvm_pit *pit = kvm->arch.vpit;
mutex_lock(&pit->pit_state.lock);
memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
for (i = 0; i < 3; i++)
kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
mutex_unlock(&pit->pit_state.lock);
return 0;
}
static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
{
mutex_lock(&kvm->arch.vpit->pit_state.lock);
memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
sizeof(ps->channels));
ps->flags = kvm->arch.vpit->pit_state.flags;
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
memset(&ps->reserved, 0, sizeof(ps->reserved));
return 0;
}
static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
{
int start = 0;
int i;
u32 prev_legacy, cur_legacy;
struct kvm_pit *pit = kvm->arch.vpit;
mutex_lock(&pit->pit_state.lock);
prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
if (!prev_legacy && cur_legacy)
start = 1;
memcpy(&pit->pit_state.channels, &ps->channels,
sizeof(pit->pit_state.channels));
pit->pit_state.flags = ps->flags;
for (i = 0; i < 3; i++)
kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
start && i == 0);
mutex_unlock(&pit->pit_state.lock);
return 0;
}
static int kvm_vm_ioctl_reinject(struct kvm *kvm,
struct kvm_reinject_control *control)
{
struct kvm_pit *pit = kvm->arch.vpit;
if (!pit)
return -ENXIO;
/* pit->pit_state.lock was overloaded to prevent userspace from getting
* an inconsistent state after running multiple KVM_REINJECT_CONTROL
* ioctls in parallel. Use a separate lock if that ioctl isn't rare.
*/
mutex_lock(&pit->pit_state.lock);
kvm_pit_set_reinject(pit, control->pit_reinject);
mutex_unlock(&pit->pit_state.lock);
return 0;
}
/**
* kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
* @kvm: kvm instance
* @log: slot id and address to which we copy the log
*
* Steps 1-4 below provide general overview of dirty page logging. See
* kvm_get_dirty_log_protect() function description for additional details.
*
* We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
* always flush the TLB (step 4) even if previous step failed and the dirty
* bitmap may be corrupt. Regardless of previous outcome the KVM logging API
* does not preclude user space subsequent dirty log read. Flushing TLB ensures
* writes will be marked dirty for next log read.
*
* 1. Take a snapshot of the bit and clear it if needed.
* 2. Write protect the corresponding page.
* 3. Copy the snapshot to the userspace.
* 4. Flush TLB's if needed.
*/
int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
{
bool flush = false;
int r;
mutex_lock(&kvm->slots_lock);
/*
* Flush potentially hardware-cached dirty pages to dirty_bitmap.
*/
if (kvm_x86_ops->flush_log_dirty)
kvm_x86_ops->flush_log_dirty(kvm);
r = kvm_get_dirty_log_protect(kvm, log, &flush);
/*
* All the TLBs can be flushed out of mmu lock, see the comments in
* kvm_mmu_slot_remove_write_access().
*/
lockdep_assert_held(&kvm->slots_lock);
if (flush)
kvm_flush_remote_tlbs(kvm);
mutex_unlock(&kvm->slots_lock);
return r;
}
int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log)
{
bool flush = false;
int r;
mutex_lock(&kvm->slots_lock);
/*
* Flush potentially hardware-cached dirty pages to dirty_bitmap.
*/
if (kvm_x86_ops->flush_log_dirty)
kvm_x86_ops->flush_log_dirty(kvm);
r = kvm_clear_dirty_log_protect(kvm, log, &flush);
/*
* All the TLBs can be flushed out of mmu lock, see the comments in
* kvm_mmu_slot_remove_write_access().
*/
lockdep_assert_held(&kvm->slots_lock);
if (flush)
kvm_flush_remote_tlbs(kvm);
mutex_unlock(&kvm->slots_lock);
return r;
}
int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
bool line_status)
{
if (!irqchip_in_kernel(kvm))
return -ENXIO;
irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
irq_event->irq, irq_event->level,
line_status);
return 0;
}
int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
struct kvm_enable_cap *cap)
{
int r;
if (cap->flags)
return -EINVAL;
switch (cap->cap) {
case KVM_CAP_DISABLE_QUIRKS:
kvm->arch.disabled_quirks = cap->args[0];
r = 0;
break;
case KVM_CAP_SPLIT_IRQCHIP: {
mutex_lock(&kvm->lock);
r = -EINVAL;
if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
goto split_irqchip_unlock;
r = -EEXIST;
if (irqchip_in_kernel(kvm))
goto split_irqchip_unlock;
if (kvm->created_vcpus)
goto split_irqchip_unlock;
r = kvm_setup_empty_irq_routing(kvm);
if (r)
goto split_irqchip_unlock;
/* Pairs with irqchip_in_kernel. */
smp_wmb();
kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
r = 0;
split_irqchip_unlock:
mutex_unlock(&kvm->lock);
break;
}
case KVM_CAP_X2APIC_API:
r = -EINVAL;
if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
break;
if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
kvm->arch.x2apic_format = true;
if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
kvm->arch.x2apic_broadcast_quirk_disabled = true;
r = 0;
break;
case KVM_CAP_X86_DISABLE_EXITS:
r = -EINVAL;
if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS)
break;
if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) &&
kvm_can_mwait_in_guest())
kvm->arch.mwait_in_guest = true;
if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
kvm->arch.hlt_in_guest = true;
if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
kvm->arch.pause_in_guest = true;
r = 0;
break;
case KVM_CAP_MSR_PLATFORM_INFO:
kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
r = 0;
break;
case KVM_CAP_EXCEPTION_PAYLOAD:
kvm->arch.exception_payload_enabled = cap->args[0];
r = 0;
break;
default:
r = -EINVAL;
break;
}
return r;
}
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
struct kvm *kvm = filp->private_data;
void __user *argp = (void __user *)arg;
int r = -ENOTTY;
/*
* This union makes it completely explicit to gcc-3.x
* that these two variables' stack usage should be
* combined, not added together.
*/
union {
struct kvm_pit_state ps;
struct kvm_pit_state2 ps2;
struct kvm_pit_config pit_config;
} u;
switch (ioctl) {
case KVM_SET_TSS_ADDR:
r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
break;
case KVM_SET_IDENTITY_MAP_ADDR: {
u64 ident_addr;
mutex_lock(&kvm->lock);
r = -EINVAL;
if (kvm->created_vcpus)
goto set_identity_unlock;
r = -EFAULT;
if (copy_from_user(&ident_addr, argp, sizeof(ident_addr)))
goto set_identity_unlock;
r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
set_identity_unlock:
mutex_unlock(&kvm->lock);
break;
}
case KVM_SET_NR_MMU_PAGES:
r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
break;
case KVM_GET_NR_MMU_PAGES:
r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
break;
case KVM_CREATE_IRQCHIP: {
mutex_lock(&kvm->lock);
r = -EEXIST;
if (irqchip_in_kernel(kvm))
goto create_irqchip_unlock;
r = -EINVAL;
if (kvm->created_vcpus)
goto create_irqchip_unlock;
r = kvm_pic_init(kvm);
if (r)
goto create_irqchip_unlock;
r = kvm_ioapic_init(kvm);
if (r) {
kvm_pic_destroy(kvm);
goto create_irqchip_unlock;
}
r = kvm_setup_default_irq_routing(kvm);
if (r) {
kvm_ioapic_destroy(kvm);
kvm_pic_destroy(kvm);
goto create_irqchip_unlock;
}
/* Write kvm->irq_routing before enabling irqchip_in_kernel. */
smp_wmb();
kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
create_irqchip_unlock:
mutex_unlock(&kvm->lock);
break;
}
case KVM_CREATE_PIT:
u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
goto create_pit;
case KVM_CREATE_PIT2:
r = -EFAULT;
if (copy_from_user(&u.pit_config, argp,
sizeof(struct kvm_pit_config)))
goto out;
create_pit:
mutex_lock(&kvm->lock);
r = -EEXIST;
if (kvm->arch.vpit)
goto create_pit_unlock;
r = -ENOMEM;
kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
if (kvm->arch.vpit)
r = 0;
create_pit_unlock:
mutex_unlock(&kvm->lock);
break;
case KVM_GET_IRQCHIP: {
/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
struct kvm_irqchip *chip;
chip = memdup_user(argp, sizeof(*chip));
if (IS_ERR(chip)) {
r = PTR_ERR(chip);
goto out;
}
r = -ENXIO;
if (!irqchip_kernel(kvm))
goto get_irqchip_out;
r = kvm_vm_ioctl_get_irqchip(kvm, chip);
if (r)
goto get_irqchip_out;
r = -EFAULT;
if (copy_to_user(argp, chip, sizeof(*chip)))
goto get_irqchip_out;
r = 0;
get_irqchip_out:
kfree(chip);
break;
}
case KVM_SET_IRQCHIP: {
/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
struct kvm_irqchip *chip;
chip = memdup_user(argp, sizeof(*chip));
if (IS_ERR(chip)) {
r = PTR_ERR(chip);
goto out;
}
r = -ENXIO;
if (!irqchip_kernel(kvm))
goto set_irqchip_out;
r = kvm_vm_ioctl_set_irqchip(kvm, chip);
if (r)
goto set_irqchip_out;
r = 0;
set_irqchip_out:
kfree(chip);
break;
}
case KVM_GET_PIT: {
r = -EFAULT;
if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
goto out;
r = -ENXIO;
if (!kvm->arch.vpit)
goto out;
r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
if (r)
goto out;
r = -EFAULT;
if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
goto out;
r = 0;
break;
}
case KVM_SET_PIT: {
r = -EFAULT;
if (copy_from_user(&u.ps, argp, sizeof(u.ps)))
goto out;
r = -ENXIO;
if (!kvm->arch.vpit)
goto out;
r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
break;
}
case KVM_GET_PIT2: {
r = -ENXIO;
if (!kvm->arch.vpit)
goto out;
r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
if (r)
goto out;
r = -EFAULT;
if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
goto out;
r = 0;
break;
}
case KVM_SET_PIT2: {
r = -EFAULT;
if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
goto out;
r = -ENXIO;
if (!kvm->arch.vpit)
goto out;
r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
break;
}
case KVM_REINJECT_CONTROL: {
struct kvm_reinject_control control;
r = -EFAULT;
if (copy_from_user(&control, argp, sizeof(control)))
goto out;
r = kvm_vm_ioctl_reinject(kvm, &control);
break;
}
case KVM_SET_BOOT_CPU_ID:
r = 0;
mutex_lock(&kvm->lock);
if (kvm->created_vcpus)
r = -EBUSY;
else
kvm->arch.bsp_vcpu_id = arg;
mutex_unlock(&kvm->lock);
break;
case KVM_XEN_HVM_CONFIG: {
struct kvm_xen_hvm_config xhc;
r = -EFAULT;
if (copy_from_user(&xhc, argp, sizeof(xhc)))
goto out;
r = -EINVAL;
if (xhc.flags)
goto out;
memcpy(&kvm->arch.xen_hvm_config, &xhc, sizeof(xhc));
r = 0;
break;
}
case KVM_SET_CLOCK: {
struct kvm_clock_data user_ns;
u64 now_ns;
r = -EFAULT;
if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
goto out;
r = -EINVAL;
if (user_ns.flags)
goto out;
r = 0;
/*
* TODO: userspace has to take care of races with VCPU_RUN, so
* kvm_gen_update_masterclock() can be cut down to locked
* pvclock_update_vm_gtod_copy().
*/
kvm_gen_update_masterclock(kvm);
now_ns = get_kvmclock_ns(kvm);
kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
break;
}
case KVM_GET_CLOCK: {
struct kvm_clock_data user_ns;
u64 now_ns;
now_ns = get_kvmclock_ns(kvm);
user_ns.clock = now_ns;
user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0;
memset(&user_ns.pad, 0, sizeof(user_ns.pad));
r = -EFAULT;
if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
goto out;
r = 0;
break;
}
case KVM_MEMORY_ENCRYPT_OP: {
r = -ENOTTY;
if (kvm_x86_ops->mem_enc_op)
r = kvm_x86_ops->mem_enc_op(kvm, argp);
break;
}
case KVM_MEMORY_ENCRYPT_REG_REGION: {
struct kvm_enc_region region;
r = -EFAULT;
if (copy_from_user(®ion, argp, sizeof(region)))
goto out;
r = -ENOTTY;
if (kvm_x86_ops->mem_enc_reg_region)
r = kvm_x86_ops->mem_enc_reg_region(kvm, ®ion);
break;
}
case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
struct kvm_enc_region region;
r = -EFAULT;
if (copy_from_user(®ion, argp, sizeof(region)))
goto out;
r = -ENOTTY;
if (kvm_x86_ops->mem_enc_unreg_region)
r = kvm_x86_ops->mem_enc_unreg_region(kvm, ®ion);
break;
}
case KVM_HYPERV_EVENTFD: {
struct kvm_hyperv_eventfd hvevfd;
r = -EFAULT;
if (copy_from_user(&hvevfd, argp, sizeof(hvevfd)))
goto out;
r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
break;
}
default:
r = -ENOTTY;
}
out:
return r;
}
static void kvm_init_msr_list(void)
{
u32 dummy[2];
unsigned i, j;
for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
continue;
/*
* Even MSRs that are valid in the host may not be exposed
* to the guests in some cases.
*/
switch (msrs_to_save[i]) {
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported())
continue;
break;
case MSR_TSC_AUX:
if (!kvm_x86_ops->rdtscp_supported())
continue;
break;
case MSR_IA32_RTIT_CTL:
case MSR_IA32_RTIT_STATUS:
if (!kvm_x86_ops->pt_supported())
continue;
break;
case MSR_IA32_RTIT_CR3_MATCH:
if (!kvm_x86_ops->pt_supported() ||
!intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
continue;
break;
case MSR_IA32_RTIT_OUTPUT_BASE:
case MSR_IA32_RTIT_OUTPUT_MASK:
if (!kvm_x86_ops->pt_supported() ||
(!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
!intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
continue;
break;
case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: {
if (!kvm_x86_ops->pt_supported() ||
msrs_to_save[i] - MSR_IA32_RTIT_ADDR0_A >=
intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
continue;
break;
}
default:
break;
}
if (j < i)
msrs_to_save[j] = msrs_to_save[i];
j++;
}
num_msrs_to_save = j;
for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) {
if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i]))
continue;
if (j < i)
emulated_msrs[j] = emulated_msrs[i];
j++;
}
num_emulated_msrs = j;
for (i = j = 0; i < ARRAY_SIZE(msr_based_features); i++) {
struct kvm_msr_entry msr;
msr.index = msr_based_features[i];
if (kvm_get_msr_feature(&msr))
continue;
if (j < i)
msr_based_features[j] = msr_based_features[i];
j++;
}
num_msr_based_features = j;
}
static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
const void *v)
{
int handled = 0;
int n;
do {
n = min(len, 8);
if (!(lapic_in_kernel(vcpu) &&
!kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
&& kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
break;
handled += n;
addr += n;
len -= n;
v += n;
} while (len);
return handled;
}
static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
{
int handled = 0;
int n;
do {
n = min(len, 8);
if (!(lapic_in_kernel(vcpu) &&
!kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
addr, n, v))
&& kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
break;
trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
handled += n;
addr += n;
len -= n;
v += n;
} while (len);
return handled;
}
static void kvm_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
kvm_x86_ops->set_segment(vcpu, var, seg);
}
void kvm_get_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
kvm_x86_ops->get_segment(vcpu, var, seg);
}
gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
struct x86_exception *exception)
{
gpa_t t_gpa;
BUG_ON(!mmu_is_nested(vcpu));
/* NPT walks are always user-walks */
access |= PFERR_USER_MASK;
t_gpa = vcpu->arch.mmu->gva_to_gpa(vcpu, gpa, access, exception);
return t_gpa;
}
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_FETCH_MASK;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_WRITE_MASK;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
/* uses this to access any guest's mapped memory without checking CPL */
gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
}
static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
struct kvm_vcpu *vcpu, u32 access,
struct x86_exception *exception)
{
void *data = val;
int r = X86EMUL_CONTINUE;
while (bytes) {
gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
exception);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data,
offset, toread);
if (ret < 0) {
r = X86EMUL_IO_NEEDED;
goto out;
}
bytes -= toread;
data += toread;
addr += toread;
}
out:
return r;
}
/* used for instruction fetching */
static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
unsigned offset;
int ret;
/* Inline kvm_read_guest_virt_helper for speed. */
gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access|PFERR_FETCH_MASK,
exception);
if (unlikely(gpa == UNMAPPED_GVA))
return X86EMUL_PROPAGATE_FAULT;
offset = addr & (PAGE_SIZE-1);
if (WARN_ON(offset + bytes > PAGE_SIZE))
bytes = (unsigned)PAGE_SIZE - offset;
ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, val,
offset, bytes);
if (unlikely(ret < 0))
return X86EMUL_IO_NEEDED;
return X86EMUL_CONTINUE;
}
int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
/*
* FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
* is returned, but our callers are not ready for that and they blindly
* call kvm_inject_page_fault. Ensure that they at least do not leak
* uninitialized kernel stack memory into cr2 and error code.
*/
memset(exception, 0, sizeof(*exception));
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
exception);
}
EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception, bool system)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
u32 access = 0;
if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
access |= PFERR_USER_MASK;
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
}
static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
unsigned long addr, void *val, unsigned int bytes)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes);
return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
}
static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
struct kvm_vcpu *vcpu, u32 access,
struct x86_exception *exception)
{
void *data = val;
int r = X86EMUL_CONTINUE;
while (bytes) {
gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
access,
exception);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite);
if (ret < 0) {
r = X86EMUL_IO_NEEDED;
goto out;
}
bytes -= towrite;
data += towrite;
addr += towrite;
}
out:
return r;
}
static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val,
unsigned int bytes, struct x86_exception *exception,
bool system)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
u32 access = PFERR_WRITE_MASK;
if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
access |= PFERR_USER_MASK;
return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
access, exception);
}
int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
unsigned int bytes, struct x86_exception *exception)
{
/* kvm_write_guest_virt_system can pull in tons of pages. */
vcpu->arch.l1tf_flush_l1d = true;
return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
PFERR_WRITE_MASK, exception);
}
EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
int handle_ud(struct kvm_vcpu *vcpu)
{
int emul_type = EMULTYPE_TRAP_UD;
enum emulation_result er;
char sig[5]; /* ud2; .ascii "kvm" */
struct x86_exception e;
if (force_emulation_prefix &&
kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
sig, sizeof(sig), &e) == 0 &&
memcmp(sig, "\xf\xbkvm", sizeof(sig)) == 0) {
kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
emul_type = 0;
}
er = kvm_emulate_instruction(vcpu, emul_type);
if (er == EMULATE_USER_EXIT)
return 0;
if (er != EMULATE_DONE)
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
EXPORT_SYMBOL_GPL(handle_ud);
static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
gpa_t gpa, bool write)
{
/* For APIC access vmexit */
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
return 1;
if (vcpu_match_mmio_gpa(vcpu, gpa)) {
trace_vcpu_match_mmio(gva, gpa, write, true);
return 1;
}
return 0;
}
static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
gpa_t *gpa, struct x86_exception *exception,
bool write)
{
u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
| (write ? PFERR_WRITE_MASK : 0);
/*
* currently PKRU is only applied to ept enabled guest so
* there is no pkey in EPT page table for L1 guest or EPT
* shadow page table for L2 guest.
*/
if (vcpu_match_mmio_gva(vcpu, gva)
&& !permission_fault(vcpu, vcpu->arch.walk_mmu,
vcpu->arch.access, 0, access)) {
*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
(gva & (PAGE_SIZE - 1));
trace_vcpu_match_mmio(gva, *gpa, write, false);
return 1;
}
*gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
if (*gpa == UNMAPPED_GVA)
return -1;
return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write);
}
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
const void *val, int bytes)
{
int ret;
ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
if (ret < 0)
return 0;
kvm_page_track_write(vcpu, gpa, val, bytes);
return 1;
}
struct read_write_emulator_ops {
int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val,
int bytes);
int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa,
void *val, int bytes);
int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
int bytes, void *val);
int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
void *val, int bytes);
bool write;
};
static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
{
if (vcpu->mmio_read_completed) {
trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
vcpu->mmio_fragments[0].gpa, val);
vcpu->mmio_read_completed = 0;
return 1;
}
return 0;
}
static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
void *val, int bytes)
{
return !kvm_vcpu_read_guest(vcpu, gpa, val, bytes);
}
static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
void *val, int bytes)
{
return emulator_write_phys(vcpu, gpa, val, bytes);
}
static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
{
trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val);
return vcpu_mmio_write(vcpu, gpa, bytes, val);
}
static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
void *val, int bytes)
{
trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL);
return X86EMUL_IO_NEEDED;
}
static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
void *val, int bytes)
{
struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
return X86EMUL_CONTINUE;
}
static const struct read_write_emulator_ops read_emultor = {
.read_write_prepare = read_prepare,
.read_write_emulate = read_emulate,
.read_write_mmio = vcpu_mmio_read,
.read_write_exit_mmio = read_exit_mmio,
};
static const struct read_write_emulator_ops write_emultor = {
.read_write_emulate = write_emulate,
.read_write_mmio = write_mmio,
.read_write_exit_mmio = write_exit_mmio,
.write = true,
};
static int emulator_read_write_onepage(unsigned long addr, void *val,
unsigned int bytes,
struct x86_exception *exception,
struct kvm_vcpu *vcpu,
const struct read_write_emulator_ops *ops)
{
gpa_t gpa;
int handled, ret;
bool write = ops->write;
struct kvm_mmio_fragment *frag;
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
/*
* If the exit was due to a NPF we may already have a GPA.
* If the GPA is present, use it to avoid the GVA to GPA table walk.
* Note, this cannot be used on string operations since string
* operation using rep will only have the initial GPA from the NPF
* occurred.
*/
if (vcpu->arch.gpa_available &&
emulator_can_use_gpa(ctxt) &&
(addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) {
gpa = vcpu->arch.gpa_val;
ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
} else {
ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
if (ret < 0)
return X86EMUL_PROPAGATE_FAULT;
}
if (!ret && ops->read_write_emulate(vcpu, gpa, val, bytes))
return X86EMUL_CONTINUE;
/*
* Is this MMIO handled locally?
*/
handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
if (handled == bytes)
return X86EMUL_CONTINUE;
gpa += handled;
bytes -= handled;
val += handled;
WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS);
frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
frag->gpa = gpa;
frag->data = val;
frag->len = bytes;
return X86EMUL_CONTINUE;
}
static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
unsigned long addr,
void *val, unsigned int bytes,
struct x86_exception *exception,
const struct read_write_emulator_ops *ops)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
gpa_t gpa;
int rc;
if (ops->read_write_prepare &&
ops->read_write_prepare(vcpu, val, bytes))
return X86EMUL_CONTINUE;
vcpu->mmio_nr_fragments = 0;
/* Crossing a page boundary? */
if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
int now;
now = -addr & ~PAGE_MASK;
rc = emulator_read_write_onepage(addr, val, now, exception,
vcpu, ops);
if (rc != X86EMUL_CONTINUE)
return rc;
addr += now;
if (ctxt->mode != X86EMUL_MODE_PROT64)
addr = (u32)addr;
val += now;
bytes -= now;
}
rc = emulator_read_write_onepage(addr, val, bytes, exception,
vcpu, ops);
if (rc != X86EMUL_CONTINUE)
return rc;
if (!vcpu->mmio_nr_fragments)
return rc;
gpa = vcpu->mmio_fragments[0].gpa;
vcpu->mmio_needed = 1;
vcpu->mmio_cur_fragment = 0;
vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
vcpu->run->exit_reason = KVM_EXIT_MMIO;
vcpu->run->mmio.phys_addr = gpa;
return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
}
static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
unsigned long addr,
void *val,
unsigned int bytes,
struct x86_exception *exception)
{
return emulator_read_write(ctxt, addr, val, bytes,
exception, &read_emultor);
}
static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
unsigned long addr,
const void *val,
unsigned int bytes,
struct x86_exception *exception)
{
return emulator_read_write(ctxt, addr, (void *)val, bytes,
exception, &write_emultor);
}
#define CMPXCHG_TYPE(t, ptr, old, new) \
(cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
#ifdef CONFIG_X86_64
# define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
#else
# define CMPXCHG64(ptr, old, new) \
(cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
#endif
static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
unsigned long addr,
const void *old,
const void *new,
unsigned int bytes,
struct x86_exception *exception)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
gpa_t gpa;
struct page *page;
char *kaddr;
bool exchanged;
/* guests cmpxchg8b have to be emulated atomically */
if (bytes > 8 || (bytes & (bytes - 1)))
goto emul_write;
gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
if (gpa == UNMAPPED_GVA ||
(gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
goto emul_write;
if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
goto emul_write;
page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
if (is_error_page(page))
goto emul_write;
kaddr = kmap_atomic(page);
kaddr += offset_in_page(gpa);
switch (bytes) {
case 1:
exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
break;
case 2:
exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
break;
case 4:
exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
break;
case 8:
exchanged = CMPXCHG64(kaddr, old, new);
break;
default:
BUG();
}
kunmap_atomic(kaddr);
kvm_release_page_dirty(page);
if (!exchanged)
return X86EMUL_CMPXCHG_FAILED;
kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
kvm_page_track_write(vcpu, gpa, new, bytes);
return X86EMUL_CONTINUE;
emul_write:
printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
return emulator_write_emulated(ctxt, addr, new, bytes, exception);
}
static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
{
int r = 0, i;
for (i = 0; i < vcpu->arch.pio.count; i++) {
if (vcpu->arch.pio.in)
r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
vcpu->arch.pio.size, pd);
else
r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
vcpu->arch.pio.port, vcpu->arch.pio.size,
pd);
if (r)
break;
pd += vcpu->arch.pio.size;
}
return r;
}
static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
unsigned short port, void *val,
unsigned int count, bool in)
{
vcpu->arch.pio.port = port;
vcpu->arch.pio.in = in;
vcpu->arch.pio.count = count;
vcpu->arch.pio.size = size;
if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
vcpu->arch.pio.count = 0;
return 1;
}
vcpu->run->exit_reason = KVM_EXIT_IO;
vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
vcpu->run->io.size = size;
vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
vcpu->run->io.count = count;
vcpu->run->io.port = port;
return 0;
}
static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
int size, unsigned short port, void *val,
unsigned int count)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
int ret;
if (vcpu->arch.pio.count)
goto data_avail;
memset(vcpu->arch.pio_data, 0, size * count);
ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
if (ret) {
data_avail:
memcpy(val, vcpu->arch.pio_data, size * count);
trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data);
vcpu->arch.pio.count = 0;
return 1;
}
return 0;
}
static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
int size, unsigned short port,
const void *val, unsigned int count)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
memcpy(vcpu->arch.pio_data, val, size * count);
trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
}
static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
return kvm_x86_ops->get_segment_base(vcpu, seg);
}
static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
{
kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
}
static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
{
if (!need_emulate_wbinvd(vcpu))
return X86EMUL_CONTINUE;
if (kvm_x86_ops->has_wbinvd_exit()) {
int cpu = get_cpu();
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
wbinvd_ipi, NULL, 1);
put_cpu();
cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
} else
wbinvd();
return X86EMUL_CONTINUE;
}
int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
{
kvm_emulate_wbinvd_noskip(vcpu);
return kvm_skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
{
kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
}
static int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
unsigned long *dest)
{
return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
}
static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
unsigned long value)
{
return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
}
static u64 mk_cr_64(u64 curr_cr, u32 new_val)
{
return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
}
static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
unsigned long value;
switch (cr) {
case 0:
value = kvm_read_cr0(vcpu);
break;
case 2:
value = vcpu->arch.cr2;
break;
case 3:
value = kvm_read_cr3(vcpu);
break;
case 4:
value = kvm_read_cr4(vcpu);
break;
case 8:
value = kvm_get_cr8(vcpu);
break;
default:
kvm_err("%s: unexpected cr %u\n", __func__, cr);
return 0;
}
return value;
}
static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
int res = 0;
switch (cr) {
case 0:
res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
break;
case 2:
vcpu->arch.cr2 = val;
break;
case 3:
res = kvm_set_cr3(vcpu, val);
break;
case 4:
res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
break;
case 8:
res = kvm_set_cr8(vcpu, val);
break;
default:
kvm_err("%s: unexpected cr %u\n", __func__, cr);
res = -1;
}
return res;
}
static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
{
return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
}
static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt);
}
static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt);
}
static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt);
}
static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt);
}
static unsigned long emulator_get_cached_segment_base(
struct x86_emulate_ctxt *ctxt, int seg)
{
return get_segment_base(emul_to_vcpu(ctxt), seg);
}
static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
struct desc_struct *desc, u32 *base3,
int seg)
{
struct kvm_segment var;
kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
*selector = var.selector;
if (var.unusable) {
memset(desc, 0, sizeof(*desc));
if (base3)
*base3 = 0;
return false;
}
if (var.g)
var.limit >>= 12;
set_desc_limit(desc, var.limit);
set_desc_base(desc, (unsigned long)var.base);
#ifdef CONFIG_X86_64
if (base3)
*base3 = var.base >> 32;
#endif
desc->type = var.type;
desc->s = var.s;
desc->dpl = var.dpl;
desc->p = var.present;
desc->avl = var.avl;
desc->l = var.l;
desc->d = var.db;
desc->g = var.g;
return true;
}
static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
struct desc_struct *desc, u32 base3,
int seg)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
struct kvm_segment var;
var.selector = selector;
var.base = get_desc_base(desc);
#ifdef CONFIG_X86_64
var.base |= ((u64)base3) << 32;
#endif
var.limit = get_desc_limit(desc);
if (desc->g)
var.limit = (var.limit << 12) | 0xfff;
var.type = desc->type;
var.dpl = desc->dpl;
var.db = desc->d;
var.s = desc->s;
var.l = desc->l;
var.g = desc->g;
var.avl = desc->avl;
var.present = desc->p;
var.unusable = !var.present;
var.padding = 0;
kvm_set_segment(vcpu, &var, seg);
return;
}
static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
u32 msr_index, u64 *pdata)
{
struct msr_data msr;
int r;
msr.index = msr_index;
msr.host_initiated = false;
r = kvm_get_msr(emul_to_vcpu(ctxt), &msr);
if (r)
return r;
*pdata = msr.data;
return 0;
}
static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
u32 msr_index, u64 data)
{
struct msr_data msr;
msr.data = data;
msr.index = msr_index;
msr.host_initiated = false;
return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
}
static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
return vcpu->arch.smbase;
}
static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
vcpu->arch.smbase = smbase;
}
static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
u32 pmc)
{
return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc);
}
static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
u32 pmc, u64 *pdata)
{
return kvm_pmu_rdpmc(emul_to_vcpu(ctxt), pmc, pdata);
}
static void emulator_halt(struct x86_emulate_ctxt *ctxt)
{
emul_to_vcpu(ctxt)->arch.halt_request = 1;
}
static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
struct x86_instruction_info *info,
enum x86_intercept_stage stage)
{
return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
}
static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit)
{
return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit);
}
static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
{
return kvm_register_read(emul_to_vcpu(ctxt), reg);
}
static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
{
kvm_register_write(emul_to_vcpu(ctxt), reg, val);
}
static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
{
kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked);
}
static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
{
return emul_to_vcpu(ctxt)->arch.hflags;
}
static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags)
{
kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags);
}
static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, u64 smbase)
{
return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smbase);
}
static const struct x86_emulate_ops emulate_ops = {
.read_gpr = emulator_read_gpr,
.write_gpr = emulator_write_gpr,
.read_std = emulator_read_std,
.write_std = emulator_write_std,
.read_phys = kvm_read_guest_phys_system,
.fetch = kvm_fetch_guest_virt,
.read_emulated = emulator_read_emulated,
.write_emulated = emulator_write_emulated,
.cmpxchg_emulated = emulator_cmpxchg_emulated,
.invlpg = emulator_invlpg,
.pio_in_emulated = emulator_pio_in_emulated,
.pio_out_emulated = emulator_pio_out_emulated,
.get_segment = emulator_get_segment,
.set_segment = emulator_set_segment,
.get_cached_segment_base = emulator_get_cached_segment_base,
.get_gdt = emulator_get_gdt,
.get_idt = emulator_get_idt,
.set_gdt = emulator_set_gdt,
.set_idt = emulator_set_idt,
.get_cr = emulator_get_cr,
.set_cr = emulator_set_cr,
.cpl = emulator_get_cpl,
.get_dr = emulator_get_dr,
.set_dr = emulator_set_dr,
.get_smbase = emulator_get_smbase,
.set_smbase = emulator_set_smbase,
.set_msr = emulator_set_msr,
.get_msr = emulator_get_msr,
.check_pmc = emulator_check_pmc,
.read_pmc = emulator_read_pmc,
.halt = emulator_halt,
.wbinvd = emulator_wbinvd,
.fix_hypercall = emulator_fix_hypercall,
.intercept = emulator_intercept,
.get_cpuid = emulator_get_cpuid,
.set_nmi_mask = emulator_set_nmi_mask,
.get_hflags = emulator_get_hflags,
.set_hflags = emulator_set_hflags,
.pre_leave_smm = emulator_pre_leave_smm,
};
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
{
u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
/*
* an sti; sti; sequence only disable interrupts for the first
* instruction. So, if the last instruction, be it emulated or
* not, left the system with the INT_STI flag enabled, it
* means that the last instruction is an sti. We should not
* leave the flag on in this case. The same goes for mov ss
*/
if (int_shadow & mask)
mask = 0;
if (unlikely(int_shadow || mask)) {
kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
if (!mask)
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
}
static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
{
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
if (ctxt->exception.vector == PF_VECTOR)
return kvm_propagate_fault(vcpu, &ctxt->exception);
if (ctxt->exception.error_code_valid)
kvm_queue_exception_e(vcpu, ctxt->exception.vector,
ctxt->exception.error_code);
else
kvm_queue_exception(vcpu, ctxt->exception.vector);
return false;
}
static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
{
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
int cs_db, cs_l;
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
ctxt->eflags = kvm_get_rflags(vcpu);
ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
ctxt->eip = kvm_rip_read(vcpu);
ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
(ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
(cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 :
cs_db ? X86EMUL_MODE_PROT32 :
X86EMUL_MODE_PROT16;
BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK);
BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
init_decode_cache(ctxt);
vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
}
int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
{
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
int ret;
init_emulate_ctxt(vcpu);
ctxt->op_bytes = 2;
ctxt->ad_bytes = 2;
ctxt->_eip = ctxt->eip + inc_eip;
ret = emulate_int_real(ctxt, irq);
if (ret != X86EMUL_CONTINUE)
return EMULATE_FAIL;
ctxt->eip = ctxt->_eip;
kvm_rip_write(vcpu, ctxt->eip);
kvm_set_rflags(vcpu, ctxt->eflags);
return EMULATE_DONE;
}
EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
{
int r = EMULATE_DONE;
++vcpu->stat.insn_emulation_fail;
trace_kvm_emulate_insn_failed(vcpu);
if (emulation_type & EMULTYPE_NO_UD_ON_FAIL)
return EMULATE_FAIL;
if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
r = EMULATE_USER_EXIT;
}
kvm_queue_exception(vcpu, UD_VECTOR);
return r;
}
static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
bool write_fault_to_shadow_pgtable,
int emulation_type)
{
gpa_t gpa = cr2;
kvm_pfn_t pfn;
if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
return false;
if (WARN_ON_ONCE(is_guest_mode(vcpu)))
return false;
if (!vcpu->arch.mmu->direct_map) {
/*
* Write permission should be allowed since only
* write access need to be emulated.
*/
gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
/*
* If the mapping is invalid in guest, let cpu retry
* it to generate fault.
*/
if (gpa == UNMAPPED_GVA)
return true;
}
/*
* Do not retry the unhandleable instruction if it faults on the
* readonly host memory, otherwise it will goto a infinite loop:
* retry instruction -> write #PF -> emulation fail -> retry
* instruction -> ...
*/
pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
/*
* If the instruction failed on the error pfn, it can not be fixed,
* report the error to userspace.
*/
if (is_error_noslot_pfn(pfn))
return false;
kvm_release_pfn_clean(pfn);
/* The instructions are well-emulated on direct mmu. */
if (vcpu->arch.mmu->direct_map) {
unsigned int indirect_shadow_pages;
spin_lock(&vcpu->kvm->mmu_lock);
indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
spin_unlock(&vcpu->kvm->mmu_lock);
if (indirect_shadow_pages)
kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
return true;
}
/*
* if emulation was due to access to shadowed page table
* and it failed try to unshadow page and re-enter the
* guest to let CPU execute the instruction.
*/
kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
/*
* If the access faults on its page table, it can not
* be fixed by unprotecting shadow page and it should
* be reported to userspace.
*/
return !write_fault_to_shadow_pgtable;
}
static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
unsigned long cr2, int emulation_type)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
unsigned long last_retry_eip, last_retry_addr, gpa = cr2;
last_retry_eip = vcpu->arch.last_retry_eip;
last_retry_addr = vcpu->arch.last_retry_addr;
/*
* If the emulation is caused by #PF and it is non-page_table
* writing instruction, it means the VM-EXIT is caused by shadow
* page protected, we can zap the shadow page and retry this
* instruction directly.
*
* Note: if the guest uses a non-page-table modifying instruction
* on the PDE that points to the instruction, then we will unmap
* the instruction and go to an infinite loop. So, we cache the
* last retried eip and the last fault address, if we meet the eip
* and the address again, we can break out of the potential infinite
* loop.
*/
vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
return false;
if (WARN_ON_ONCE(is_guest_mode(vcpu)))
return false;
if (x86_page_table_writing_insn(ctxt))
return false;
if (ctxt->eip == last_retry_eip && last_retry_addr == cr2)
return false;
vcpu->arch.last_retry_eip = ctxt->eip;
vcpu->arch.last_retry_addr = cr2;
if (!vcpu->arch.mmu->direct_map)
gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
return true;
}
static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
static int complete_emulated_pio(struct kvm_vcpu *vcpu);
static void kvm_smm_changed(struct kvm_vcpu *vcpu)
{
if (!(vcpu->arch.hflags & HF_SMM_MASK)) {
/* This is a good place to trace that we are exiting SMM. */
trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
/* Process a latched INIT or SMI, if any. */
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
kvm_mmu_reset_context(vcpu);
}
static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags)
{
unsigned changed = vcpu->arch.hflags ^ emul_flags;
vcpu->arch.hflags = emul_flags;
if (changed & HF_SMM_MASK)
kvm_smm_changed(vcpu);
}
static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
unsigned long *db)
{
u32 dr6 = 0;
int i;
u32 enable, rwlen;
enable = dr7;
rwlen = dr7 >> 16;
for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
dr6 |= (1 << i);
return dr6;
}
static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r)
{
struct kvm_run *kvm_run = vcpu->run;
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM;
kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
kvm_run->debug.arch.exception = DB_VECTOR;
kvm_run->exit_reason = KVM_EXIT_DEBUG;
*r = EMULATE_USER_EXIT;
} else {
kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS);
}
}
int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
int r = EMULATE_DONE;
kvm_x86_ops->skip_emulated_instruction(vcpu);
/*
* rflags is the old, "raw" value of the flags. The new value has
* not been saved yet.
*
* This is correct even for TF set by the guest, because "the
* processor will not generate this exception after the instruction
* that sets the TF flag".
*/
if (unlikely(rflags & X86_EFLAGS_TF))
kvm_vcpu_do_singlestep(vcpu, &r);
return r == EMULATE_DONE;
}
EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
{
if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
(vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
struct kvm_run *kvm_run = vcpu->run;
unsigned long eip = kvm_get_linear_rip(vcpu);
u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
vcpu->arch.guest_debug_dr7,
vcpu->arch.eff_db);
if (dr6 != 0) {
kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM;
kvm_run->debug.arch.pc = eip;
kvm_run->debug.arch.exception = DB_VECTOR;
kvm_run->exit_reason = KVM_EXIT_DEBUG;
*r = EMULATE_USER_EXIT;
return true;
}
}
if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
!(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) {
unsigned long eip = kvm_get_linear_rip(vcpu);
u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
vcpu->arch.dr7,
vcpu->arch.db);
if (dr6 != 0) {
vcpu->arch.dr6 &= ~15;
vcpu->arch.dr6 |= dr6 | DR6_RTM;
kvm_queue_exception(vcpu, DB_VECTOR);
*r = EMULATE_DONE;
return true;
}
}
return false;
}
static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
{
switch (ctxt->opcode_len) {
case 1:
switch (ctxt->b) {
case 0xe4: /* IN */
case 0xe5:
case 0xec:
case 0xed:
case 0xe6: /* OUT */
case 0xe7:
case 0xee:
case 0xef:
case 0x6c: /* INS */
case 0x6d:
case 0x6e: /* OUTS */
case 0x6f:
return true;
}
break;
case 2:
switch (ctxt->b) {
case 0x33: /* RDPMC */
return true;
}
break;
}
return false;
}
int x86_emulate_instruction(struct kvm_vcpu *vcpu,
unsigned long cr2,
int emulation_type,
void *insn,
int insn_len)
{
int r;
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
bool writeback = true;
bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
vcpu->arch.l1tf_flush_l1d = true;
/*
* Clear write_fault_to_shadow_pgtable here to ensure it is
* never reused.
*/
vcpu->arch.write_fault_to_shadow_pgtable = false;
kvm_clear_exception_queue(vcpu);
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
init_emulate_ctxt(vcpu);
/*
* We will reenter on the same instruction since
* we do not set complete_userspace_io. This does not
* handle watchpoints yet, those would be handled in
* the emulate_ops.
*/
if (!(emulation_type & EMULTYPE_SKIP) &&
kvm_vcpu_check_breakpoint(vcpu, &r))
return r;
ctxt->interruptibility = 0;
ctxt->have_exception = false;
ctxt->exception.vector = -1;
ctxt->perm_ok = false;
ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
r = x86_decode_insn(ctxt, insn, insn_len);
trace_kvm_emulate_insn_start(vcpu);
++vcpu->stat.insn_emulation;
if (r != EMULATION_OK) {
if (emulation_type & EMULTYPE_TRAP_UD)
return EMULATE_FAIL;
if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
emulation_type))
return EMULATE_DONE;
if (ctxt->have_exception && inject_emulated_exception(vcpu))
return EMULATE_DONE;
if (emulation_type & EMULTYPE_SKIP)
return EMULATE_FAIL;
return handle_emulation_failure(vcpu, emulation_type);
}
}
if ((emulation_type & EMULTYPE_VMWARE) &&
!is_vmware_backdoor_opcode(ctxt))
return EMULATE_FAIL;
if (emulation_type & EMULTYPE_SKIP) {
kvm_rip_write(vcpu, ctxt->_eip);
if (ctxt->eflags & X86_EFLAGS_RF)
kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
return EMULATE_DONE;
}
if (retry_instruction(ctxt, cr2, emulation_type))
return EMULATE_DONE;
/* this is needed for vmware backdoor interface to work since it
changes registers values during IO operation */
if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
emulator_invalidate_register_cache(ctxt);
}
restart:
/* Save the faulting GPA (cr2) in the address field */
ctxt->exception.address = cr2;
r = x86_emulate_insn(ctxt);
if (r == EMULATION_INTERCEPTED)
return EMULATE_DONE;
if (r == EMULATION_FAILED) {
if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
emulation_type))
return EMULATE_DONE;
return handle_emulation_failure(vcpu, emulation_type);
}
if (ctxt->have_exception) {
r = EMULATE_DONE;
if (inject_emulated_exception(vcpu))
return r;
} else if (vcpu->arch.pio.count) {
if (!vcpu->arch.pio.in) {
/* FIXME: return into emulator if single-stepping. */
vcpu->arch.pio.count = 0;
} else {
writeback = false;
vcpu->arch.complete_userspace_io = complete_emulated_pio;
}
r = EMULATE_USER_EXIT;
} else if (vcpu->mmio_needed) {
if (!vcpu->mmio_is_write)
writeback = false;
r = EMULATE_USER_EXIT;
vcpu->arch.complete_userspace_io = complete_emulated_mmio;
} else if (r == EMULATION_RESTART)
goto restart;
else
r = EMULATE_DONE;
if (writeback) {
unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
toggle_interruptibility(vcpu, ctxt->interruptibility);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
kvm_rip_write(vcpu, ctxt->eip);
if (r == EMULATE_DONE && ctxt->tf)
kvm_vcpu_do_singlestep(vcpu, &r);
if (!ctxt->have_exception ||
exception_type(ctxt->exception.vector) == EXCPT_TRAP)
__kvm_set_rflags(vcpu, ctxt->eflags);
/*
* For STI, interrupts are shadowed; so KVM_REQ_EVENT will
* do nothing, and it will be requested again as soon as
* the shadow expires. But we still need to check here,
* because POPF has no interrupt shadow.
*/
if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF))
kvm_make_request(KVM_REQ_EVENT, vcpu);
} else
vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
return r;
}
int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type)
{
return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
}
EXPORT_SYMBOL_GPL(kvm_emulate_instruction);
int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
void *insn, int insn_len)
{
return x86_emulate_instruction(vcpu, 0, 0, insn, insn_len);
}
EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer);
static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
unsigned short port)
{
unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
size, port, &val, 1);
/* do not return to emulator after return from userspace */
vcpu->arch.pio.count = 0;
return ret;
}
static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
{
unsigned long val;
/* We should only ever be called with arch.pio.count equal to 1 */
BUG_ON(vcpu->arch.pio.count != 1);
/* For size less than 4 we merge, else we zero extend */
val = (vcpu->arch.pio.size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX)
: 0;
/*
* Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform
* the copy and tracing
*/
emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size,
vcpu->arch.pio.port, &val, 1);
kvm_register_write(vcpu, VCPU_REGS_RAX, val);
return 1;
}
static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
unsigned short port)
{
unsigned long val;
int ret;
/* For size less than 4 we merge, else we zero extend */
val = (size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX) : 0;
ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port,
&val, 1);
if (ret) {
kvm_register_write(vcpu, VCPU_REGS_RAX, val);
return ret;
}
vcpu->arch.complete_userspace_io = complete_fast_pio_in;
return 0;
}
int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in)
{
int ret = kvm_skip_emulated_instruction(vcpu);
/*
* TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered
* KVM_EXIT_DEBUG here.
*/
if (in)
return kvm_fast_pio_in(vcpu, size, port) && ret;
else
return kvm_fast_pio_out(vcpu, size, port) && ret;
}
EXPORT_SYMBOL_GPL(kvm_fast_pio);
static int kvmclock_cpu_down_prep(unsigned int cpu)
{
__this_cpu_write(cpu_tsc_khz, 0);
return 0;
}
static void tsc_khz_changed(void *data)
{
struct cpufreq_freqs *freq = data;
unsigned long khz = 0;
if (data)
khz = freq->new;
else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
khz = cpufreq_quick_get(raw_smp_processor_id());
if (!khz)
khz = tsc_khz;
__this_cpu_write(cpu_tsc_khz, khz);
}
#ifdef CONFIG_X86_64
static void kvm_hyperv_tsc_notifier(void)
{
struct kvm *kvm;
struct kvm_vcpu *vcpu;
int cpu;
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
kvm_make_mclock_inprogress_request(kvm);
hyperv_stop_tsc_emulation();
/* TSC frequency always matches when on Hyper-V */
for_each_present_cpu(cpu)
per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
kvm_max_guest_tsc_khz = tsc_khz;
list_for_each_entry(kvm, &vm_list, vm_list) {
struct kvm_arch *ka = &kvm->arch;
spin_lock(&ka->pvclock_gtod_sync_lock);
pvclock_update_vm_gtod_copy(kvm);
kvm_for_each_vcpu(cpu, vcpu, kvm)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
kvm_for_each_vcpu(cpu, vcpu, kvm)
kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
spin_unlock(&ka->pvclock_gtod_sync_lock);
}
spin_unlock(&kvm_lock);
}
#endif
static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
struct cpufreq_freqs *freq = data;
struct kvm *kvm;
struct kvm_vcpu *vcpu;
int i, send_ipi = 0;
/*
* We allow guests to temporarily run on slowing clocks,
* provided we notify them after, or to run on accelerating
* clocks, provided we notify them before. Thus time never
* goes backwards.
*
* However, we have a problem. We can't atomically update
* the frequency of a given CPU from this function; it is
* merely a notifier, which can be called from any CPU.
* Changing the TSC frequency at arbitrary points in time
* requires a recomputation of local variables related to
* the TSC for each VCPU. We must flag these local variables
* to be updated and be sure the update takes place with the
* new frequency before any guests proceed.
*
* Unfortunately, the combination of hotplug CPU and frequency
* change creates an intractable locking scenario; the order
* of when these callouts happen is undefined with respect to
* CPU hotplug, and they can race with each other. As such,
* merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
* undefined; you can actually have a CPU frequency change take
* place in between the computation of X and the setting of the
* variable. To protect against this problem, all updates of
* the per_cpu tsc_khz variable are done in an interrupt
* protected IPI, and all callers wishing to update the value
* must wait for a synchronous IPI to complete (which is trivial
* if the caller is on the CPU already). This establishes the
* necessary total order on variable updates.
*
* Note that because a guest time update may take place
* anytime after the setting of the VCPU's request bit, the
* correct TSC value must be set before the request. However,
* to ensure the update actually makes it to any guest which
* starts running in hardware virtualization between the set
* and the acquisition of the spinlock, we must also ping the
* CPU after setting the request bit.
*
*/
if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
return 0;
if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
return 0;
smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
kvm_for_each_vcpu(i, vcpu, kvm) {
if (vcpu->cpu != freq->cpu)
continue;
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
if (vcpu->cpu != smp_processor_id())
send_ipi = 1;
}
}
spin_unlock(&kvm_lock);
if (freq->old < freq->new && send_ipi) {
/*
* We upscale the frequency. Must make the guest
* doesn't see old kvmclock values while running with
* the new frequency, otherwise we risk the guest sees
* time go backwards.
*
* In case we update the frequency for another cpu
* (which might be in guest context) send an interrupt
* to kick the cpu out of guest context. Next time
* guest context is entered kvmclock will be updated,
* so the guest will not see stale values.
*/
smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
}
return 0;
}
static struct notifier_block kvmclock_cpufreq_notifier_block = {
.notifier_call = kvmclock_cpufreq_notifier
};
static int kvmclock_cpu_online(unsigned int cpu)
{
tsc_khz_changed(NULL);
return 0;
}
static void kvm_timer_init(void)
{
max_tsc_khz = tsc_khz;
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
#ifdef CONFIG_CPU_FREQ
struct cpufreq_policy policy;
int cpu;
memset(&policy, 0, sizeof(policy));
cpu = get_cpu();
cpufreq_get_policy(&policy, cpu);
if (policy.cpuinfo.max_freq)
max_tsc_khz = policy.cpuinfo.max_freq;
put_cpu();
#endif
cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
}
pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online",
kvmclock_cpu_online, kvmclock_cpu_down_prep);
}
DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
EXPORT_PER_CPU_SYMBOL_GPL(current_vcpu);
int kvm_is_in_guest(void)
{
return __this_cpu_read(current_vcpu) != NULL; /*covered*/
}
static int kvm_is_user_mode(void)
{
int user_mode = 3;
if (__this_cpu_read(current_vcpu))
user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu));
return user_mode != 0;
}
static unsigned long kvm_get_guest_ip(void)
{
unsigned long ip = 0;
if (__this_cpu_read(current_vcpu))
ip = kvm_rip_read(__this_cpu_read(current_vcpu));
return ip;
}
static struct perf_guest_info_callbacks kvm_guest_cbs = {
.is_in_guest = kvm_is_in_guest,
.is_user_mode = kvm_is_user_mode,
.get_guest_ip = kvm_get_guest_ip,
};
static void kvm_set_mmio_spte_mask(void)
{
u64 mask;
int maxphyaddr = boot_cpu_data.x86_phys_bits;
/*
* Set the reserved bits and the present bit of an paging-structure
* entry to generate page fault with PFER.RSV = 1.
*/
/*
* Mask the uppermost physical address bit, which would be reserved as
* long as the supported physical address width is less than 52.
*/
mask = 1ull << 51;
/* Set the present bit. */
mask |= 1ull;
/*
* If reserved bit is not supported, clear the present bit to disable
* mmio page fault.
*/
if (IS_ENABLED(CONFIG_X86_64) && maxphyaddr == 52)
mask &= ~1ull;
kvm_mmu_set_mmio_spte_mask(mask, mask);
}
#ifdef CONFIG_X86_64
static void pvclock_gtod_update_fn(struct work_struct *work)
{
struct kvm *kvm;
struct kvm_vcpu *vcpu;
int i;
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
atomic_set(&kvm_guest_has_master_clock, 0);
spin_unlock(&kvm_lock);
}
static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
/*
* Notification about pvclock gtod data update.
*/
static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
void *priv)
{
struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
struct timekeeper *tk = priv;
update_pvclock_gtod(tk);
/* disable master clock if host does not trust, or does not
* use, TSC based clocksource.
*/
if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
atomic_read(&kvm_guest_has_master_clock) != 0)
queue_work(system_long_wq, &pvclock_gtod_work);
return 0;
}
static struct notifier_block pvclock_gtod_notifier = {
.notifier_call = pvclock_gtod_notify,
};
#endif
int kvm_arch_init(void *opaque)
{
int r;
struct kvm_x86_ops *ops = opaque;
if (kvm_x86_ops) {
printk(KERN_ERR "kvm: already loaded the other module\n");
r = -EEXIST;
goto out;
}
if (!ops->cpu_has_kvm_support()) {
printk(KERN_ERR "kvm: no hardware support\n");
r = -EOPNOTSUPP;
goto out;
}
if (ops->disabled_by_bios()) {
printk(KERN_ERR "kvm: disabled by bios\n");
r = -EOPNOTSUPP;
goto out;
}
/*
* KVM explicitly assumes that the guest has an FPU and
* FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the
* vCPU's FPU state as a fxregs_state struct.
*/
if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) {
printk(KERN_ERR "kvm: inadequate fpu\n");
r = -EOPNOTSUPP;
goto out;
}
r = -ENOMEM;
x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu),
__alignof__(struct fpu), SLAB_ACCOUNT,
NULL);
if (!x86_fpu_cache) {
printk(KERN_ERR "kvm: failed to allocate cache for x86 fpu\n");
goto out;
}
shared_msrs = alloc_percpu(struct kvm_shared_msrs);
if (!shared_msrs) {
printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
goto out_free_x86_fpu_cache;
}
r = kvm_mmu_module_init();
if (r)
goto out_free_percpu;
kvm_set_mmio_spte_mask();
kvm_x86_ops = ops;
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
PT_DIRTY_MASK, PT64_NX_MASK, 0,
PT_PRESENT_MASK, 0, sme_me_mask);
kvm_timer_init();
perf_register_guest_info_callbacks(&kvm_guest_cbs);
if (boot_cpu_has(X86_FEATURE_XSAVE))
host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
kvm_lapic_init();
#ifdef CONFIG_X86_64
pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
set_hv_tscchange_cb(kvm_hyperv_tsc_notifier);
#endif
return 0;
out_free_percpu:
free_percpu(shared_msrs);
out_free_x86_fpu_cache:
kmem_cache_destroy(x86_fpu_cache);
out:
return r;
}
void kvm_arch_exit(void)
{
#ifdef CONFIG_X86_64
if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
clear_hv_tscchange_cb();
#endif
kvm_lapic_exit();
perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
#ifdef CONFIG_X86_64
pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
#endif
kvm_x86_ops = NULL;
kvm_mmu_module_exit();
free_percpu(shared_msrs);
kmem_cache_destroy(x86_fpu_cache);
}
int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
{
++vcpu->stat.halt_exits;
if (lapic_in_kernel(vcpu)) {
vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
return 1;
} else {
vcpu->run->exit_reason = KVM_EXIT_HLT;
return 0;
}
}
EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
int kvm_emulate_halt(struct kvm_vcpu *vcpu)
{
int ret = kvm_skip_emulated_instruction(vcpu);
/*
* TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
* KVM_EXIT_DEBUG here.
*/
return kvm_vcpu_halt(vcpu) && ret;
}
EXPORT_SYMBOL_GPL(kvm_emulate_halt);
#ifdef CONFIG_X86_64
static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
unsigned long clock_type)
{
struct kvm_clock_pairing clock_pairing;
struct timespec64 ts;
u64 cycle;
int ret;
if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK)
return -KVM_EOPNOTSUPP;
if (kvm_get_walltime_and_clockread(&ts, &cycle) == false)
return -KVM_EOPNOTSUPP;
clock_pairing.sec = ts.tv_sec;
clock_pairing.nsec = ts.tv_nsec;
clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle);
clock_pairing.flags = 0;
memset(&clock_pairing.pad, 0, sizeof(clock_pairing.pad));
ret = 0;
if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing,
sizeof(struct kvm_clock_pairing)))
ret = -KVM_EFAULT;
return ret;
}
#endif
/*
* kvm_pv_kick_cpu_op: Kick a vcpu.
*
* @apicid - apicid of vcpu to be kicked.
*/
static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
{
struct kvm_lapic_irq lapic_irq;
lapic_irq.shorthand = 0;
lapic_irq.dest_mode = 0;
lapic_irq.level = 0;
lapic_irq.dest_id = apicid;
lapic_irq.msi_redir_hint = false;
lapic_irq.delivery_mode = APIC_DM_REMRD;
kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
}
void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
{
vcpu->arch.apicv_active = false;
kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
}
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
{
unsigned long nr, a0, a1, a2, a3, ret;
int op_64_bit;
if (kvm_hv_hypercall_enabled(vcpu->kvm))
return kvm_hv_hypercall(vcpu);
nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
trace_kvm_hypercall(nr, a0, a1, a2, a3);
op_64_bit = is_64_bit_mode(vcpu);
if (!op_64_bit) {
nr &= 0xFFFFFFFF;
a0 &= 0xFFFFFFFF;
a1 &= 0xFFFFFFFF;
a2 &= 0xFFFFFFFF;
a3 &= 0xFFFFFFFF;
}
if (kvm_x86_ops->get_cpl(vcpu) != 0) {
ret = -KVM_EPERM;
goto out;
}
switch (nr) {
case KVM_HC_VAPIC_POLL_IRQ:
ret = 0;
break;
case KVM_HC_KICK_CPU:
kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
ret = 0;
break;
#ifdef CONFIG_X86_64
case KVM_HC_CLOCK_PAIRING:
ret = kvm_pv_clock_pairing(vcpu, a0, a1);
break;
#endif
case KVM_HC_SEND_IPI:
ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
break;
default:
ret = -KVM_ENOSYS;
break;
}
out:
if (!op_64_bit)
ret = (u32)ret;
kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
++vcpu->stat.hypercalls;
return kvm_skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
char instruction[3];
unsigned long rip = kvm_rip_read(vcpu);
kvm_x86_ops->patch_hypercall(vcpu, instruction);
return emulator_write_emulated(ctxt, rip, instruction, 3,
&ctxt->exception);
}
static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
{
return vcpu->run->request_interrupt_window &&
likely(!pic_in_kernel(vcpu->kvm));
}
static void post_kvm_run_save(struct kvm_vcpu *vcpu)
{
struct kvm_run *kvm_run = vcpu->run;
kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
kvm_run->ready_for_interrupt_injection =
pic_in_kernel(vcpu->kvm) ||
kvm_vcpu_ready_for_interrupt_injection(vcpu);
}
static void update_cr8_intercept(struct kvm_vcpu *vcpu)
{
int max_irr, tpr;
if (!kvm_x86_ops->update_cr8_intercept)
return;
if (!lapic_in_kernel(vcpu))
return;
if (vcpu->arch.apicv_active)
return;
if (!vcpu->arch.apic->vapic_addr)
max_irr = kvm_lapic_find_highest_irr(vcpu);
else
max_irr = -1;
if (max_irr != -1)
max_irr >>= 4;
tpr = kvm_lapic_get_cr8(vcpu);
kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
}
static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
{
int r;
/* try to reinject previous events if any */
if (vcpu->arch.exception.injected)
kvm_x86_ops->queue_exception(vcpu);
/*
* Do not inject an NMI or interrupt if there is a pending
* exception. Exceptions and interrupts are recognized at
* instruction boundaries, i.e. the start of an instruction.
* Trap-like exceptions, e.g. #DB, have higher priority than
* NMIs and interrupts, i.e. traps are recognized before an
* NMI/interrupt that's pending on the same instruction.
* Fault-like exceptions, e.g. #GP and #PF, are the lowest
* priority, but are only generated (pended) during instruction
* execution, i.e. a pending fault-like exception means the
* fault occurred on the *previous* instruction and must be
* serviced prior to recognizing any new events in order to
* fully complete the previous instruction.
*/
else if (!vcpu->arch.exception.pending) {
if (vcpu->arch.nmi_injected)
kvm_x86_ops->set_nmi(vcpu);
else if (vcpu->arch.interrupt.injected)
kvm_x86_ops->set_irq(vcpu);
}
/*
* Call check_nested_events() even if we reinjected a previous event
* in order for caller to determine if it should require immediate-exit
* from L2 to L1 due to pending L1 events which require exit
* from L2 to L1.
*/
if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
if (r != 0)
return r;
}
/* try to inject new event if pending */
if (vcpu->arch.exception.pending) {
trace_kvm_inj_exception(vcpu->arch.exception.nr,
vcpu->arch.exception.has_error_code,
vcpu->arch.exception.error_code);
WARN_ON_ONCE(vcpu->arch.exception.injected);
vcpu->arch.exception.pending = false;
vcpu->arch.exception.injected = true;
if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
__kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
X86_EFLAGS_RF);
if (vcpu->arch.exception.nr == DB_VECTOR) {
/*
* This code assumes that nSVM doesn't use
* check_nested_events(). If it does, the
* DR6/DR7 changes should happen before L1
* gets a #VMEXIT for an intercepted #DB in
* L2. (Under VMX, on the other hand, the
* DR6/DR7 changes should not happen in the
* event of a VM-exit to L1 for an intercepted
* #DB in L2.)
*/
kvm_deliver_exception_payload(vcpu);
if (vcpu->arch.dr7 & DR7_GD) {
vcpu->arch.dr7 &= ~DR7_GD;
kvm_update_dr7(vcpu);
}
}
kvm_x86_ops->queue_exception(vcpu);
}
/* Don't consider new event if we re-injected an event */
if (kvm_event_needs_reinjection(vcpu))
return 0;
if (vcpu->arch.smi_pending && !is_smm(vcpu) &&
kvm_x86_ops->smi_allowed(vcpu)) {
vcpu->arch.smi_pending = false;
++vcpu->arch.smi_count;
enter_smm(vcpu);
} else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
--vcpu->arch.nmi_pending;
vcpu->arch.nmi_injected = true;
kvm_x86_ops->set_nmi(vcpu);
} else if (kvm_cpu_has_injectable_intr(vcpu)) {
/*
* Because interrupts can be injected asynchronously, we are
* calling check_nested_events again here to avoid a race condition.
* See https://lkml.org/lkml/2014/7/2/60 for discussion about this
* proposal and current concerns. Perhaps we should be setting
* KVM_REQ_EVENT only on certain events and not unconditionally?
*/
if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
if (r != 0)
return r;
}
if (kvm_x86_ops->interrupt_allowed(vcpu)) {
kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
false);
kvm_x86_ops->set_irq(vcpu);
}
}
return 0;
}
static void process_nmi(struct kvm_vcpu *vcpu)
{
unsigned limit = 2;
/*
* x86 is limited to one NMI running, and one NMI pending after it.
* If an NMI is already in progress, limit further NMIs to just one.
* Otherwise, allow two (and we'll inject the first one immediately).
*/
if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
limit = 1;
vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
{
u32 flags = 0;
flags |= seg->g << 23;
flags |= seg->db << 22;
flags |= seg->l << 21;
flags |= seg->avl << 20;
flags |= seg->present << 15;
flags |= seg->dpl << 13;
flags |= seg->s << 12;
flags |= seg->type << 8;
return flags;
}
static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
{
struct kvm_segment seg;
int offset;
kvm_get_segment(vcpu, &seg, n);
put_smstate(u32, buf, 0x7fa8 + n * 4, seg.selector);
if (n < 3)
offset = 0x7f84 + n * 12;
else
offset = 0x7f2c + (n - 3) * 12;
put_smstate(u32, buf, offset + 8, seg.base);
put_smstate(u32, buf, offset + 4, seg.limit);
put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg));
}
#ifdef CONFIG_X86_64
static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
{
struct kvm_segment seg;
int offset;
u16 flags;
kvm_get_segment(vcpu, &seg, n);
offset = 0x7e00 + n * 16;
flags = enter_smm_get_segment_flags(&seg) >> 8;
put_smstate(u16, buf, offset, seg.selector);
put_smstate(u16, buf, offset + 2, flags);
put_smstate(u32, buf, offset + 4, seg.limit);
put_smstate(u64, buf, offset + 8, seg.base);
}
#endif
static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
{
struct desc_ptr dt;
struct kvm_segment seg;
unsigned long val;
int i;
put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu));
put_smstate(u32, buf, 0x7ff8, kvm_read_cr3(vcpu));
put_smstate(u32, buf, 0x7ff4, kvm_get_rflags(vcpu));
put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu));
for (i = 0; i < 8; i++)
put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read(vcpu, i));
kvm_get_dr(vcpu, 6, &val);
put_smstate(u32, buf, 0x7fcc, (u32)val);
kvm_get_dr(vcpu, 7, &val);
put_smstate(u32, buf, 0x7fc8, (u32)val);
kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
put_smstate(u32, buf, 0x7fc4, seg.selector);
put_smstate(u32, buf, 0x7f64, seg.base);
put_smstate(u32, buf, 0x7f60, seg.limit);
put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg));
kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
put_smstate(u32, buf, 0x7fc0, seg.selector);
put_smstate(u32, buf, 0x7f80, seg.base);
put_smstate(u32, buf, 0x7f7c, seg.limit);
put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
kvm_x86_ops->get_gdt(vcpu, &dt);
put_smstate(u32, buf, 0x7f74, dt.address);
put_smstate(u32, buf, 0x7f70, dt.size);
kvm_x86_ops->get_idt(vcpu, &dt);
put_smstate(u32, buf, 0x7f58, dt.address);
put_smstate(u32, buf, 0x7f54, dt.size);
for (i = 0; i < 6; i++)
enter_smm_save_seg_32(vcpu, buf, i);
put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu));
/* revision id */
put_smstate(u32, buf, 0x7efc, 0x00020000);
put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
}
static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
{
#ifdef CONFIG_X86_64
struct desc_ptr dt;
struct kvm_segment seg;
unsigned long val;
int i;
for (i = 0; i < 16; i++)
put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read(vcpu, i));
put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu));
put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu));
kvm_get_dr(vcpu, 6, &val);
put_smstate(u64, buf, 0x7f68, val);
kvm_get_dr(vcpu, 7, &val);
put_smstate(u64, buf, 0x7f60, val);
put_smstate(u64, buf, 0x7f58, kvm_read_cr0(vcpu));
put_smstate(u64, buf, 0x7f50, kvm_read_cr3(vcpu));
put_smstate(u64, buf, 0x7f48, kvm_read_cr4(vcpu));
put_smstate(u32, buf, 0x7f00, vcpu->arch.smbase);
/* revision id */
put_smstate(u32, buf, 0x7efc, 0x00020064);
put_smstate(u64, buf, 0x7ed0, vcpu->arch.efer);
kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
put_smstate(u16, buf, 0x7e90, seg.selector);
put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8);
put_smstate(u32, buf, 0x7e94, seg.limit);
put_smstate(u64, buf, 0x7e98, seg.base);
kvm_x86_ops->get_idt(vcpu, &dt);
put_smstate(u32, buf, 0x7e84, dt.size);
put_smstate(u64, buf, 0x7e88, dt.address);
kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
put_smstate(u16, buf, 0x7e70, seg.selector);
put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8);
put_smstate(u32, buf, 0x7e74, seg.limit);
put_smstate(u64, buf, 0x7e78, seg.base);
kvm_x86_ops->get_gdt(vcpu, &dt);
put_smstate(u32, buf, 0x7e64, dt.size);
put_smstate(u64, buf, 0x7e68, dt.address);
for (i = 0; i < 6; i++)
enter_smm_save_seg_64(vcpu, buf, i);
#else
WARN_ON_ONCE(1);
#endif
}
static void enter_smm(struct kvm_vcpu *vcpu)
{
struct kvm_segment cs, ds;
struct desc_ptr dt;
char buf[512];
u32 cr0;
trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
memset(buf, 0, 512);
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
enter_smm_save_state_64(vcpu, buf);
else
enter_smm_save_state_32(vcpu, buf);
/*
* Give pre_enter_smm() a chance to make ISA-specific changes to the
* vCPU state (e.g. leave guest mode) after we've saved the state into
* the SMM state-save area.
*/
kvm_x86_ops->pre_enter_smm(vcpu, buf);
vcpu->arch.hflags |= HF_SMM_MASK;
kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
if (kvm_x86_ops->get_nmi_mask(vcpu))
vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
else
kvm_x86_ops->set_nmi_mask(vcpu, true);
kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
kvm_rip_write(vcpu, 0x8000);
cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
kvm_x86_ops->set_cr0(vcpu, cr0);
vcpu->arch.cr0 = cr0;
kvm_x86_ops->set_cr4(vcpu, 0);
/* Undocumented: IDT limit is set to zero on entry to SMM. */
dt.address = dt.size = 0;
kvm_x86_ops->set_idt(vcpu, &dt);
__kvm_set_dr(vcpu, 7, DR7_FIXED_1);
cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
cs.base = vcpu->arch.smbase;
ds.selector = 0;
ds.base = 0;
cs.limit = ds.limit = 0xffffffff;
cs.type = ds.type = 0x3;
cs.dpl = ds.dpl = 0;
cs.db = ds.db = 0;
cs.s = ds.s = 1;
cs.l = ds.l = 0;
cs.g = ds.g = 1;
cs.avl = ds.avl = 0;
cs.present = ds.present = 1;
cs.unusable = ds.unusable = 0;
cs.padding = ds.padding = 0;
kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
kvm_set_segment(vcpu, &ds, VCPU_SREG_DS);
kvm_set_segment(vcpu, &ds, VCPU_SREG_ES);
kvm_set_segment(vcpu, &ds, VCPU_SREG_FS);
kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
kvm_x86_ops->set_efer(vcpu, 0);
kvm_update_cpuid(vcpu);
kvm_mmu_reset_context(vcpu);
}
static void process_smi(struct kvm_vcpu *vcpu)
{
vcpu->arch.smi_pending = true;
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
void kvm_make_scan_ioapic_request(struct kvm *kvm)
{
kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
}
static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
{
if (!kvm_apic_present(vcpu))
return;
bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
if (irqchip_split(vcpu->kvm))
kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
else {
if (vcpu->arch.apicv_active)
kvm_x86_ops->sync_pir_to_irr(vcpu);
if (ioapic_in_kernel(vcpu->kvm))
kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
}
if (is_guest_mode(vcpu))
vcpu->arch.load_eoi_exitmap_pending = true;
else
kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu);
}
static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
{
u64 eoi_exit_bitmap[4];
if (!kvm_apic_hw_enabled(vcpu->arch.apic))
return;
bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors,
vcpu_to_synic(vcpu)->vec_bitmap, 256);
kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
}
int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
unsigned long start, unsigned long end,
bool blockable)
{
unsigned long apic_address;
/*
* The physical address of apic access page is stored in the VMCS.
* Update it when it becomes invalid.
*/
apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
if (start <= apic_address && apic_address < end)
kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
return 0;
}
void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
{
struct page *page = NULL;
if (!lapic_in_kernel(vcpu))
return;
if (!kvm_x86_ops->set_apic_access_page_addr)
return;
page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
if (is_error_page(page))
return;
kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page));
/*
* Do not pin apic access page in memory, the MMU notifier
* will call us again if it is migrated or swapped out.
*/
put_page(page);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page);
void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
{
smp_send_reschedule(vcpu->cpu);
}
EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
/*
* Returns 1 to let vcpu_run() continue the guest execution loop without
* exiting to the userspace. Otherwise, the value will be returned to the
* userspace.
*/
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
int r;
bool req_int_win =
dm_request_for_irq_injection(vcpu) &&
kvm_cpu_accept_dm_intr(vcpu);
bool req_immediate_exit = false;
if (kvm_request_pending(vcpu)) {
if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu))
kvm_x86_ops->get_vmcs12_pages(vcpu);
if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
kvm_mmu_unload(vcpu);
if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
__kvm_migrate_timers(vcpu);
if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
kvm_gen_update_masterclock(vcpu->kvm);
if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
kvm_gen_kvmclock_update(vcpu);
if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
r = kvm_guest_time_update(vcpu);
if (unlikely(r))
goto out;
}
if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
kvm_mmu_sync_roots(vcpu);
if (kvm_check_request(KVM_REQ_LOAD_CR3, vcpu))
kvm_mmu_load_cr3(vcpu);
if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
kvm_vcpu_flush_tlb(vcpu, true);
if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
r = 0;
goto out;
}
if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
vcpu->run-&g