[PATCH V3 01/10] acpi: apei: read ack upon ghes record consumption
Baicar, Tyler
tbaicar at codeaurora.org
Thu Oct 13 06:49:00 PDT 2016
Hello Punit,
Thank you for the feedback! Responses below
On 10/12/2016 9:39 AM, Punit Agrawal wrote:
> Hi Tyler,
>
> A few comments below.
>
> Tyler Baicar <tbaicar at codeaurora.org> writes:
>
>> A RAS (Reliability, Availability, Serviceability) controller
>> may be a separate processor running in parallel with OS
>> execution, and may generate error records for consumption by
>> the OS. If the RAS controller produces multiple error records,
>> then they may be overwritten before the OS has consumed them.
>>
>> The Generic Hardware Error Source (GHES) v2 structure
>> introduces the capability for the OS to acknowledge the
>> consumption of the error record generated by the RAS
>> controller. A RAS controller supporting GHESv2 shall wait for
>> the acknowledgment before writing a new error record, thus
>> eliminating the race condition.
>>
>> Signed-off-by: Jonathan (Zhixiong) Zhang <zjzhang at codeaurora.org>
>> Signed-off-by: Richard Ruigrok <rruigrok at codeaurora.org>
>> Signed-off-by: Tyler Baicar <tbaicar at codeaurora.org>
>> Signed-off-by: Naveen Kaje <nkaje at codeaurora.org>
>> ---
>> drivers/acpi/apei/ghes.c | 41 +++++++++++++++++++++++++++++++++++++++++
>> drivers/acpi/apei/hest.c | 7 +++++--
>> include/acpi/ghes.h | 1 +
>> 3 files changed, 47 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
>> index 60746ef..3021f0e 100644
>> --- a/drivers/acpi/apei/ghes.c
>> +++ b/drivers/acpi/apei/ghes.c
>> @@ -45,6 +45,7 @@
>> #include <linux/aer.h>
>> #include <linux/nmi.h>
>>
>> +#include <acpi/actbl1.h>
>> #include <acpi/ghes.h>
>> #include <acpi/apei.h>
>> #include <asm/tlbflush.h>
>> @@ -244,10 +245,22 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic)
>> struct ghes *ghes;
>> unsigned int error_block_length;
>> int rc;
>> + struct acpi_hest_header *hest_hdr;
>>
>> ghes = kzalloc(sizeof(*ghes), GFP_KERNEL);
>> if (!ghes)
>> return ERR_PTR(-ENOMEM);
>> +
>> + hest_hdr = (struct acpi_hest_header *)generic;
>> + if (hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR_V2) {
>> + ghes->generic_v2 = (struct acpi_hest_generic_v2 *)generic;
>> + rc = apei_map_generic_address(
>> + &ghes->generic_v2->read_ack_register);
>> + if (rc)
>> + goto err_unmap;
>> + } else
>> + ghes->generic_v2 = NULL;
> Since you kzalloc ghes, shouldn't ghes->generic_v2 be NULL already?
Yes, the documentation says kzalloc returns memory set to zero, so I
will remove this else statement.
>> +
>> ghes->generic = generic;
>> rc = apei_map_generic_address(&generic->error_status_address);
>> if (rc)
>> @@ -270,6 +283,9 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic)
>>
>> err_unmap:
>> apei_unmap_generic_address(&generic->error_status_address);
>> + if (ghes->generic_v2)
>> + apei_unmap_generic_address(
>> + &ghes->generic_v2->read_ack_register);
>> err_free:
>> kfree(ghes);
>> return ERR_PTR(rc);
>> @@ -279,6 +295,9 @@ static void ghes_fini(struct ghes *ghes)
>> {
>> kfree(ghes->estatus);
>> apei_unmap_generic_address(&ghes->generic->error_status_address);
>> + if (ghes->generic_v2)
>> + apei_unmap_generic_address(
>> + &ghes->generic_v2->read_ack_register);
>> }
>>
>> static inline int ghes_severity(int severity)
>> @@ -648,6 +667,22 @@ static void ghes_estatus_cache_add(
>> rcu_read_unlock();
>> }
>>
>> +static int ghes_do_read_ack(struct acpi_hest_generic_v2 *generic_v2)
>> +{
>> + int rc;
>> + u64 val = 0;
>> +
>> + rc = apei_read(&val, &generic_v2->read_ack_register);
>> + if (rc)
>> + return rc;
>> + val &= generic_v2->read_ack_preserve <<
>> + generic_v2->read_ack_register.bit_offset;
>> + val |= generic_v2->read_ack_write;
> Reading the spec, it is not clear whether you need the left shift
> above.
>
> Having said that, if you do it for read_ack_preserve, do you also need
> to left shift read_ack_write by read_ack_register.bit_offset?
Good catch, it looks like the read_ack_write should also get this shift.
I'm using a shift of 0 so I didn't catch this in testing :)
>> + rc = apei_write(val, &generic_v2->read_ack_register);
>> +
>> + return rc;
>> +}
>> +
>> static int ghes_proc(struct ghes *ghes)
>> {
>> int rc;
>> @@ -660,6 +695,12 @@ static int ghes_proc(struct ghes *ghes)
>> ghes_estatus_cache_add(ghes->generic, ghes->estatus);
>> }
>> ghes_do_proc(ghes, ghes->estatus);
>> +
>> + if (ghes->generic_v2) {
>> + rc = ghes_do_read_ack(ghes->generic_v2);
>> + if (rc)
>> + return rc;
>> + }
>> out:
>> ghes_clear_estatus(ghes);
>> return 0;
>> diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c
>> index 792a0d9..ef725a9 100644
>> --- a/drivers/acpi/apei/hest.c
>> +++ b/drivers/acpi/apei/hest.c
>> @@ -52,6 +52,7 @@ static const int hest_esrc_len_tab[ACPI_HEST_TYPE_RESERVED] = {
>> [ACPI_HEST_TYPE_AER_ENDPOINT] = sizeof(struct acpi_hest_aer),
>> [ACPI_HEST_TYPE_AER_BRIDGE] = sizeof(struct acpi_hest_aer_bridge),
>> [ACPI_HEST_TYPE_GENERIC_ERROR] = sizeof(struct acpi_hest_generic),
>> + [ACPI_HEST_TYPE_GENERIC_ERROR_V2] = sizeof(struct acpi_hest_generic_v2),
>> };
>>
>> static int hest_esrc_len(struct acpi_hest_header *hest_hdr)
>> @@ -146,7 +147,8 @@ static int __init hest_parse_ghes_count(struct acpi_hest_header *hest_hdr, void
>> {
>> int *count = data;
>>
>> - if (hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR)
>> + if (hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR ||
>> + hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR_V2)
>> (*count)++;
>> return 0;
>> }
>> @@ -157,7 +159,8 @@ static int __init hest_parse_ghes(struct acpi_hest_header *hest_hdr, void *data)
>> struct ghes_arr *ghes_arr = data;
>> int rc, i;
>>
>> - if (hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR)
>> + if (hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR &&
>> + hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR_V2)
>> return 0;
>>
>> if (!((struct acpi_hest_generic *)hest_hdr)->enabled)
>> diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
>> index 720446c..d0108b6 100644
>> --- a/include/acpi/ghes.h
>> +++ b/include/acpi/ghes.h
>> @@ -14,6 +14,7 @@
>>
>> struct ghes {
>> struct acpi_hest_generic *generic;
>> + struct acpi_hest_generic_v2 *generic_v2;
> You either have a GHES or a GHESv2 structure. Instead of duplication,
> could this be represented as a union?
>
> Thanks,
> Punit
I think that should be doable. I'll make these changes in the next version.
Thanks,
Tyler
>> struct acpi_hest_generic_status *estatus;
>> u64 buffer_paddr;
>> unsigned long flags;
--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.
More information about the linux-arm-kernel
mailing list