summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCorey Minyard <corey@minyard.net>2025-08-20 14:56:50 -0500
committerCorey Minyard <corey@minyard.net>2025-09-08 10:21:41 -0500
commitbc3a9d217755f65c137f145600f23bf1d6c31ea9 (patch)
treead1265f7d9728ba90feacc2fb20c1043eacb4451
parent3bc54ab3b9790ca92f197e9822e486665daa321c (diff)
ipmi:si: Gracefully handle if the BMC is non-functional
If the BMC is not functional, the driver goes into an error state and starts a 1 second timer. When the timer times out, it will attempt a simple message. If the BMC interacts correctly, the driver will start accepting messages again. If not, it remains in error state. If the driver goes into error state, all messages current and pending will return with an error. This should more gracefully handle when the BMC becomes non-operational, as opposed to trying each messages individually and failing them. Signed-off-by: Corey Minyard <corey@minyard.net>
-rw-r--r--drivers/char/ipmi/ipmi_si_intf.c29
1 files changed, 23 insertions, 6 deletions
diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c
index 88d62ef71b1b..1c65275906b4 100644
--- a/drivers/char/ipmi/ipmi_si_intf.c
+++ b/drivers/char/ipmi/ipmi_si_intf.c
@@ -53,6 +53,7 @@
#define SI_TIMEOUT_JIFFIES (SI_TIMEOUT_TIME_USEC/SI_USEC_PER_JIFFY)
#define SI_SHORT_TIMEOUT_USEC 250 /* .25ms when the SM request a
short timeout */
+#define SI_TIMEOUT_HOSED (HZ) /* 1 second when in hosed state. */
enum si_intf_state {
SI_NORMAL,
@@ -61,7 +62,8 @@ enum si_intf_state {
SI_CLEARING_FLAGS,
SI_GETTING_MESSAGES,
SI_CHECKING_ENABLES,
- SI_SETTING_ENABLES
+ SI_SETTING_ENABLES,
+ SI_HOSED
/* FIXME - add watchdog stuff. */
};
@@ -753,6 +755,8 @@ static void handle_transaction_done(struct smi_info *smi_info)
}
break;
}
+ case SI_HOSED: /* Shouldn't happen. */
+ break;
}
}
@@ -767,6 +771,10 @@ static enum si_sm_result smi_event_handler(struct smi_info *smi_info,
enum si_sm_result si_sm_result;
restart:
+ if (smi_info->si_state == SI_HOSED)
+ /* Just in case, hosed state is only left from the timeout. */
+ return SI_SM_HOSED;
+
/*
* There used to be a loop here that waited a little while
* (around 25us) before giving up. That turned out to be
@@ -790,18 +798,20 @@ restart:
/*
* Do the before return_hosed_msg, because that
- * releases the lock.
+ * releases the lock. We just disable operations for
+ * a while and retry in hosed state.
*/
- smi_info->si_state = SI_NORMAL;
+ smi_info->si_state = SI_HOSED;
if (smi_info->curr_msg != NULL) {
/*
* If we were handling a user message, format
* a response to send to the upper layer to
* tell it about the error.
*/
- return_hosed_msg(smi_info, IPMI_ERR_UNSPECIFIED);
+ return_hosed_msg(smi_info, IPMI_BUS_ERR);
}
- goto restart;
+ smi_mod_timer(smi_info, jiffies + SI_TIMEOUT_HOSED);
+ goto out;
}
/*
@@ -899,7 +909,7 @@ static void flush_messages(void *send_info)
* mode. This means we are single-threaded, no need for locks.
*/
result = smi_event_handler(smi_info, 0);
- while (result != SI_SM_IDLE) {
+ while (result != SI_SM_IDLE && result != SI_SM_HOSED) {
udelay(SI_SHORT_TIMEOUT_USEC);
result = smi_event_handler(smi_info, SI_SHORT_TIMEOUT_USEC);
}
@@ -912,6 +922,9 @@ static int sender(void *send_info, struct ipmi_smi_msg *msg)
debug_timestamp(smi_info, "Enqueue");
+ if (smi_info->si_state == SI_HOSED)
+ return IPMI_BUS_ERR;
+
if (smi_info->run_to_completion) {
/*
* If we are running to completion, start it. Upper
@@ -1092,6 +1105,10 @@ static void smi_timeout(struct timer_list *t)
spin_lock_irqsave(&(smi_info->si_lock), flags);
debug_timestamp(smi_info, "Timer");
+ if (smi_info->si_state == SI_HOSED)
+ /* Try something to see if the BMC is now operational. */
+ start_get_flags(smi_info);
+
jiffies_now = jiffies;
time_diff = (((long)jiffies_now - (long)smi_info->last_timeout_jiffies)
* SI_USEC_PER_JIFFY);