Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 38b9f90

Browse files
Moshe Shemesh authored and kuba-moo committed
net/mlx5: Handle sync reset request event
Once the driver gets sync_reset_request from firmware, it prepares for the coming reset and sends an acknowledgment. After getting this event, the driver expects a device reset: either it will trigger a PCI reset on the sync_reset_now event, or such a PCI reset will be triggered by another PF of the same device. So it moves to reset-requested mode, and if it gets a PCI reset triggered by the other PF, it detects the reset and reloads. Signed-off-by: Moshe Shemesh <[email protected]> Reviewed-by: Saeed Mahameed <[email protected]> Signed-off-by: Jakub Kicinski <[email protected]>
1 parent e7f4d0b commit 38b9f90

File tree

6 files changed

+220
-15
lines changed

6 files changed

+220
-15
lines changed

drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c

Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,20 @@
33

44
#include "fw_reset.h"
55

6+
/* Bit numbers used in the reset_flags word of struct mlx5_fw_reset. */
enum {
	/* Set while the driver is in reset-requested mode. */
	MLX5_FW_RESET_FLAGS_RESET_REQUESTED = 0,
};
9+
10+
struct mlx5_fw_reset {
11+
struct mlx5_core_dev *dev;
12+
struct mlx5_nb nb;
13+
struct workqueue_struct *wq;
14+
struct work_struct reset_request_work;
15+
struct work_struct reset_reload_work;
16+
unsigned long reset_flags;
17+
struct timer_list timer;
18+
};
19+
620
static int mlx5_reg_mfrl_set(struct mlx5_core_dev *dev, u8 reset_level,
721
u8 reset_type_sel, u8 sync_resp, bool sync_start)
822
{
@@ -49,3 +63,167 @@ int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev)
4963
{
5064
return mlx5_reg_mfrl_set(dev, MLX5_MFRL_REG_RESET_LEVEL0, 0, 0, false);
5165
}
66+
67+
static void mlx5_sync_reset_reload_work(struct work_struct *work)
68+
{
69+
struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset,
70+
reset_reload_work);
71+
struct mlx5_core_dev *dev = fw_reset->dev;
72+
73+
mlx5_enter_error_state(dev, true);
74+
mlx5_unload_one(dev, false);
75+
if (mlx5_health_wait_pci_up(dev)) {
76+
mlx5_core_err(dev, "reset reload flow aborted, PCI reads still not working\n");
77+
return;
78+
}
79+
mlx5_load_one(dev, false);
80+
}
81+
82+
static void mlx5_stop_sync_reset_poll(struct mlx5_core_dev *dev)
83+
{
84+
struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
85+
86+
del_timer(&fw_reset->timer);
87+
}
88+
89+
/*
 * Leave reset-requested mode: stop the sync reset poll timer and clear the
 * RESET_REQUESTED flag.
 *
 * @poll_health: when true, the regular health poll is started again.
 */
static void mlx5_sync_reset_clear_reset_requested(struct mlx5_core_dev *dev, bool poll_health)
{
	unsigned long *flags = &dev->priv.fw_reset->reset_flags;

	mlx5_stop_sync_reset_poll(dev);
	clear_bit(MLX5_FW_RESET_FLAGS_RESET_REQUESTED, flags);

	if (!poll_health)
		return;

	mlx5_start_health_poll(dev);
}
98+
99+
#define MLX5_RESET_POLL_INTERVAL (HZ / 10)
100+
static void poll_sync_reset(struct timer_list *t)
101+
{
102+
struct mlx5_fw_reset *fw_reset = from_timer(fw_reset, t, timer);
103+
struct mlx5_core_dev *dev = fw_reset->dev;
104+
u32 fatal_error;
105+
106+
if (!test_bit(MLX5_FW_RESET_FLAGS_RESET_REQUESTED, &fw_reset->reset_flags))
107+
return;
108+
109+
fatal_error = mlx5_health_check_fatal_sensors(dev);
110+
111+
if (fatal_error) {
112+
mlx5_core_warn(dev, "Got Device Reset\n");
113+
mlx5_sync_reset_clear_reset_requested(dev, false);
114+
queue_work(fw_reset->wq, &fw_reset->reset_reload_work);
115+
return;
116+
}
117+
118+
mod_timer(&fw_reset->timer, round_jiffies(jiffies + MLX5_RESET_POLL_INTERVAL));
119+
}
120+
121+
static void mlx5_start_sync_reset_poll(struct mlx5_core_dev *dev)
122+
{
123+
struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
124+
125+
timer_setup(&fw_reset->timer, poll_sync_reset, 0);
126+
fw_reset->timer.expires = round_jiffies(jiffies + MLX5_RESET_POLL_INTERVAL);
127+
add_timer(&fw_reset->timer);
128+
}
129+
130+
static int mlx5_fw_reset_set_reset_sync_ack(struct mlx5_core_dev *dev)
131+
{
132+
return mlx5_reg_mfrl_set(dev, MLX5_MFRL_REG_RESET_LEVEL3, 0, 1, false);
133+
}
134+
135+
static void mlx5_sync_reset_set_reset_requested(struct mlx5_core_dev *dev)
136+
{
137+
struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
138+
139+
mlx5_stop_health_poll(dev, true);
140+
set_bit(MLX5_FW_RESET_FLAGS_RESET_REQUESTED, &fw_reset->reset_flags);
141+
mlx5_start_sync_reset_poll(dev);
142+
}
143+
144+
static void mlx5_sync_reset_request_event(struct work_struct *work)
145+
{
146+
struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset,
147+
reset_request_work);
148+
struct mlx5_core_dev *dev = fw_reset->dev;
149+
int err;
150+
151+
mlx5_sync_reset_set_reset_requested(dev);
152+
err = mlx5_fw_reset_set_reset_sync_ack(dev);
153+
if (err)
154+
mlx5_core_warn(dev, "PCI Sync FW Update Reset Ack Failed. Error code: %d\n", err);
155+
else
156+
mlx5_core_warn(dev, "PCI Sync FW Update Reset Ack. Device reset is expected.\n");
157+
}
158+
159+
static void mlx5_sync_reset_events_handle(struct mlx5_fw_reset *fw_reset, struct mlx5_eqe *eqe)
160+
{
161+
struct mlx5_eqe_sync_fw_update *sync_fw_update_eqe;
162+
u8 sync_event_rst_type;
163+
164+
sync_fw_update_eqe = &eqe->data.sync_fw_update;
165+
sync_event_rst_type = sync_fw_update_eqe->sync_rst_state & SYNC_RST_STATE_MASK;
166+
switch (sync_event_rst_type) {
167+
case MLX5_SYNC_RST_STATE_RESET_REQUEST:
168+
queue_work(fw_reset->wq, &fw_reset->reset_request_work);
169+
break;
170+
}
171+
}
172+
173+
static int fw_reset_event_notifier(struct notifier_block *nb, unsigned long action, void *data)
174+
{
175+
struct mlx5_fw_reset *fw_reset = mlx5_nb_cof(nb, struct mlx5_fw_reset, nb);
176+
struct mlx5_eqe *eqe = data;
177+
178+
switch (eqe->sub_type) {
179+
case MLX5_GENERAL_SUBTYPE_PCI_SYNC_FOR_FW_UPDATE_EVENT:
180+
mlx5_sync_reset_events_handle(fw_reset, eqe);
181+
break;
182+
default:
183+
return NOTIFY_DONE;
184+
}
185+
186+
return NOTIFY_OK;
187+
}
188+
189+
void mlx5_fw_reset_events_start(struct mlx5_core_dev *dev)
190+
{
191+
struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
192+
193+
MLX5_NB_INIT(&fw_reset->nb, fw_reset_event_notifier, GENERAL_EVENT);
194+
mlx5_eq_notifier_register(dev, &fw_reset->nb);
195+
}
196+
197+
void mlx5_fw_reset_events_stop(struct mlx5_core_dev *dev)
198+
{
199+
mlx5_eq_notifier_unregister(dev, &dev->priv.fw_reset->nb);
200+
}
201+
202+
int mlx5_fw_reset_init(struct mlx5_core_dev *dev)
203+
{
204+
struct mlx5_fw_reset *fw_reset = kzalloc(sizeof(*fw_reset), GFP_KERNEL);
205+
206+
if (!fw_reset)
207+
return -ENOMEM;
208+
fw_reset->wq = create_singlethread_workqueue("mlx5_fw_reset_events");
209+
if (!fw_reset->wq) {
210+
kfree(fw_reset);
211+
return -ENOMEM;
212+
}
213+
214+
fw_reset->dev = dev;
215+
dev->priv.fw_reset = fw_reset;
216+
217+
INIT_WORK(&fw_reset->reset_request_work, mlx5_sync_reset_request_event);
218+
INIT_WORK(&fw_reset->reset_reload_work, mlx5_sync_reset_reload_work);
219+
220+
return 0;
221+
}
222+
223+
void mlx5_fw_reset_cleanup(struct mlx5_core_dev *dev)
224+
{
225+
struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
226+
227+
destroy_workqueue(fw_reset->wq);
228+
kfree(dev->priv.fw_reset);
229+
}

drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,9 @@ int mlx5_fw_reset_query(struct mlx5_core_dev *dev, u8 *reset_level, u8 *reset_ty
1010
int mlx5_fw_reset_set_reset_sync(struct mlx5_core_dev *dev, u8 reset_type_sel);
1111
int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev);
1212

13+
void mlx5_fw_reset_events_start(struct mlx5_core_dev *dev);
14+
void mlx5_fw_reset_events_stop(struct mlx5_core_dev *dev);
15+
int mlx5_fw_reset_init(struct mlx5_core_dev *dev);
16+
void mlx5_fw_reset_cleanup(struct mlx5_core_dev *dev);
17+
1318
#endif

drivers/net/ethernet/mellanox/mlx5/core/health.c

Lines changed: 20 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ static bool sensor_fw_synd_rfr(struct mlx5_core_dev *dev)
110110
return rfr && synd;
111111
}
112112

113-
static u32 check_fatal_sensors(struct mlx5_core_dev *dev)
113+
u32 mlx5_health_check_fatal_sensors(struct mlx5_core_dev *dev)
114114
{
115115
if (sensor_pci_not_working(dev))
116116
return MLX5_SENSOR_PCI_COMM_ERR;
@@ -173,7 +173,7 @@ static bool reset_fw_if_needed(struct mlx5_core_dev *dev)
173173
* Check again to avoid a redundant 2nd reset. If the fatal error was
174174
* PCI related a reset won't help.
175175
*/
176-
fatal_error = check_fatal_sensors(dev);
176+
fatal_error = mlx5_health_check_fatal_sensors(dev);
177177
if (fatal_error == MLX5_SENSOR_PCI_COMM_ERR ||
178178
fatal_error == MLX5_SENSOR_NIC_DISABLED ||
179179
fatal_error == MLX5_SENSOR_NIC_SW_RESET) {
@@ -195,7 +195,7 @@ void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
195195
bool err_detected = false;
196196

197197
/* Mark the device as fatal in order to abort FW commands */
198-
if ((check_fatal_sensors(dev) || force) &&
198+
if ((mlx5_health_check_fatal_sensors(dev) || force) &&
199199
dev->state == MLX5_DEVICE_STATE_UP) {
200200
dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
201201
err_detected = true;
@@ -208,7 +208,7 @@ void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
208208
goto unlock;
209209
}
210210

211-
if (check_fatal_sensors(dev) || force) { /* protected state setting */
211+
if (mlx5_health_check_fatal_sensors(dev) || force) { /* protected state setting */
212212
dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
213213
mlx5_cmd_flush(dev);
214214
}
@@ -231,7 +231,7 @@ void mlx5_error_sw_reset(struct mlx5_core_dev *dev)
231231

232232
mlx5_core_err(dev, "start\n");
233233

234-
if (check_fatal_sensors(dev) == MLX5_SENSOR_FW_SYND_RFR) {
234+
if (mlx5_health_check_fatal_sensors(dev) == MLX5_SENSOR_FW_SYND_RFR) {
235235
/* Get cr-dump and reset FW semaphore */
236236
lock = lock_sem_sw_reset(dev, true);
237237

@@ -308,26 +308,31 @@ static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
308308

309309
/* How much time to wait until health resetting the driver (in msecs) */
310310
#define MLX5_RECOVERY_WAIT_MSECS 60000
311-
static int mlx5_health_try_recover(struct mlx5_core_dev *dev)
311+
int mlx5_health_wait_pci_up(struct mlx5_core_dev *dev)
312312
{
313313
unsigned long end;
314314

315-
mlx5_core_warn(dev, "handling bad device here\n");
316-
mlx5_handle_bad_state(dev);
317315
end = jiffies + msecs_to_jiffies(MLX5_RECOVERY_WAIT_MSECS);
318316
while (sensor_pci_not_working(dev)) {
319-
if (time_after(jiffies, end)) {
320-
mlx5_core_err(dev,
321-
"health recovery flow aborted, PCI reads still not working\n");
322-
return -EIO;
323-
}
317+
if (time_after(jiffies, end))
318+
return -ETIMEDOUT;
324319
msleep(100);
325320
}
321+
return 0;
322+
}
326323

324+
static int mlx5_health_try_recover(struct mlx5_core_dev *dev)
325+
{
326+
mlx5_core_warn(dev, "handling bad device here\n");
327+
mlx5_handle_bad_state(dev);
328+
if (mlx5_health_wait_pci_up(dev)) {
329+
mlx5_core_err(dev, "health recovery flow aborted, PCI reads still not working\n");
330+
return -EIO;
331+
}
327332
mlx5_core_err(dev, "starting health recovery flow\n");
328333
mlx5_recover_device(dev);
329334
if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state) ||
330-
check_fatal_sensors(dev)) {
335+
mlx5_health_check_fatal_sensors(dev)) {
331336
mlx5_core_err(dev, "health recovery failed\n");
332337
return -EIO;
333338
}
@@ -696,7 +701,7 @@ static void poll_health(struct timer_list *t)
696701
if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
697702
goto out;
698703

699-
fatal_error = check_fatal_sensors(dev);
704+
fatal_error = mlx5_health_check_fatal_sensors(dev);
700705

701706
if (fatal_error && !health->fatal_error) {
702707
mlx5_core_err(dev, "Fatal error %u detected\n", fatal_error);

drivers/net/ethernet/mellanox/mlx5/core/main.c

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
#include "lib/mpfs.h"
5858
#include "eswitch.h"
5959
#include "devlink.h"
60+
#include "fw_reset.h"
6061
#include "lib/mlx5.h"
6162
#include "fpga/core.h"
6263
#include "fpga/ipsec.h"
@@ -835,6 +836,12 @@ static int mlx5_init_once(struct mlx5_core_dev *dev)
835836
goto err_eq_cleanup;
836837
}
837838

839+
err = mlx5_fw_reset_init(dev);
840+
if (err) {
841+
mlx5_core_err(dev, "failed to initialize fw reset events\n");
842+
goto err_events_cleanup;
843+
}
844+
838845
mlx5_cq_debugfs_init(dev);
839846

840847
mlx5_init_reserved_gids(dev);
@@ -896,6 +903,8 @@ static int mlx5_init_once(struct mlx5_core_dev *dev)
896903
mlx5_geneve_destroy(dev->geneve);
897904
mlx5_vxlan_destroy(dev->vxlan);
898905
mlx5_cq_debugfs_cleanup(dev);
906+
mlx5_fw_reset_cleanup(dev);
907+
err_events_cleanup:
899908
mlx5_events_cleanup(dev);
900909
err_eq_cleanup:
901910
mlx5_eq_table_cleanup(dev);
@@ -923,6 +932,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
923932
mlx5_cleanup_clock(dev);
924933
mlx5_cleanup_reserved_gids(dev);
925934
mlx5_cq_debugfs_cleanup(dev);
935+
mlx5_fw_reset_cleanup(dev);
926936
mlx5_events_cleanup(dev);
927937
mlx5_eq_table_cleanup(dev);
928938
mlx5_irq_table_cleanup(dev);
@@ -1081,6 +1091,7 @@ static int mlx5_load(struct mlx5_core_dev *dev)
10811091
goto err_fw_tracer;
10821092
}
10831093

1094+
mlx5_fw_reset_events_start(dev);
10841095
mlx5_hv_vhca_init(dev->hv_vhca);
10851096

10861097
err = mlx5_rsc_dump_init(dev);
@@ -1142,6 +1153,7 @@ static int mlx5_load(struct mlx5_core_dev *dev)
11421153
mlx5_rsc_dump_cleanup(dev);
11431154
err_rsc_dump:
11441155
mlx5_hv_vhca_cleanup(dev->hv_vhca);
1156+
mlx5_fw_reset_events_stop(dev);
11451157
mlx5_fw_tracer_cleanup(dev->tracer);
11461158
err_fw_tracer:
11471159
mlx5_eq_table_destroy(dev);
@@ -1164,6 +1176,7 @@ static void mlx5_unload(struct mlx5_core_dev *dev)
11641176
mlx5_fpga_device_stop(dev);
11651177
mlx5_rsc_dump_cleanup(dev);
11661178
mlx5_hv_vhca_cleanup(dev->hv_vhca);
1179+
mlx5_fw_reset_events_stop(dev);
11671180
mlx5_fw_tracer_cleanup(dev->tracer);
11681181
mlx5_eq_table_destroy(dev);
11691182
mlx5_irq_table_destroy(dev);

drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,8 @@ int mlx5_cmd_force_teardown_hca(struct mlx5_core_dev *dev);
128128
int mlx5_cmd_fast_teardown_hca(struct mlx5_core_dev *dev);
129129
void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force);
130130
void mlx5_error_sw_reset(struct mlx5_core_dev *dev);
131+
u32 mlx5_health_check_fatal_sensors(struct mlx5_core_dev *dev);
132+
int mlx5_health_wait_pci_up(struct mlx5_core_dev *dev);
131133
void mlx5_disable_device(struct mlx5_core_dev *dev);
132134
void mlx5_recover_device(struct mlx5_core_dev *dev);
133135
int mlx5_sriov_init(struct mlx5_core_dev *dev);

include/linux/mlx5/driver.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -501,6 +501,7 @@ struct mlx5_mpfs;
501501
struct mlx5_eswitch;
502502
struct mlx5_lag;
503503
struct mlx5_devcom;
504+
struct mlx5_fw_reset;
504505
struct mlx5_eq_table;
505506
struct mlx5_irq_table;
506507

@@ -578,6 +579,7 @@ struct mlx5_priv {
578579
struct mlx5_core_sriov sriov;
579580
struct mlx5_lag *lag;
580581
struct mlx5_devcom *devcom;
582+
struct mlx5_fw_reset *fw_reset;
581583
struct mlx5_core_roce roce;
582584
struct mlx5_fc_stats fc_stats;
583585
struct mlx5_rl_table rl_table;

0 commit comments

Comments
 (0)