Commit d089989

RDMA/device: Provide APIs from the core code to help unregistration
These APIs are intended to support drivers that exist outside the usual driver core probe()/remove() callbacks. Normally the driver core prevents remove() from running concurrently with probe(); once this safety is lost, drivers need more support to get the locking and lifetimes right.

ib_unregister_driver() is intended to be used during module_exit of a driver using these APIs. It unregisters all the associated ib_devices.

ib_unregister_device_and_put() is to be used by a driver-specific removal function (i.e. removal by name, removal from a netdev notifier, removal from netlink).

ib_unregister_device_queued() is to be used from netdev notifier chains where RTNL is held.

The locking is tricky here since once things become async it is possible to race unregister with registration. This is largely solved by relying on the registration refcount: unregistration only ever works on something that has a positive registration refcount, and an unregistration mutex then serializes all competing unregistrations of the same device.

Signed-off-by: Jason Gunthorpe <[email protected]>
1 parent 4c173f5 commit d089989
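
As a rough illustration of how a driver consumes these APIs, here is a hedged sketch (not part of the commit): the foo_* names, the embedded ib_dev member, and the notifier wiring are hypothetical; only the calls into the new core APIs and the dealloc_driver op come from this patch.

#include <linux/module.h>
#include <linux/netdevice.h>
#include <rdma/ib_verbs.h>

/* Hypothetical driver state; a real driver embeds struct ib_device like this */
struct foo_dev {
        struct ib_device ib_dev;
        /* ... driver state ... */
};

/*
 * A driver using the async unregister flow must set ops.dealloc_driver so
 * the core can free driver resources when it deallocates the device.
 */
static void foo_dealloc_driver(struct ib_device *ib_dev)
{
        /* free resources hung off the containing foo_dev */
}

static int foo_netdev_event(struct notifier_block *nb, unsigned long event,
                            void *ptr)
{
        struct foo_dev *fdev = foo_dev_from_netdev(ptr); /* hypothetical lookup */

        /* RTNL is held in this chain, so the unregister must be queued */
        if (event == NETDEV_UNREGISTER && fdev)
                ib_unregister_device_queued(&fdev->ib_dev);
        return NOTIFY_DONE;
}

/* Removal by name or via netlink; the caller obtained a 'get' on the device */
static void foo_remove(struct foo_dev *fdev)
{
        ib_unregister_device_and_put(&fdev->ib_dev);
}

static void __exit foo_exit(void)
{
        /*
         * Fences all unregistrations, including any still sitting on the
         * work queue, before the module text can go away.
         */
        ib_unregister_driver(RDMA_DRIVER_UNKNOWN); /* placeholder driver_id */
}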

2 files changed: +217, -36 lines

drivers/infiniband/core/device.c

Lines changed: 206 additions & 36 deletions
@@ -140,6 +140,8 @@ static DEFINE_SPINLOCK(ndev_hash_lock);
 static DECLARE_HASHTABLE(ndev_hash, 5);
 
 static void free_netdevs(struct ib_device *ib_dev);
+static void ib_unregister_work(struct work_struct *work);
+static void __ib_unregister_device(struct ib_device *device);
 static int ib_security_change(struct notifier_block *nb, unsigned long event,
                               void *lsm_data);
 static void ib_policy_change_task(struct work_struct *work);
@@ -366,6 +368,7 @@ struct ib_device *_ib_alloc_device(size_t size)
 
         INIT_LIST_HEAD(&device->event_handler_list);
         spin_lock_init(&device->event_handler_lock);
+        mutex_init(&device->unregistration_lock);
         /*
          * client_data needs to be alloc because we don't want our mark to be
          * destroyed if the user stores NULL in the client data.
@@ -374,6 +377,7 @@ struct ib_device *_ib_alloc_device(size_t size)
         init_rwsem(&device->client_data_rwsem);
         INIT_LIST_HEAD(&device->port_list);
         init_completion(&device->unreg_completion);
+        INIT_WORK(&device->unregistration_work, ib_unregister_work);
 
         return device;
 }
@@ -387,6 +391,20 @@ EXPORT_SYMBOL(_ib_alloc_device);
  */
 void ib_dealloc_device(struct ib_device *device)
 {
+        if (device->ops.dealloc_driver)
+                device->ops.dealloc_driver(device);
+
+        /*
+         * ib_unregister_driver() requires all devices to remain in the xarray
+         * while their ops are callable. The last op we call is dealloc_driver
+         * above. This is needed to create a fence on op callbacks prior to
+         * allowing the driver module to unload.
+         */
+        down_write(&devices_rwsem);
+        if (xa_load(&devices, device->index) == device)
+                xa_erase(&devices, device->index);
+        up_write(&devices_rwsem);
+
         /* Expedite releasing netdev references */
         free_netdevs(device);
 
@@ -599,7 +617,8 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event,
 }
 
 /*
- * Assign the unique string device name and the unique device index.
+ * Assign the unique string device name and the unique device index. This is
+ * undone by ib_dealloc_device.
  */
 static int assign_name(struct ib_device *device, const char *name)
 {
@@ -640,13 +659,6 @@ static int assign_name(struct ib_device *device, const char *name)
         return ret;
 }
 
-static void release_name(struct ib_device *device)
-{
-        down_write(&devices_rwsem);
-        xa_erase(&devices, device->index);
-        up_write(&devices_rwsem);
-}
-
 static void setup_dma_device(struct ib_device *device)
 {
         struct device *parent = device->dev.parent;
@@ -740,30 +752,38 @@ static void disable_device(struct ib_device *device)
 
 /*
  * An enabled device is visible to all clients and to all the public facing
- * APIs that return a device pointer.
+ * APIs that return a device pointer. This always returns with a new get, even
+ * if it fails.
  */
-static int enable_device(struct ib_device *device)
+static int enable_device_and_get(struct ib_device *device)
 {
         struct ib_client *client;
         unsigned long index;
-        int ret;
+        int ret = 0;
 
-        refcount_set(&device->refcount, 1);
+        /*
+         * One ref belongs to the xa and the other belongs to this
+         * thread. This is needed to guard against parallel unregistration.
+         */
+        refcount_set(&device->refcount, 2);
         down_write(&devices_rwsem);
         xa_set_mark(&devices, device->index, DEVICE_REGISTERED);
-        up_write(&devices_rwsem);
+
+        /*
+         * By using downgrade_write() we ensure that no other thread can clear
+         * DEVICE_REGISTERED while we are completing the client setup.
+         */
+        downgrade_write(&devices_rwsem);
 
         down_read(&clients_rwsem);
         xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
                 ret = add_client_context(device, client);
-                if (ret) {
-                        up_read(&clients_rwsem);
-                        disable_device(device);
-                        return ret;
-                }
+                if (ret)
+                        break;
         }
         up_read(&clients_rwsem);
-        return 0;
+        up_read(&devices_rwsem);
+        return ret;
 }
 
 /**
@@ -774,6 +794,10 @@ static int enable_device(struct ib_device *device)
  * devices with the IB core. All registered clients will receive a
  * callback for each device that is added. @device must be allocated
  * with ib_alloc_device().
+ *
+ * If the driver uses ops.dealloc_driver and calls any ib_unregister_device()
+ * asynchronously then the device pointer may become freed as soon as this
+ * function returns.
  */
 int ib_register_device(struct ib_device *device, const char *name)
 {
@@ -785,13 +809,13 @@ int ib_register_device(struct ib_device *device, const char *name)
 
         ret = setup_device(device);
         if (ret)
-                goto out;
+                return ret;
 
         ret = ib_cache_setup_one(device);
         if (ret) {
                 dev_warn(&device->dev,
                          "Couldn't set up InfiniBand P_Key/GID cache\n");
-                goto out;
+                return ret;
         }
 
         ib_device_register_rdmacg(device);
@@ -807,42 +831,186 @@ int ib_register_device(struct ib_device *device, const char *name)
                 goto dev_cleanup;
         }
 
-        ret = enable_device(device);
-        if (ret)
-                goto sysfs_cleanup;
+        ret = enable_device_and_get(device);
+        if (ret) {
+                void (*dealloc_fn)(struct ib_device *);
+
+                /*
+                 * If we hit this error flow then we don't want to
+                 * automatically dealloc the device since the caller is
+                 * expected to call ib_dealloc_device() after
+                 * ib_register_device() fails. This is tricky due to the
+                 * possibility for a parallel unregistration along with this
+                 * error flow. Since we have a refcount here we know any
+                 * parallel flow is stopped in disable_device and will see the
+                 * NULL pointers, causing the responsibility to
+                 * ib_dealloc_device() to revert back to this thread.
+                 */
+                dealloc_fn = device->ops.dealloc_driver;
+                device->ops.dealloc_driver = NULL;
+                ib_device_put(device);
+                __ib_unregister_device(device);
+                device->ops.dealloc_driver = dealloc_fn;
+                return ret;
+        }
+        ib_device_put(device);
 
         return 0;
 
-sysfs_cleanup:
-        ib_device_unregister_sysfs(device);
 dev_cleanup:
         device_del(&device->dev);
 cg_cleanup:
         ib_device_unregister_rdmacg(device);
         ib_cache_cleanup_one(device);
-out:
-        release_name(device);
         return ret;
 }
 EXPORT_SYMBOL(ib_register_device);
 
+/* Callers must hold a get on the device. */
+static void __ib_unregister_device(struct ib_device *ib_dev)
+{
+        /*
+         * We have a registration lock so that all the calls to unregister are
+         * fully fenced, once any unregister returns the device is truly
+         * unregistered even if multiple callers are unregistering it at the
+         * same time. This also interacts with the registration flow and
+         * provides sane semantics if register and unregister are racing.
+         */
+        mutex_lock(&ib_dev->unregistration_lock);
+        if (!refcount_read(&ib_dev->refcount))
+                goto out;
+
+        disable_device(ib_dev);
+        ib_device_unregister_sysfs(ib_dev);
+        device_del(&ib_dev->dev);
+        ib_device_unregister_rdmacg(ib_dev);
+        ib_cache_cleanup_one(ib_dev);
+
+        /*
+         * Drivers using the new flow may not call ib_dealloc_device except
+         * in error unwind prior to registration success.
+         */
+        if (ib_dev->ops.dealloc_driver) {
+                WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1);
+                ib_dealloc_device(ib_dev);
+        }
+out:
+        mutex_unlock(&ib_dev->unregistration_lock);
+}
+
 /**
  * ib_unregister_device - Unregister an IB device
- * @device:Device to unregister
+ * @ib_dev: The device to unregister
  *
  * Unregister an IB device. All clients will receive a remove callback.
+ *
+ * Callers should call this routine only once, and protect against races with
+ * registration. Typically it should only be called as part of a remove
+ * callback in an implementation of driver core's struct device_driver and
+ * related.
+ *
+ * If ops.dealloc_driver is used then ib_dev will be freed upon return from
+ * this function.
  */
-void ib_unregister_device(struct ib_device *device)
+void ib_unregister_device(struct ib_device *ib_dev)
 {
-        disable_device(device);
-        ib_device_unregister_sysfs(device);
-        device_del(&device->dev);
-        ib_device_unregister_rdmacg(device);
-        ib_cache_cleanup_one(device);
-        release_name(device);
+        get_device(&ib_dev->dev);
+        __ib_unregister_device(ib_dev);
+        put_device(&ib_dev->dev);
 }
 EXPORT_SYMBOL(ib_unregister_device);
 
+/**
+ * ib_unregister_device_and_put - Unregister a device while holding a 'get'
+ * @ib_dev: The device to unregister
+ *
+ * This is the same as ib_unregister_device(), except it includes an internal
+ * ib_device_put() that should match a 'get' obtained by the caller.
+ *
+ * It is safe to call this routine concurrently from multiple threads while
+ * holding the 'get'. When the function returns the device is fully
+ * unregistered.
+ *
+ * Drivers using this flow MUST use the dealloc_driver callback to clean up
+ * their resources associated with the device and dealloc it.
+ */
+void ib_unregister_device_and_put(struct ib_device *ib_dev)
+{
+        WARN_ON(!ib_dev->ops.dealloc_driver);
+        get_device(&ib_dev->dev);
+        ib_device_put(ib_dev);
+        __ib_unregister_device(ib_dev);
+        put_device(&ib_dev->dev);
+}
+EXPORT_SYMBOL(ib_unregister_device_and_put);
+
+/**
+ * ib_unregister_driver - Unregister all IB devices for a driver
+ * @driver_id: The driver to unregister
+ *
+ * This implements a fence for device unregistration. It only returns once all
+ * devices associated with the driver_id have fully completed their
+ * unregistration and returned from ib_unregister_device*().
+ *
+ * If devices are not yet unregistered it goes ahead and starts unregistering
+ * them.
+ *
+ * This does not block creation of new devices with the given driver_id; that
+ * is the responsibility of the caller.
+ */
+void ib_unregister_driver(enum rdma_driver_id driver_id)
+{
+        struct ib_device *ib_dev;
+        unsigned long index;
+
+        down_read(&devices_rwsem);
+        xa_for_each (&devices, index, ib_dev) {
+                if (ib_dev->driver_id != driver_id)
+                        continue;
+
+                get_device(&ib_dev->dev);
+                up_read(&devices_rwsem);
+
+                WARN_ON(!ib_dev->ops.dealloc_driver);
+                __ib_unregister_device(ib_dev);
+
+                put_device(&ib_dev->dev);
+                down_read(&devices_rwsem);
+        }
+        up_read(&devices_rwsem);
+}
+EXPORT_SYMBOL(ib_unregister_driver);
+
+static void ib_unregister_work(struct work_struct *work)
+{
+        struct ib_device *ib_dev =
+                container_of(work, struct ib_device, unregistration_work);
+
+        __ib_unregister_device(ib_dev);
+        put_device(&ib_dev->dev);
+}
+
+/**
+ * ib_unregister_device_queued - Unregister a device using a work queue
+ * @ib_dev: The device to unregister
+ *
+ * This schedules an asynchronous unregistration using a WQ for the device. A
+ * driver should use this to avoid holding locks while doing unregistration,
+ * such as holding the RTNL lock.
+ *
+ * Drivers using this API must use ib_unregister_driver before module unload
+ * to ensure that all scheduled unregistrations have completed.
+ */
+void ib_unregister_device_queued(struct ib_device *ib_dev)
+{
+        WARN_ON(!refcount_read(&ib_dev->refcount));
+        WARN_ON(!ib_dev->ops.dealloc_driver);
+        get_device(&ib_dev->dev);
+        if (!queue_work(system_unbound_wq, &ib_dev->unregistration_work))
+                put_device(&ib_dev->dev);
+}
+EXPORT_SYMBOL(ib_unregister_device_queued);
+
 static int assign_client_id(struct ib_client *client)
 {
         int ret;
@@ -1558,6 +1726,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
         SET_DEVICE_OP(dev_ops, create_srq);
         SET_DEVICE_OP(dev_ops, create_wq);
         SET_DEVICE_OP(dev_ops, dealloc_dm);
+        SET_DEVICE_OP(dev_ops, dealloc_driver);
         SET_DEVICE_OP(dev_ops, dealloc_fmr);
         SET_DEVICE_OP(dev_ops, dealloc_mw);
         SET_DEVICE_OP(dev_ops, dealloc_pd);
@@ -1744,6 +1913,7 @@ static void __exit ib_core_cleanup(void)
         destroy_workqueue(ib_comp_wq);
         /* Make sure that any pending umem accounting work is done. */
         destroy_workqueue(ib_wq);
+        flush_workqueue(system_unbound_wq);
         WARN_ON(!xa_empty(&clients));
         WARN_ON(!xa_empty(&devices));
 }
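
Distilled from the functions above, the race-resolution pattern is small enough to state on its own. A hedged sketch with generic names (this is not the kernel code; do_teardown() stands in for the disable/sysfs/cgroup/cache teardown sequence):

struct obj {
        refcount_t registration_refcount;   /* dropped to zero by teardown */
        struct mutex unregistration_lock;   /* serializes competing unregisters */
};

static void obj_unregister(struct obj *o)
{
        mutex_lock(&o->unregistration_lock);
        /*
         * Only a positive registration refcount means there is still
         * something to unregister; a racing caller that took the mutex
         * first has already torn the object down and zeroed the count.
         */
        if (refcount_read(&o->registration_refcount))
                do_teardown(o);             /* hypothetical; zeroes the refcount */
        mutex_unlock(&o->unregistration_lock);
}

Every unregister entry point (synchronous, and-put, queued, and driver-wide) funnels into one such function, which is why they may race freely: the mutex makes them mutually exclusive and the refcount check makes the loser a no-op.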
