@@ -140,6 +140,8 @@ static DEFINE_SPINLOCK(ndev_hash_lock);
 static DECLARE_HASHTABLE(ndev_hash, 5);
 
 static void free_netdevs(struct ib_device *ib_dev);
+static void ib_unregister_work(struct work_struct *work);
+static void __ib_unregister_device(struct ib_device *device);
 static int ib_security_change(struct notifier_block *nb, unsigned long event,
                               void *lsm_data);
 static void ib_policy_change_task(struct work_struct *work);
@@ -366,6 +368,7 @@ struct ib_device *_ib_alloc_device(size_t size)
366
368
367
369
INIT_LIST_HEAD (& device -> event_handler_list );
368
370
spin_lock_init (& device -> event_handler_lock );
371
+ mutex_init (& device -> unregistration_lock );
369
372
/*
370
373
* client_data needs to be alloc because we don't want our mark to be
371
374
* destroyed if the user stores NULL in the client data.
@@ -374,6 +377,7 @@ struct ib_device *_ib_alloc_device(size_t size)
        init_rwsem(&device->client_data_rwsem);
        INIT_LIST_HEAD(&device->port_list);
        init_completion(&device->unreg_completion);
+       INIT_WORK(&device->unregistration_work, ib_unregister_work);
 
        return device;
 }
@@ -387,6 +391,20 @@ EXPORT_SYMBOL(_ib_alloc_device);
  */
 void ib_dealloc_device(struct ib_device *device)
 {
+       if (device->ops.dealloc_driver)
+               device->ops.dealloc_driver(device);
+
+       /*
+        * ib_unregister_driver() requires all devices to remain in the xarray
+        * while their ops are callable. The last op we call is dealloc_driver
+        * above. This is needed to create a fence on op callbacks prior to
+        * allowing the driver module to unload.
+        */
+       down_write(&devices_rwsem);
+       if (xa_load(&devices, device->index) == device)
+               xa_erase(&devices, device->index);
+       up_write(&devices_rwsem);
+
        /* Expedite releasing netdev references */
        free_netdevs(device);
 
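For orientation, a minimal caller-side sketch (not part of this patch): after this change a driver that fails ib_register_device() still owns the structure and is expected to call ib_dealloc_device(), which now also drops the name/index entry from the devices xarray. The mydrv_* names below are hypothetical; only the ib_* calls come from the core code shown in the diff.

/* A minimal sketch, not from this patch; mydrv_* names are hypothetical. */
#include <rdma/ib_verbs.h>

struct mydrv_dev {
        struct ib_device ibdev;         /* embedded IB device, allocated via ib_alloc_device() */
        /* ... driver private state ... */
};

static int mydrv_register(struct mydrv_dev *mdev)
{
        int ret;

        ret = ib_register_device(&mdev->ibdev, "mydrv%d");
        if (ret) {
                /*
                 * On failure the caller keeps ownership; ib_dealloc_device()
                 * now also erases the device from the devices xarray, so no
                 * separate release_name()-style step is needed.
                 */
                ib_dealloc_device(&mdev->ibdev);
                return ret;
        }
        return 0;
}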
@@ -599,7 +617,8 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event,
 }
 
 /*
- * Assign the unique string device name and the unique device index.
+ * Assign the unique string device name and the unique device index. This is
+ * undone by ib_dealloc_device.
  */
 static int assign_name(struct ib_device *device, const char *name)
 {
@@ -640,13 +659,6 @@ static int assign_name(struct ib_device *device, const char *name)
        return ret;
 }
 
-static void release_name(struct ib_device *device)
-{
-       down_write(&devices_rwsem);
-       xa_erase(&devices, device->index);
-       up_write(&devices_rwsem);
-}
-
 static void setup_dma_device(struct ib_device *device)
 {
        struct device *parent = device->dev.parent;
@@ -740,30 +752,38 @@ static void disable_device(struct ib_device *device)
 
 /*
  * An enabled device is visible to all clients and to all the public facing
- * APIs that return a device pointer.
+ * APIs that return a device pointer. This always returns with a new get, even
+ * if it fails.
  */
-static int enable_device(struct ib_device *device)
+static int enable_device_and_get(struct ib_device *device)
 {
        struct ib_client *client;
        unsigned long index;
-       int ret;
+       int ret = 0;
 
-       refcount_set(&device->refcount, 1);
+       /*
+        * One ref belongs to the xa and the other belongs to this
+        * thread. This is needed to guard against parallel unregistration.
+        */
+       refcount_set(&device->refcount, 2);
        down_write(&devices_rwsem);
        xa_set_mark(&devices, device->index, DEVICE_REGISTERED);
-       up_write(&devices_rwsem);
+
+       /*
+        * By using downgrade_write() we ensure that no other thread can clear
+        * DEVICE_REGISTERED while we are completing the client setup.
+        */
+       downgrade_write(&devices_rwsem);
 
        down_read(&clients_rwsem);
        xa_for_each_marked(&clients, index, client, CLIENT_REGISTERED) {
                ret = add_client_context(device, client);
-               if (ret) {
-                       up_read(&clients_rwsem);
-                       disable_device(device);
-                       return ret;
-               }
+               if (ret)
+                       break;
        }
        up_read(&clients_rwsem);
-       return 0;
+       up_read(&devices_rwsem);
+       return ret;
 }
 
 /**
@@ -774,6 +794,10 @@ static int enable_device(struct ib_device *device)
  * devices with the IB core. All registered clients will receive a
  * callback for each device that is added. @device must be allocated
  * with ib_alloc_device().
+ *
+ * If the driver uses ops.dealloc_driver and calls any ib_unregister_device()
+ * asynchronously then the device pointer may become freed as soon as this
+ * function returns.
  */
 int ib_register_device(struct ib_device *device, const char *name)
 {
@@ -785,13 +809,13 @@ int ib_register_device(struct ib_device *device, const char *name)
 
        ret = setup_device(device);
        if (ret)
-               goto out;
+               return ret;
 
        ret = ib_cache_setup_one(device);
        if (ret) {
                dev_warn(&device->dev,
                         "Couldn't set up InfiniBand P_Key/GID cache\n");
-               goto out;
+               return ret;
        }
 
        ib_device_register_rdmacg(device);
@@ -807,42 +831,186 @@ int ib_register_device(struct ib_device *device, const char *name)
                goto dev_cleanup;
        }
 
-       ret = enable_device(device);
-       if (ret)
-               goto sysfs_cleanup;
+       ret = enable_device_and_get(device);
+       if (ret) {
+               void (*dealloc_fn)(struct ib_device *);
+
+               /*
+                * If we hit this error flow then we don't want to
+                * automatically dealloc the device since the caller is
+                * expected to call ib_dealloc_device() after
+                * ib_register_device() fails. This is tricky due to the
+                * possibility for a parallel unregistration along with this
+                * error flow. Since we have a refcount here we know any
+                * parallel flow is stopped in disable_device and will see the
+                * NULL pointers, causing the responsibility to
+                * ib_dealloc_device() to revert back to this thread.
+                */
+               dealloc_fn = device->ops.dealloc_driver;
+               device->ops.dealloc_driver = NULL;
+               ib_device_put(device);
+               __ib_unregister_device(device);
+               device->ops.dealloc_driver = dealloc_fn;
+               return ret;
+       }
+       ib_device_put(device);
 
        return 0;
 
-sysfs_cleanup:
-       ib_device_unregister_sysfs(device);
 dev_cleanup:
        device_del(&device->dev);
 cg_cleanup:
        ib_device_unregister_rdmacg(device);
        ib_cache_cleanup_one(device);
-out:
-       release_name(device);
        return ret;
 }
 EXPORT_SYMBOL(ib_register_device);
 
+/* Callers must hold a get on the device. */
+static void __ib_unregister_device(struct ib_device *ib_dev)
+{
+       /*
+        * We have a registration lock so that all the calls to unregister are
+        * fully fenced, once any unregister returns the device is truly
+        * unregistered even if multiple callers are unregistering it at the
+        * same time. This also interacts with the registration flow and
+        * provides sane semantics if register and unregister are racing.
+        */
+       mutex_lock(&ib_dev->unregistration_lock);
+       if (!refcount_read(&ib_dev->refcount))
+               goto out;
+
+       disable_device(ib_dev);
+       ib_device_unregister_sysfs(ib_dev);
+       device_del(&ib_dev->dev);
+       ib_device_unregister_rdmacg(ib_dev);
+       ib_cache_cleanup_one(ib_dev);
+
+       /*
+        * Drivers using the new flow may not call ib_dealloc_device except
+        * in error unwind prior to registration success.
+        */
+       if (ib_dev->ops.dealloc_driver) {
+               WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1);
+               ib_dealloc_device(ib_dev);
+       }
+out:
+       mutex_unlock(&ib_dev->unregistration_lock);
+}
+
 /**
  * ib_unregister_device - Unregister an IB device
- * @device:Device to unregister
+ * @ib_dev: The device to unregister
  *
  * Unregister an IB device. All clients will receive a remove callback.
+ *
+ * Callers should call this routine only once, and protect against races with
+ * registration. Typically it should only be called as part of a remove
+ * callback in an implementation of driver core's struct device_driver and
+ * related.
+ *
+ * If ops.dealloc_driver is used then ib_dev will be freed upon return from
+ * this function.
  */
-void ib_unregister_device(struct ib_device *device)
+void ib_unregister_device(struct ib_device *ib_dev)
 {
-       disable_device(device);
-       ib_device_unregister_sysfs(device);
-       device_del(&device->dev);
-       ib_device_unregister_rdmacg(device);
-       ib_cache_cleanup_one(device);
-       release_name(device);
+       get_device(&ib_dev->dev);
+       __ib_unregister_device(ib_dev);
+       put_device(&ib_dev->dev);
 }
 EXPORT_SYMBOL(ib_unregister_device);
 
+/**
+ * ib_unregister_device_and_put - Unregister a device while holding a 'get'
+ * @ib_dev: The device to unregister
+ *
+ * This is the same as ib_unregister_device(), except it includes an internal
+ * ib_device_put() that should match a 'get' obtained by the caller.
+ *
+ * It is safe to call this routine concurrently from multiple threads while
+ * holding the 'get'. When the function returns the device is fully
+ * unregistered.
+ *
+ * Drivers using this flow MUST use the dealloc_driver callback to clean up
+ * their resources associated with the device and dealloc it.
+ */
+void ib_unregister_device_and_put(struct ib_device *ib_dev)
+{
+       WARN_ON(!ib_dev->ops.dealloc_driver);
+       get_device(&ib_dev->dev);
+       ib_device_put(ib_dev);
+       __ib_unregister_device(ib_dev);
+       put_device(&ib_dev->dev);
+}
+EXPORT_SYMBOL(ib_unregister_device_and_put);
+
+/**
+ * ib_unregister_driver - Unregister all IB devices for a driver
+ * @driver_id: The driver to unregister
+ *
+ * This implements a fence for device unregistration. It only returns once all
+ * devices associated with the driver_id have fully completed their
+ * unregistration and returned from ib_unregister_device*().
+ *
+ * If devices are not yet unregistered it goes ahead and starts unregistering
+ * them.
+ *
+ * This does not block creation of new devices with the given driver_id, that
+ * is the responsibility of the caller.
+ */
+void ib_unregister_driver(enum rdma_driver_id driver_id)
+{
+       struct ib_device *ib_dev;
+       unsigned long index;
+
+       down_read(&devices_rwsem);
+       xa_for_each(&devices, index, ib_dev) {
+               if (ib_dev->driver_id != driver_id)
+                       continue;
+
+               get_device(&ib_dev->dev);
+               up_read(&devices_rwsem);
+
+               WARN_ON(!ib_dev->ops.dealloc_driver);
+               __ib_unregister_device(ib_dev);
+
+               put_device(&ib_dev->dev);
+               down_read(&devices_rwsem);
+       }
+       up_read(&devices_rwsem);
+}
+EXPORT_SYMBOL(ib_unregister_driver);
+
+static void ib_unregister_work(struct work_struct *work)
+{
+       struct ib_device *ib_dev =
+               container_of(work, struct ib_device, unregistration_work);
+
+       __ib_unregister_device(ib_dev);
+       put_device(&ib_dev->dev);
+}
+
+/**
+ * ib_unregister_device_queued - Unregister a device using a work queue
+ * @ib_dev: The device to unregister
+ *
+ * This schedules an asynchronous unregistration using a WQ for the device. A
+ * driver should use this to avoid holding locks while doing unregistration,
+ * such as holding the RTNL lock.
+ *
+ * Drivers using this API must use ib_unregister_driver before module unload
+ * to ensure that all scheduled unregistrations have completed.
+ */
+void ib_unregister_device_queued(struct ib_device *ib_dev)
+{
+       WARN_ON(!refcount_read(&ib_dev->refcount));
+       WARN_ON(!ib_dev->ops.dealloc_driver);
+       get_device(&ib_dev->dev);
+       if (!queue_work(system_unbound_wq, &ib_dev->unregistration_work))
+               put_device(&ib_dev->dev);
+}
+EXPORT_SYMBOL(ib_unregister_device_queued);
+
 static int assign_client_id(struct ib_client *client)
 {
        int ret;
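As a usage note (not part of the diff): the kernel-doc above describes how a driver opts into the new flow, and the hedged sketch below ties the entry points together. The mydrv_* names are invented, and RDMA_DRIVER_UNKNOWN stands in for the driver's real rdma_driver_id; only the ib_* APIs and the dealloc_driver op come from this patch.

/* A hedged sketch only; mydrv_* is hypothetical, the core APIs are from this patch. */
#include <linux/module.h>
#include <rdma/ib_verbs.h>

static void mydrv_dealloc_driver(struct ib_device *ibdev)
{
        /* Called as the last op from ib_dealloc_device(); free driver state here. */
}

static const struct ib_device_ops mydrv_ops = {
        /* ... verbs ops elided ... */
        .dealloc_driver = mydrv_dealloc_driver,         /* opts into the new flow */
};

/* Teardown from a context that holds locks (e.g. RTNL): queue it. */
static void mydrv_hot_remove(struct ib_device *ibdev)
{
        ib_unregister_device_queued(ibdev);             /* async, returns immediately */
}

/* Teardown from a thread that already holds an ib_device_get() reference. */
static void mydrv_remove_with_get(struct ib_device *ibdev)
{
        ib_unregister_device_and_put(ibdev);            /* matches the caller's get */
}

/* Module exit handler: must fence all outstanding unregistrations. */
static void __exit mydrv_exit(void)
{
        /*
         * Returns only after every device with this driver_id has finished
         * unregistering, including work queued by ib_unregister_device_queued().
         */
        ib_unregister_driver(RDMA_DRIVER_UNKNOWN);      /* placeholder driver_id */
}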
@@ -1558,6 +1726,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
        SET_DEVICE_OP(dev_ops, create_srq);
        SET_DEVICE_OP(dev_ops, create_wq);
        SET_DEVICE_OP(dev_ops, dealloc_dm);
+       SET_DEVICE_OP(dev_ops, dealloc_driver);
        SET_DEVICE_OP(dev_ops, dealloc_fmr);
        SET_DEVICE_OP(dev_ops, dealloc_mw);
        SET_DEVICE_OP(dev_ops, dealloc_pd);
@@ -1744,6 +1913,7 @@ static void __exit ib_core_cleanup(void)
        destroy_workqueue(ib_comp_wq);
        /* Make sure that any pending umem accounting work is done. */
        destroy_workqueue(ib_wq);
+       flush_workqueue(system_unbound_wq);
        WARN_ON(!xa_empty(&clients));
        WARN_ON(!xa_empty(&devices));
 }