˜uCUDA¿1Æ·»{
p∗
O¯˘E˘§E˘
` 'J4«CUDA¢y¿1Æ·»{" §'O˜uDijkstra! Bellman-Ford! ∆-
Stepping!Sparse Matrix-Vector Bellman-Ford"'˜kØu†;DijkstraBellman-Ford'O?1
¿1zU?§Øc5UZ∆-Stepping¿1{¢yCUDA†U?§J
˜uSparse Matrix-VectorBellman-Ford§'O¢yCSRELLCUDA{" 3¢'§
CUDA{c5U‘Boost¥¥«Æ·»{?1’§y†'{3m
mkr¿§3Œ5U‘‡†w"
’c Æ·»§Dijkstra§Bellman-Ford, ∆-Stepping§Sparse Matrix-Vector, CSR Bellman-Ford,
ELL Bellman-Ford, CUDA§GPU§¿1?§
1 {0
Cc5§¿1O•u—"NVIDIA!Microsoft!Intelœi'O
gC¿1O†" 32011c§A⁄kO¯))l! )P! “
¯!O¯^ı?n"CUˆ¯⁄ı?n#¿
|"~X§NVIDIA32010cØ£˜†Tegra 2V?n"
D¥?nŒ?n6Y’§ª/?n£GPU⁄¿1O3C
Ac⁄#Vg" @ˇGPU?§4¡§§SU–«n/
“\Œ§,§S\S“kX" GPUˆ{?n2:Œ
" N`4¡" §SI˘SOpenGL‰DirectX" ˇd§@ˇGPUO¿
61"
32006c§NVIDIAœ1–˜uCUDAe£Compute Unified Device Archi-
tecture⁄GPU))GeForce 8800 GTX" CUDAeGPUOO«#
‹§ƒ U 3 ˇ ^ O ¥ B" T e ƒGPUU ) ß E , O fl K§§
„CUDA-8e£ISA⁄–9GPUS¿1O" mu
g2007c–5§–CUDA C˜:A^§S§…O4J," ~
X˘ª! O6N˜˘! ‚˘+–ˇLGPU¿1O5
UJ,"'ØuÆ·»flK§J˜uCUDA†¿1{§ØuG1
{§3?J,"
'{e'|Xe" 12o(L’uÆ·»§’
«{‘" 130˜uDijkstra{CUDA" 140Bellman-
FordCUDA¿1{" 315¥§£ª∆-Stepping{CUDAU?" 16J
˜uSparse Matrix-VectorBellman-Ford¿1{" 3¢'§’
ª4«¿1{Boost¥¥G1¿1Æ·»{5U"
2 ’
kªG = (V, E)§¥|V | = n§|E| = m" 4s ∈ V L!:" z^
>e ∈ EDK›§^›…ŒL«c : E → R"3øp§•‰´^·
»›·»¥⁄k>›" Øu:Æ·»flK·Ol:s8I
!:v›^·»"
ıŒÆ·»{ocsÆ·»Œ|d§d(v)Llsv›§
¿{zg?1t£relax⁄#d" {m'§d(s) ← 0, d(v) ← ∞" zg
t^>e(v, w) ∈ E§rd(w)d(w)d(v) + c(v, w) ¥§vk!:I
#§{(" XJsv§K{(d(v) = ∞" ˜ud#“§ı
ŒÆ·»{'«label-settinglabel-correcting"Label-setting{£~X
Dijkstra⁄zgØÆ·»fi†(‰!:v’>?1t§⁄–Label-settingI
tm^>"Label-correcting{£~XBellman-Ford⁄zgØu(‰Æ·»
!:v‹?1t§⁄–L§UtLm^>"
Cherkassky [32]o(c«Æ·»{" ¥§†;:
Æ·»{)Dijkstra [1, 14, 19]! Bellman-Ford [5, 14, 23]! –9˜uØu“…Œ
A [6, 15, 7]" ˜uø†;{U?„ı" ~XØuDijkstraU?
)˜uMin-Max Heap [18]! Pairing Heap [20]! Fibonacci Heap [21]{" 5
‘ø{J‘kŁ£Q⁄¥s·»Æ!:vmE,O(logn)§u
·{NmE,O(nlogn + m)" ,§ [24] [25]'OØBellman-FordJ
U ? {" d uA { ·Dijkstra z § h(x) = 0(Ø u “ … Œ
h(x) ≤ d(x, y) + h(y))§AzDijkstra{" AA^Ø2§~XiZ
¥ˇ·{§IS˘" A{’3uØu“…Œh(x)O"
3flK¥§!:m›§vkı&E§Øu“…ŒJO§⁄
–A••flK"
Cc§Æ·»¿1{§~XCrauser [30]!Eager [31]!∆-Stepping [17]
J" ¥§Crauser5U’›‰§Io3‘k٧Ømƒ§
Øuz!:o›" EagerKI\lookaheadºŒ§^5zg
ˇØl3lookaheadS!:" lookahead‹{¿15§KH{g
Œ‹\¿Nı" ØulookaheadJK·ˇª" ⁄–Eager
{˙6ulookahead J§¿›‰" ∆-Stepping·¿1Æ·»{¥˙
p" §ºŒ∆§3{1igt§ˇØd›3i × ∆(i + 1) × ∆
m!:t" [29] [27, 8]gØ∆-Stepping{?1¢y" c·˜u9th
DIMACSCray MTA-2.¯AO‘z٧ˆ{3˚ˇ?1t" —'d(s) ← 0§d(v) ← ∞§¥v ≠ s" {
ı1n − 1t–Ø⁄k!:vsÆ·»§mE,O(n^2)"
Algorithm 1 CUDA Dijkstra
(a) Initial:
foreach v ∈ V do
    d[v] ← ∞
end
d[s] ← 0
Q ← V
while Q ≠ φ do
    (b) ExtractMin:
    u ← {v : v ∈ Q ∧ ∀w ∈ Q, d(v) ≤ d(w)}
    if d[u] = ∞ then
        break
    remove u from Q
    (c) Relax:
    foreach (u, v) ∈ E do
        if d[u] + c(u, v) < d[v] then
            d[v] ← d[u] + c(u, v)
    end
end
Algorithm1CUDA Dijkstra{e"¥§Initial!ExtractMin!Relax
U¿11" InitialmE,O(n/t)§ExtractMinO(n^2/t)§RelaxO(m/t)§
¥tL«…Œ¥§(thread)oŒ"⁄–{mE,O(n/t + n^2/t + m/t) =
O(n^2/t)"
3.2 Reduction
3¿1?§¥§Reduction [13, 10]·Nı¿1˜" ØuExtractMin§•
–^Reduction5\" Reduction˜g·Œ–'£{?
n§zg~5§˛(^{(" n§t ≥ n§
ExtractMin mE,O(logn)§duŒmˇ&flK§¢S$
œ"
3 G 1 Dijkstra { ¥ExtractMinI H { Œ | g§ m E , O(n)" Ø
uDijkstra O(logn) U ? { [18, 20, 21] · ˜ u ‘ k Ł ¢ yExtractMin
§ ‘ k Ł Œ ( ¿ • ¿ 1 {§ ⁄ – • ^ Reduction { ¢
yExtractMin" ‘:3u?§N·§ØuDijkstra'{Œ(U?=
"o⁄$§Im?1‘kŁ;"JmE,U
3O(logn)"
3 [10]¥07«Reduction{§ØReduction?/?1«‘z"
•J—«§¿ØThreadsPerBlock(TPB)ºŒ?15U’"
^Thrust¥ [11]¥Reduction! CPULoopHeap?1’" lª1–w§
3Œ5107§256! 512! 1024TPB’Loop(-O2‘z) fl§fl512TPB\
’1.65" Thrust\’·6.07" †‘zLoop$k56790nsı"
ReductionØuHeap·œ§ø·du§mˇ&§ŒD·flKE
⁄" –*§Œ5§Loop‘‡§Œ5CReduction‘
‡§⁄–¿1?§•Œ"
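|           | 1     | 10    | 10^2  | 10^3  | 10^4  | 10^5   | 10^6    | 10^7    |
|-----------|-------|-------|-------|-------|-------|--------|---------|---------|
| 128(TPB)  | 40.82 | 35.92 | 39.63 | 43.12 | 59.4  | 145.4  | 1033.26 | 8033.73 |
| 256(TPB)  | 48.56 | 57.37 | 43.96 | 43.05 | 50.49 | 112.43 | 556.43  | 4839.28 |
| 512(TPB)  | 42.64 | 47.32 | 41.2  | 41.55 | 45.97 | 92.04  | 470.487 | 4075.55 |
| 1024(TPB) | 43.81 | 43.76 | 45.52 | 49.77 | 46.74 | 108.75 | 734.59  | 6475.09 |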
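|           | 1     | 10    | 10^2  | 10^3   | 10^4   | 10^5   | 10^6  | 10^7   |
|-----------|-------|-------|-------|--------|--------|--------|-------|--------|
| Thrust    | 324.8 | 65.97 | 73.04 | 513.12 | 453.91 | 450.91 | 498.5 | 1105.1 |
| Loop      | 0     | 0     | 0     | 0      | 150    | 630    | 5620  | 56790  |
| Loop(-O2) | 0     | 0     | 0     | 0      | 0      | 0      | 630   | 6710   |
| Heap      | 0     | 0     | 0     | 0      | 0      | 0      | 0.124 | 0.1311 |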
L 1: ExtractMin Time(ns)"Œ5l1107§?11000gExtractMin†"
4 CUDA Bellman-Ford
4.1 Bellman-Ford{
Bellman-Ford{uDijkstra§§%g·Øª¥z^>ØA!:?1
t§vk!:I#§Æulabel-correcting{"–y†ıtn − 1
{(§mE,O(nm)"
Algorithm 2 CUDA Bellman-Ford
(a) Initial:
foreach v ∈ V do
    d[v] ← ∞
end
d[s] ← 0
for i ← 1 to |V| − 1 do
    (b) Relax:
    foreach (u, v) ∈ E do
        if d[u] + c(u, v) < d[v] then
            d[v] ← d[u] + c(u, v)
    end
end
Algorithm2{e§InitialRelax'Dijkstra" §
•3{¥^«ØBellman-Ford~^U?^Łzg#!:"3e
gRelaxL§¥JŁ˜!:§¿Ø’>?1t§Ł" 5
‘un − 1{U(§⁄–{mE,uO(nm)" {Øu>t
E,O(m/t)§⁄–¿1Bellman-Ford{E,O(n ∗ (m/t)) = O(nm/t)"
4.2 Ł
Bellman-Ford†;¢y·|^ŁIt!:"ø«Ł¢y
‹’{fl§t!:Œun§ªtfi†("GPU¥
ŁIv–eA:1⁄fl\!J"mE,3O(1)"2⁄GPUŁ
¥UflE£CPU¥?1"
˜u–:§•ƒ^٧¿dŁ\"S¥" øQy
GPU3O(1)m\§CPUU3O(1)mJ" ŁŒ(·
nŒ|"¥kheadtail" tail⁄·aqW!
:§Œ§⁄–Ł¥ın − 1Œ"head = tail§Ł"duŒ
|n§⁄–zg#pp ← (p + 1) mod n"ŁUØ—/|^–
c!:¢m#!:ƒ^"øŁØumƒ§Ł?§E
,§Ø\!U3O(1)⁄§Ø•GPU¥Ł¢y"3'¥§
ØGPUŁ??1U?" du¿1{§flı§ØuŁ‹)N
ıˇ&!flK§‡7$˙" ⁄–§{Ł¢yŒ|§zBlockØ
A٧?J,{˙"
4.3 Large Label LastSmall Label First‘z
[26]JØuBellman-FordLarge Label Last(LLL)Small Label First(SLF)‘
z" LLLŁ˜1§Łn§$d_{avg} = \frac{1}{n}\sum_{v=1}^{n} d(v)$" e$dist(i) > d_{avg}$Ki\
٧Øe§Ø,iƒd(i) ≤ x§Ki Ł?1t"
SLF\\!:·j§Ł˜i§ed(j) < d(i)§Kj\Ł˜§˜K\Ł
"SLF ƒJp15%–20%§SLF + LLL Jp50%"
5 CUDA ∆-Stepping
5.1 ∆-Stepping{
∆-Stepping [17]˜uDijkstra?1¿1U?"G1Dijkstra·§31i
{ˇØd(v)3i × ∆(i + 1) × ∆mv?1t"∆-Stepping^Bucket[i]51i
{It!:§Bucket = φ{(" dutgŒ(‰5§⁄–∆-
SteppingÆuLabel-correcting{" ∆߉zgBucket8§⁄–∆J⁄
{¿1zgt˙Kˇf§3¡•‹?Ø∆?1?"
Algorithm3CUDA ∆-Stepping{e§{¥Initial!Add To Request!
Relax–¿11"
5.2
Initial
E¥z^>·Æ§⁄–Øu>—'z–¿11§heavy(v)light(v)
—'z3O(m/t)mS⁄" 3'{¥§duS! w–9?§B
|5ˇ§Initial¢y·3Relax¥¿¢y§3…Œ¥=c(v, w) >
∆‰c(v, w) ≤ ∆5(‰ØA>a." ª~§–Øulightheavy⁄^
Sm§duheavylight3—'z;/)§mˇ&!
flK–¿/" 3relax¥z§Øc(v, w)⁄O\mا⁄
–{N‹$"
5.3 Add To Request
{§Req(v, x):ا¥vLI#!:§x·#l"
duReq·(‰§Æ0§m" XJ3§S—'‹2 × m
£vx⁄SReq§uª§mL" ,§Req5(
ØuO··¶§IosizeC5Req§mk
Algorithm 3 CUDA ∆-Stepping
(a) Initial:
foreach v ∈ V do
    heavy(v) ← {(v, w) ∈ E : c(v, w) > ∆}
    light(v) ← {(v, w) ∈ E : c(v, w) ≤ ∆}
    d(v) ← ∞
end
relax(s, 0)
i ← 0
while B ≠ φ do
    S ← φ
    while B[i] ≠ φ do
        (b1) Add To Request:
        Req ← {(w, d(v) + c(v, w)) : v ∈ B[i] ∧ (v, w) ∈ light(v)}
        S ← S ∪ B[i]
        B[i] ← φ
        (c1) Relax:
        foreach (v, x) ∈ Req do
            relax(v, x)
        end
    end
    (b2) Add To Request:
    Req ← {(w, d(v) + c(v, w)) : v ∈ S ∧ (v, w) ∈ heavy(v)}
    (c2) Relax:
    foreach (v, x) ∈ Req do
        relax(v, x)
    end
    i ← i + 1
end
Procedure relax(v, x)
if x < d(v) then
    B[⌊d(v)/∆⌋] ← B[⌊d(v)/∆⌋] \ {v}
    B[⌊x/∆⌋] ← B[⌊x/∆⌋] ∪ {v}
    d(v) ← x
§U?1#§¿1Ø$" §§mˇ&! flK·m·¶
"
u·§•^ReqI!:{§=#!:wc!:v" 3
Relax¥?1A?U=" ø—?kn1§!m" ’c
k2 × mmƒ§{ıInm£=!:Œ8⁄" 1§${E
,"ØuReqO\Iv·˜ÆuB[i]\\§{¢ydRelax⁄"§
~›§•^˜uReductionprefix sum [12]{OReqIndex§Ø
A!:3Req¥§ØReq#–¿11§dd)ߡ&!flK§
Jp¿1˙"
5.4 Relax
˜k§Øuz§tidreqSizeØ3Req¥ØA!:ØAI#!
:Œ"reqSizeaquJCompressed Sparse Row¥Cp¢Œ|"reqSize[i]P
„ci − 1!:Lo" 3#?1c(v, w)5(‰heavy‰light>§–?1
Aheavy‰lightª#" lightªmE,·O(l/t)§heavyO(h/t)§
¥lh'O·lightheavy#>Œ§t§oŒ"
5.5 BucketŒ(
3{¥§,’:·B[i]Œ(" {¥§Bucket8a." ø
(Ø•^uC++ STL¥seta§ØuCUDA?§K¿•" •3relaxF"
ØBucket?1O(1)m\! " duCUDAJłseta.§⁄–Oø
Œ(·~]§S! m! ˇ&flKU⁄·¶" ˜
uøflK§•Ø{?1U?–•^uCUDA?§" c[*B[i]–uy§
z!:v3mUÆuBucket§3B[i]B[j]„v„§
¥i = j" ,3¥ØuB = φB[i] = φ·OBucketI˜fl
K" 3•{¥§OBIndex(v)Œ^5v⁄ÆBucket¢" ø
—?k1§!S" BIndexInmŒ|=§Øu
{B[i]8a.⁄^m£max(p)/∆" ¥§p ∈ P §P L⁄kU
·»⁄!Nı" 1§B" Øu8B[i]«O·~E,
§Uˆ{y3O(1)mS⁄"ØuBIndex(v)Ø!\!–9
S¿UyO(1)"
5.6 ∆J
∆–kn«J1)$\Delta = \mathrm{MEDIAN}(c(e))$§2)$\Delta = \frac{1}{n}\sum_{e \in E} c(e)$§3)$\Delta = \max\{c(e) : e \in E\} / \max\{d : d \in Degree\}$" §Meyer [17]y†re-insertionoŒ3$|P_{\Delta}|$§re-
relaxationoŒ3$|P_{2\Delta}|$" ¥§$|P_{\Delta}|$LL·»∆8" Øu?
∆§phases Œ‰3dc/∆lmax§¥dc = max{d(v) : d(v) < ∞}" ⁄–§