-
Notifications
You must be signed in to change notification settings - Fork 0
/
executionModel.tex
185 lines (168 loc) · 6.22 KB
/
executionModel.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
\begin{frame}[t]
\frametitle{The Execution Model}
\begin{itemize}
\item Several objects live on a single ``processor''
\begin{itemize}
\item We will come back to what we mean by a processor.
\begin{itemize}
\item For now, think of it as a core
\end{itemize}
\end{itemize}
\pause
\item As a result,
\begin{itemize}
\item Method invocations directed at objects on that processor will have to be stored in a pool
\pause
\item And a user-level scheduler will select one invocation from the queue and run it to completion
\end{itemize}
\end{itemize}
\begin{center} \includegraphics[width=0.7\textwidth]{figures/scheduler} \end{center}
\end{frame}
\begin{frame}[t]
\frametitle{Message-driven Execution}
\begin{itemize}
\item Execution is triggered by availability of a ``message'' (a method invocation)
\pause
\item When an entry method executes,
\begin{itemize}
\item it may generate messages for other objects
\item the RTS deposits them in the message queue on the target processor
\end{itemize}
\end{itemize}
\begin{center} \includegraphics[width=0.7\textwidth]{figures/scheduler} \end{center}
\end{frame}
\begin{frame}[t]
\frametitle{Utility for Multi-cores, Many-cores, Accelerators}
\begin{itemize}
\item Objects connote and promote locality
\item Message-driven execution is
\begin{itemize}
\item A strong principle of prediction for data and code use
\item Much stronger than the principle of locality
\begin{itemize}
\item Can be used to scale the memory wall
\item Prefetching of needed data, e.g., into scratch pad memories
\end{itemize}
\end{itemize}
\end{itemize}
\begin{center} \includegraphics[width=0.7\textwidth]{figures/scheduler} \end{center}
\end{frame}
\begin{frame}[t]
\frametitle{Impact on communication}
\begin{itemize}
\item Current use of communication network
\begin{itemize}
\item Compute-communicate cycles in typical MPI apps
\item Network is used for a fraction of the time
\item And is on the critical path
\end{itemize}
\pause
\item Hence, current communication networks are over-engineered by necessity
\pause
\item With overdecomposition
\begin{itemize}
\item Communication is spread over an iteration
\item Adaptive overlap of communication and computation
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}[t]
\frametitle{Example: Stencil Computation}
\begin{itemize}
\item Consider a simple stencil computation
\begin{itemize}
\item With a traditional design (e.g., MPI-based)
\begin{itemize}
\item Each processor has a chunk, which alternates between computing and communicating
\end{itemize}
\pause
\item With Charm++
\begin{itemize}
\item Multiple chunks on each processor
\item Wait time for each chunk overlapped with useful computation for others
\item Communication spread over time
\end{itemize}
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}[t]
\frametitle{Example: Stencil Computation}
\begin{center} \includegraphics[width=\textwidth]{figures/stencil_timeline} \end{center}
\begin{center} Stencil in MPI: No overlap between computation and
communication\end{center}
\pause
\begin{center} \includegraphics[width=\textwidth]{figures/stencil_timeline2} \end{center}
\begin{center} Stencil in Charm++: Communication of a chare overlaps with
computation of others \end{center}
\end{frame}
\begin{frame}[t]
\frametitle{Example: Multiple Modules}
Without message-driven execution (and virtualization), you get either: Space-division
\begin{center} \includegraphics[width=\textwidth]{figures/stencil_space} \end{center}
\end{frame}
\begin{frame}[t]
\frametitle{Example: Multiple Modules}
Sequentialization
\begin{center} \includegraphics[width=\textwidth]{figures/stencil_seq} \end{center}
\end{frame}
\begin{frame}[t]
\frametitle{Example: Multiple Modules}
Parallel Composition: A1; (B $\parallel$ C); A2
\begin{center} \includegraphics[width=\textwidth]{figures/stencil_charm} \end{center}
Recall: Different modules, written in different languages/paradigms, can overlap
in time and on processors, without the programmer having to worry about this
explicitly
\end{frame}
%this is covered in design examples case studies section
%\begin{frame}[t]
%\frametitle{MD Parallelization Using Charm++}
%The computation is decomposed into ``natural'' objects of the application, which
%are assigned to processors by Charm++ RTS
% \begin{center} \includegraphics[width=\textwidth]{figures/md_parallelize.pdf}% \end{center}
%\end{frame}
\begin{frame}[t]
\frametitle{Migratability}
\begin{itemize}
\item Once the programmer has written the code without reference to
processors, all of the communication is expressed among objects
\item The system is free to migrate the objects across processors as and when it pleases
\begin{itemize}
\item It must ensure it can deliver method invocations to the objects, wherever they go
\item This migratability turns out to be a key attribute for empowering an adaptive runtime system
\end{itemize}
\end{itemize}
\end{frame}
% TODO: fill in these slides
%\begin{frame}[t]
% \frametitle{Load Balancing}
%\end{frame}
%\begin{frame}[t]
% \frametitle{Automatic Overlap of Communication and Computation}
%\end{frame}
%\begin{frame}[t]
% \frametitle{Fault Tolerance}
%\end{frame}
\begin{frame}[t]
\frametitle{Decomposition Independent of numCores}
\begin{columns}
\column{.7\textwidth}
\begin{itemize}
\item Rocket simulation under traditional MPI
\end{itemize}
\begin{center} \includegraphics[width=.6\textwidth]{figures/rocket_mpi} \end{center}
\pause
\begin{itemize}
\item Rocket simulation with migratable objects
\begin{center} \includegraphics[width=.6\textwidth]{figures/rocket_charm} \end{center}
\begin{itemize}
\item Benefits: load balance, communication optimizations, modularity
\end{itemize}
\end{itemize}
\onslide<1-> {
\column{.3\textwidth}
\vfill
\begin{center} \includegraphics<0->[width=\textwidth]{figures/rocket.png} \end{center}
\vfill
}
\end{columns}
\end{frame}