
# -*- coding: utf-8 -*-
# Copyright (C) 2011-2017 Martin Sandve Alnæs
#
# This file is part of UFLACS.
#
# UFLACS is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# UFLACS is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with UFLACS. If not, see <http://www.gnu.org/licenses/>

"""Controlling algorithm for building the tabulate_tensor source structure from factorized representation."""

from collections import defaultdict
import itertools

from ufl import product
from ufl.classes import ConstantValue, Condition
from ufl.measure import custom_integral_types, point_integral_types
from ufl.utils.indexflattening import shape_to_strides

from ffc.log import error, warning, debug

from ffc.uflacs.build_uflacs_ir import get_common_block_data
from ffc.uflacs.elementtables import piecewise_ttypes
from ffc.uflacs.language.cnodes import pad_innermost_dim, pad_dim, MemZero, MemZeroRange


class IntegralGenerator(object):

    def __init__(self, ir, backend, precision):
        # Store ir
        self.ir = ir

        # Formatting precision
        self.precision = precision

        # Backend specific plugin with attributes
        # - language: for translating ufl operators to target language
        # - symbols: for naming variables and accessing backend specific quantities
        # - definitions: for defining backend specific variables
        # - access: for accessing backend specific variables
        self.backend = backend

        # Set of operator names code has been generated for,
        # used in the end for selecting necessary includes
        self._ufl_names = set()

        # Initialize lookup tables for variable scopes
        self.init_scopes()

        # Cache of reusable blocks contributing to A
        self.shared_blocks = {}

        # Block contributions collected during generation to be added to A at the end
        self.finalization_blocks = defaultdict(list)

        # Set of counters used for assigning names to intermediate variables
        # TODO: Should this be part of the backend symbols? Doesn't really matter now.
        self.symbol_counters = defaultdict(int)

        # Whether preintegration tables should be inlined, or added as static arrays
        self._inline_tables = None

        # Whether to unroll the tabulate_tensor assignments
        self._unroll_tables = None

        # The maximum size of any preintegration table in order to perform unrolling
        self.max_unroll_table_size = 1024

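    # Illustrative driver sketch (hypothetical variable names; the real caller
    # lives elsewhere in ffc.uflacs):
    #   ig = IntegralGenerator(ir, backend, precision)
    #   code = ig.generate()            # CNodes AST for the tabulate_tensor body
    #   includes = ig.get_includes()    # headers required by the generated code
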
    def get_includes(self):
        "Return list of include statements needed to support generated code."
        includes = set()

        # Get std::fill used by MemZero
        includes.add("#include <algorithm>")

        # For controlling floating point environment
        #includes.add("#include <cfenv>")

        # For intel intrinsics and controlling floating point environment
        #includes.add("#include <xmmintrin.h>")

        cmath_names = set((
            "abs", "sign", "pow", "sqrt", "exp", "ln",
            "cos", "sin", "tan", "acos", "asin", "atan", "atan_2",
            "cosh", "sinh", "tanh", "acosh", "asinh", "atanh",
            "erf", "erfc",
        ))

        boost_math_names = set((
            "bessel_j", "bessel_y", "bessel_i", "bessel_k",
        ))

        # Only return the necessary headers
        if cmath_names & self._ufl_names:
            includes.add("#include <cmath>")

        if boost_math_names & self._ufl_names:
            includes.add("#include <boost/math/special_functions.hpp>")

        return sorted(includes)

    def init_scopes(self):
        "Initialize variable scope dicts."
        # Reset variables, with a separate scope for each quadrature loop
        self.scopes = {num_points: {} for num_points in self.ir["all_num_points"]}
        self.scopes[None] = {}

    def set_var(self, num_points, v, vaccess):
        """Set a new variable in variable scope dicts.

        Scope is determined by num_points which identifies the
        quadrature loop scope, or None if outside quadrature loops.

        v is the ufl expression and vaccess is the CNodes
        expression to access the value in the code.
        """
        self.scopes[num_points][v] = vaccess

    def has_var(self, num_points, v):
        """Check if a variable exists in the variable scope dicts.

        Return True if the ufl expression v exists in the num_points scope.

        NB! Does not fall back to the piecewise scope.
        """
        return v in self.scopes[num_points]

    def get_var(self, num_points, v):
        """Lookup ufl expression v in the variable scope dicts.

        Scope is determined by num_points which identifies the
        quadrature loop scope, or None if outside quadrature loops.

        If v is not found in the quadrature loop scope, the piecewise
        scope (None) is checked.

        Returns the CNodes expression to access the value in the code.
        """
        if v._ufl_is_literal_:
            return self.backend.ufl_to_language(v)
        f = self.scopes[num_points].get(v)
        if f is None:
            f = self.scopes[None][v]
        return f

    def new_temp_symbol(self, basename):
        "Create a new code symbol named basename + running counter."
        L = self.backend.language
        name = "%s%d" % (basename, self.symbol_counters[basename])
        self.symbol_counters[basename] += 1
        return L.Symbol(name)

    def get_temp_symbol(self, tempname, key):
        key = (tempname,) + key
        s = self.shared_blocks.get(key)
        defined = s is not None
        if not defined:
            s = self.new_temp_symbol(tempname)
            self.shared_blocks[key] = s
        return s, defined

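    # Usage sketch (hypothetical key contents): the first call for a given key
    # creates the symbol, later calls reuse it so the declaration is emitted
    # only once:
    #   fw, defined = self.get_temp_symbol("fw", (num_points, factor_index))
    #   if not defined:
    #       quadparts.append(L.VariableDecl("const double", fw, fw_rhs))
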
    def generate(self):
        """Generate entire tabulate_tensor body.

        Assumes that the code returned from here will be wrapped in a
        context that matches a suitable version of the UFC
        tabulate_tensor signatures.
        """
        L = self.backend.language

        # Assert that scopes are empty: expecting this to be called only once
        assert not any(d for d in self.scopes.values())

        parts = []

        # TODO: Is this needed? Find a test case to check.
        parts += [
            #L.VerbatimStatement("_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);"),
            #L.VerbatimStatement("std::fesetenv(FE_DFL_DISABLE_SSE_DENORMS_ENV);"),
        ]

        # Generate the tables of quadrature points and weights
        parts += self.generate_quadrature_tables()

        # Generate the tables of basis function values and preintegrated blocks
        parts += self.generate_element_tables()

        # Generate code to compute piecewise constant scalar factors
        parts += self.generate_unstructured_piecewise_partition()

        # Loop generation code will produce parts to go before quadloops,
        # to define the quadloops, and to go after the quadloops
        all_preparts = []
        all_quadparts = []
        all_postparts = []

        # Go through each relevant quadrature loop
        if self.ir["integral_type"] in custom_integral_types:
            preparts, quadparts, postparts = \
                self.generate_runtime_quadrature_loop()
            all_preparts += preparts
            all_quadparts += quadparts
            all_postparts += postparts
        else:
            for num_points in self.ir["all_num_points"]:
                # Generate code to integrate reusable blocks of final element tensor
                preparts, quadparts, postparts = \
                    self.generate_quadrature_loop(num_points)
                all_preparts += preparts
                all_quadparts += quadparts
                all_postparts += postparts

        # Generate code to finish computing reusable blocks outside quadloop
        preparts, quadparts, postparts = \
            self.generate_dofblock_partition(None)
        all_preparts += preparts
        all_quadparts += quadparts
        all_postparts += postparts

        # Generate code to fill in A
        all_finalizeparts = []

        # Generate code to set A = 0
        #all_finalizeparts += self.generate_tensor_reset()

        # Generate code to compute piecewise constant scalar factors
        # and set A at corresponding nonzero components
        all_finalizeparts += self.generate_preintegrated_dofblock_partition()

        # Generate code to add reusable blocks B* to element tensor A
        all_finalizeparts += self.generate_copyout_statements()

        # Collect parts before, during, and after quadrature loops
        parts += all_preparts
        parts += all_quadparts
        parts += all_postparts
        parts += all_finalizeparts

        return L.StatementList(parts)

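    # Rough shape of the body assembled by generate() for a plain cell
    # integral (illustrative only; actual symbol names come from the backend):
    #
    #   <static const double weights*/points*/FE* tables>
    #   <piecewise partition: sp[...] intermediates>
    #   <preparts: block declarations, e.g. double BF0[3][4] = {};>
    #   for (int iq = 0; iq < num_points; ++iq)
    #       <varying partition and block accumulation (quadparts)>
    #   <postparts and finalization: A[...] += BF0[...], etc.>
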
    def generate_quadrature_tables(self):
        "Generate static tables of quadrature points and weights."
        L = self.backend.language

        parts = []

        # No quadrature tables for custom (given argument)
        # or point (evaluation in single vertex)
        skip = custom_integral_types + point_integral_types
        if self.ir["integral_type"] in skip:
            return parts

        alignas = self.ir["params"]["alignas"]
        padlen = self.ir["params"]["padlen"]

        # Loop over quadrature rules
        for num_points in self.ir["all_num_points"]:
            varying_ir = self.ir["varying_irs"][num_points]

            points, weights = self.ir["quadrature_rules"][num_points]
            assert num_points == len(weights)
            assert num_points == points.shape[0]

            # Generate quadrature weights array
            if varying_ir["need_weights"]:
                wsym = self.backend.symbols.weights_table(num_points)
                parts += [L.ArrayDecl("static const double", wsym, num_points,
                                      weights, alignas=alignas)]

            # Generate quadrature points array
            N = product(points.shape)
            if varying_ir["need_points"] and N:
                # Flatten array: (TODO: avoid flattening here, it makes padding harder)
                flattened_points = points.reshape(N)
                psym = self.backend.symbols.points_table(num_points)
                parts += [L.ArrayDecl("static const double", psym, N,
                                      flattened_points, alignas=alignas)]

        # Add leading comment if there are any tables
        parts = L.commented_code_list(parts, "Quadrature rules")
        return parts

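    # Example of the emitted declarations for a 3-point rule in 2D
    # (illustrative names and values):
    #   static const double weights3[3] = { ... };
    #   static const double points3[6] = { ... };  // flattened [points][gdim]
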
    def generate_element_tables(self):
        """Generate static tables with precomputed element basis
        function values in quadrature points."""
        L = self.backend.language
        parts = []

        tables = self.ir["unique_tables"]
        table_types = self.ir["unique_table_types"]

        alignas = self.ir["params"]["alignas"]
        padlen = self.ir["params"]["padlen"]

        if self.ir["integral_type"] in custom_integral_types:
            # Define only piecewise tables
            table_names = [name for name in sorted(tables)
                           if table_types[name] in piecewise_ttypes]
        else:
            # Define all tables
            table_names = sorted(tables)

        # Check whether it is ok to unroll all assignments involving the tables
        # TODO: Update the condition to be compatible with mixed-dimensional problems;
        # inline_tables with integral_type == "cell" should be False in that case
        # self._inline_tables = self.ir["integral_type"] == "cell"
        self._inline_tables = False
        self._unroll_tables = True
        for name in table_names:
            table = tables[name]
            if table.size > self.max_unroll_table_size:
                self._inline_tables = False
                self._unroll_tables = False
                break

        for name in table_names:
            table = tables[name]

            # Don't pad preintegrated tables
            if name[0] == "P":
                p = 1
            else:
                p = padlen

            # Skip tables that are inlined in code generation
            if self._inline_tables and name[:2] == "PI":
                continue

            decl = L.ArrayDecl("static const double", name, table.shape, table,
                               alignas=alignas, padlen=p)
            parts += [decl]

        # Add leading comment if there are any tables
        parts = L.commented_code_list(parts, [
            "Precomputed values of basis functions and precomputations",
            "FE* dimensions: [entities][points][dofs]",
            "PI* dimensions: [entities][dofs][dofs] or [entities][dofs]",
            "PM* dimensions: [entities][dofs][dofs]",
        ])
        return parts

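    # A typical emitted table declaration (illustrative numbers):
    #   static const double FE0[1][3][4] = { ... };  // [entities][points][dofs]
    # Preintegrated tables (P*) get padlen=1 since they are indexed directly
    # by dof indices and must keep their exact shape.
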
    def generate_tensor_reset(self):
        "Generate statements for resetting the element tensor to zero."
        L = self.backend.language

        A = self.backend.symbols.element_tensor()
        A_shape = self.ir["tensor_shape"]
        A_size = product(A_shape)

        parts = [
            L.Comment("Reset element tensor"),
            L.MemZero(A, A_size),
        ]
        return parts

    def generate_quadrature_loop(self, num_points):
        "Generate quadrature loop for this num_points."
        L = self.backend.language

        # Generate unstructured varying partition
        body = self.generate_unstructured_varying_partition(num_points)
        body = L.commented_code_list(
            body, "Quadrature loop body setup (num_points={0})".format(num_points))

        # Generate dofblock parts, some of this
        # will be placed before or after quadloop
        preparts, quadparts, postparts = \
            self.generate_dofblock_partition(num_points)
        body += quadparts

        # Wrap body in loop or scope
        if not body:
            # Could happen for integral with everything zero and optimized away
            quadparts = []
        elif num_points == 1:
            # For now wrapping body in Scope to avoid thinking about scoping issues
            quadparts = L.commented_code_list(
                L.Scope(body), "Only 1 quadrature point, no loop")
        else:
            # Regular case: define quadrature loop
            iq = self.backend.symbols.quadrature_loop_index()
            quadparts = [L.ForRange(iq, 0, num_points, body=body)]

        return preparts, quadparts, postparts

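    # For num_points > 1 the emitted structure is (illustrative):
    #   for (int iq = 0; iq < num_points; ++iq)
    #   {
    #       <varying partition and block accumulation>
    #   }
    # while a single-point rule is emitted as a plain scope without a loop.
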
    def generate_runtime_quadrature_loop(self):
        "Generate quadrature loop for custom integrals, with physical points given at runtime."
        L = self.backend.language

        assert self.ir["integral_type"] in custom_integral_types

        num_points = self.ir["fake_num_points"]
        chunk_size = self.ir["params"]["chunk_size"]

        gdim = self.ir["geometric_dimension"]
        tdim = self.ir["topological_dimension"]

        alignas = self.ir["params"]["alignas"]
        #padlen = self.ir["params"]["padlen"]

        tables = self.ir["unique_tables"]
        table_types = self.ir["unique_table_types"]
        #table_origins = self.ir["unique_table_origins"]  # FIXME

        # Generate unstructured varying partition
        body = self.generate_unstructured_varying_partition(num_points)
        body = L.commented_code_list(body, [
            "Run-time quadrature loop body setup",
            "(chunk_size={0}, analysis_num_points={1})".format(chunk_size, num_points),
        ])

        # Generate dofblock parts, some of this
        # will be placed before or after quadloop
        preparts, quadparts, postparts = \
            self.generate_dofblock_partition(num_points)
        body += quadparts

        # Wrap body in loop
        if not body:
            # Could happen for integral with everything zero and optimized away
            quadparts = []
        else:
            rule_parts = []

            # Define two-level quadrature loop: over chunks, then over points in chunk
            iq_chunk = L.Symbol("iq_chunk")
            np = self.backend.symbols.num_custom_quadrature_points()
            num_point_blocks = (np + chunk_size - 1) / chunk_size
            iq = self.backend.symbols.quadrature_loop_index()

            # Not assuming the runtime size to be a multiple of the chunk size
            num_points_in_block = L.Symbol("num_points_in_chunk")
            decl = L.VariableDecl("const int", num_points_in_block,
                                  L.Call("min", (chunk_size, np - iq_chunk * chunk_size)))
            rule_parts.append(decl)

            iq_body = L.ForRange(iq, 0, num_points_in_block, body=body)

            ### Preparations for quadrature rules
            varying_ir = self.ir["varying_irs"][num_points]

            # Copy quadrature weights for this chunk
            if varying_ir["need_weights"]:
                cwsym = self.backend.symbols.custom_quadrature_weights()
                wsym = self.backend.symbols.custom_weights_table()
                rule_parts += [
                    L.ArrayDecl("double", wsym, chunk_size, 0, alignas=alignas),
                    L.ForRange(iq, 0, num_points_in_block,
                               body=L.Assign(wsym[iq], cwsym[chunk_size * iq_chunk + iq])),
                ]

            # Copy quadrature points for this chunk
            if varying_ir["need_points"]:
                cpsym = self.backend.symbols.custom_quadrature_points()
                psym = self.backend.symbols.custom_points_table()
                rule_parts += [
                    L.ArrayDecl("double", psym, chunk_size * gdim, 0, alignas=alignas),
                    L.ForRange(iq, 0, num_points_in_block,
                               body=[L.Assign(psym[iq * gdim + i],
                                              cpsym[chunk_size * iq_chunk * gdim + iq * gdim + i])
                                     for i in range(gdim)]),
                ]

            # Add leading comment if there are any tables
            rule_parts = L.commented_code_list(rule_parts, "Quadrature weights and points")

            ### Preparations for element tables
            table_parts = []

            # Only declare non-piecewise tables, computed inside chunk loop
            non_piecewise_tables = [name for name in sorted(tables)
                                    if table_types[name] not in piecewise_ttypes]
            for name in non_piecewise_tables:
                table = tables[name]
                decl = L.ArrayDecl("double", name, (1, chunk_size, table.shape[2]), 0,
                                   alignas=alignas)  # padlen=padlen
                table_parts += [decl]

            table_parts += [L.Comment("FIXME: Fill element tables here")]
            #table_origins

            ### Gather all in chunk loop
            chunk_body = rule_parts + table_parts + [iq_body]
            quadparts = [L.ForRange(iq_chunk, 0, num_point_blocks, body=chunk_body)]

        return preparts, quadparts, postparts

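    # Sketch of the emitted two-level structure (illustrative names):
    #   for (int iq_chunk = 0; iq_chunk < num_point_blocks; ++iq_chunk)
    #   {
    #       const int num_points_in_chunk = min(chunk_size, np - iq_chunk*chunk_size);
    #       <copy weights and points for this chunk>
    #       <fill element tables for this chunk (see FIXME above)>
    #       for (int iq = 0; iq < num_points_in_chunk; ++iq) { <body> }
    #   }
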
    def generate_unstructured_piecewise_partition(self):
        L = self.backend.language

        num_points = None
        expr_ir = self.ir["piecewise_ir"]

        name = "sp"
        arraysymbol = L.Symbol(name)
        parts = self.generate_partition(arraysymbol,
                                        expr_ir["V"],
                                        expr_ir["V_active"],
                                        expr_ir["V_mts"],
                                        expr_ir["mt_tabledata"],
                                        num_points)
        parts = L.commented_code_list(parts, "Unstructured piecewise computations")
        return parts

    def generate_unstructured_varying_partition(self, num_points):
        L = self.backend.language

        expr_ir = self.ir["varying_irs"][num_points]

        name = "sv"
        arraysymbol = L.Symbol("%s%d" % (name, num_points))
        parts = self.generate_partition(arraysymbol,
                                        expr_ir["V"],
                                        expr_ir["V_varying"],
                                        expr_ir["V_mts"],
                                        expr_ir["mt_tabledata"],
                                        num_points)
        parts = L.commented_code_list(
            parts, "Unstructured varying computations for num_points=%d" % (num_points,))
        return parts

    def generate_partition(self, symbol, V, V_active, V_mts, mt_tabledata, num_points):
        L = self.backend.language

        definitions = []
        intermediates = []

        active_indices = [i for i, p in enumerate(V_active) if p]

        for i in active_indices:
            v = V[i]
            mt = V_mts[i]

            if v._ufl_is_literal_:
                vaccess = self.backend.ufl_to_language(v)
            elif mt is not None:
                # All finite element based terminals have table data, as well
                # as some but not all of the symbolic geometric terminals
                tabledata = mt_tabledata.get(mt)

                # Backend specific modified terminal translation
                vaccess = self.backend.access(mt.terminal, mt, tabledata, num_points)
                vdef = self.backend.definitions(mt.terminal, mt, tabledata, num_points, vaccess)

                # Store definitions of terminals in list
                assert isinstance(vdef, list)
                definitions.extend(vdef)
            else:
                # Get previously visited operands
                vops = [self.get_var(num_points, op) for op in v.ufl_operands]

                # Mapping UFL operator to target language
                self._ufl_names.add(v._ufl_handler_name_)
                vexpr = self.backend.ufl_to_language(v, *vops)

                # TODO: Let optimized ir provide mapping of vertex indices to
                # variable indices, marking which subexpressions to store in
                # variables and in what order:
                #j = variable_id[i]

                # Currently instead creating a new intermediate for
                # each subexpression except boolean conditions
                if isinstance(v, Condition):
                    # Inline the conditions x < y; the condition values
                    # 'x' and 'y' may still be stored in intermediates.
                    # This removes the need to handle boolean intermediate variables.
                    # With tensor-valued conditionals it may not be optimal, but we
                    # let the C++ compiler take responsibility for optimizing those cases.
                    j = None
                elif any(op._ufl_is_literal_ for op in v.ufl_operands):
                    # Skip intermediates for e.g. -2.0*x,
                    # resulting in lines like z = y + -2.0*x
                    j = None
                else:
                    j = len(intermediates)

                if j is not None:
                    # Record assignment of vexpr to intermediate variable
                    if self.ir["params"]["use_symbol_array"]:
                        vaccess = symbol[j]
                        intermediates.append(L.Assign(vaccess, vexpr))
                    else:
                        vaccess = L.Symbol("%s_%d" % (symbol.name, j))
                        intermediates.append(L.VariableDecl("const double", vaccess, vexpr))
                else:
                    # Access the inlined expression
                    vaccess = vexpr

            # Store access node for future reference
            self.set_var(num_points, v, vaccess)

        # Join terminal computation, array of intermediate expressions,
        # and intermediate computations
        parts = []
        if definitions:
            parts += definitions
        if intermediates:
            if self.ir["params"]["use_symbol_array"]:
                alignas = self.ir["params"]["alignas"]
                parts += [L.ArrayDecl("double", symbol, len(intermediates),
                                      alignas=alignas)]
            parts += intermediates
        return parts

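    # With use_symbol_array enabled this emits code along the lines of
    # (illustrative subexpressions):
    #   double sv2[4];
    #   sv2[0] = sp[1] * w5;
    #   sv2[1] = sv2[0] + sp[3];
    # whereas without it each intermediate becomes "const double sv2_0 = ...;".
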
    def generate_dofblock_partition(self, num_points):
        L = self.backend.language

        if num_points is None:
            # NB! None means the piecewise partition, not custom integral
            block_contributions = self.ir["piecewise_ir"]["block_contributions"]
        else:
            block_contributions = self.ir["varying_irs"][num_points]["block_contributions"]

        preparts = []
        quadparts = []
        postparts = []

        blocks = [(blockmap, blockdata)
                  for blockmap, contributions in sorted(block_contributions.items())
                  for blockdata in contributions
                  if blockdata.block_mode != "preintegrated"]

        for blockmap, blockdata in blocks:
            # Get symbol for already defined block B if it exists
            common_block_data = get_common_block_data(blockdata)
            B = self.shared_blocks.get(common_block_data)
            if B is None:
                # Define code for block depending on mode
                B, block_preparts, block_quadparts, block_postparts = \
                    self.generate_block_parts(num_points, blockmap, blockdata)

                # Add definitions
                preparts.extend(block_preparts)

                # Add computations
                quadparts.extend(block_quadparts)

                # Add finalization
                postparts.extend(block_postparts)

                # Store reference for reuse
                self.shared_blocks[common_block_data] = B

            # Add A[blockmap] += B[...] to finalization
            self.finalization_blocks[blockmap].append(B)

        return preparts, quadparts, postparts

    def get_entities(self, blockdata):
        L = self.backend.language

        if self.ir["integral_type"] == "interior_facet":
            # Get the facet entities
            entities = []
            for r in blockdata.restrictions:
                if r is None:
                    entities.append(0)
                else:
                    entities.append(self.backend.symbols.entity(self.ir["entitytype"], r))
            if blockdata.transposed:
                return (entities[1], entities[0])
            else:
                return tuple(entities)
        else:
            # Get the current cell or facet entity
            if blockdata.is_uniform:
                # uniform, i.e. constant across facets
                entity = L.LiteralInt(0)
            else:
                entity = self.backend.symbols.entity(self.ir["entitytype"], None)
            return (entity,)

    def get_arg_factors(self, blockdata, block_rank, num_points, iq, indices):
        L = self.backend.language

        arg_factors = []
        for i in range(block_rank):
            mad = blockdata.ma_data[i]
            td = mad.tabledata
            if td.is_piecewise:
                scope = self.ir["piecewise_ir"]["modified_arguments"]
            else:
                scope = self.ir["varying_irs"][num_points]["modified_arguments"]
            mt = scope[mad.ma_index]

            # Translate modified terminal to code
            # TODO: Move element table access out of backend?
            # Not using self.backend.access.argument() here
            # now because it assumes too much about indices.
            table = self.backend.symbols.element_table(td, self.ir["entitytype"], mt.restriction)

            assert td.ttype != "zeros"

            if td.ttype == "ones":
                arg_factor = L.LiteralFloat(1.0)
            elif td.ttype == "quadrature":  # TODO: Revisit all quadrature ttype checks
                arg_factor = table[iq]
            else:
                # Assuming B sparsity follows element table sparsity
                arg_factor = table[indices[i]]
            arg_factors.append(arg_factor)
        return arg_factors

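    # For a rank-2 block this typically returns accesses like
    # (FE3[entity][iq][i], FE4[entity][iq][j]) (illustrative table names);
    # "ones" tables collapse to the literal 1.0 and "quadrature" tables are
    # indexed by iq alone.
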
    def generate_block_parts(self, num_points, blockmap, blockdata):
        """Generate and return code parts for a given block.

        Returns parts occurring before, inside, and after
        the quadrature loop identified by num_points.

        Should be called with num_points=None for quadloop-independent blocks.
        """
        L = self.backend.language

        # The parts to return
        preparts = []
        quadparts = []
        postparts = []

        # TODO: Define names in backend symbols?
        #tempnames = self.backend.symbols.block_temp_names
        #blocknames = self.backend.symbols.block_names
        tempnames = {
            #"preintegrated": "TI",
            "premultiplied": "TM",
            "partial": "TP",
            "full": "TF",
            "safe": "TS",
            "quadrature": "TQ",
        }
        blocknames = {
            #"preintegrated": "BI",
            #"premultiplied": "BM",
            #"partial": "BP",
            "full": "BF",
            "safe": "BS",
            "quadrature": "BQ",
        }

        fwtempname = "fw"
        tempname = tempnames.get(blockdata.block_mode)

        alignas = self.ir["params"]["alignas"]
        padlen = self.ir["params"]["padlen"]

        block_rank = len(blockmap)
        blockdims = tuple(len(dofmap) for dofmap in blockmap)
        padded_blockdims = pad_innermost_dim(blockdims, padlen)

        ttypes = blockdata.ttypes
        if "zeros" in ttypes:
            error("Not expecting zero arguments to be left in dofblock generation.")

        if num_points is None:
            iq = None
        elif num_points == 1:
            iq = 0
        else:
            iq = self.backend.symbols.quadrature_loop_index()

        # Override dof index with quadrature loop index for arguments with
        # quadrature element, to index B like B[iq*num_dofs + iq]
        arg_indices = tuple(self.backend.symbols.argument_loop_index(i)
                            for i in range(block_rank))
        B_indices = []
        for i in range(block_rank):
            if ttypes[i] == "quadrature":
                B_indices.append(iq)
            else:
                B_indices.append(arg_indices[i])
        B_indices = tuple(B_indices)

        # Define unique block symbol
        blockname = blocknames.get(blockdata.block_mode)
        if blockname:
            B = self.new_temp_symbol(blockname)
            # Add initialization of this block to parts.
            # For all modes, block definition occurs before quadloop.
            preparts.append(L.ArrayDecl("double", B, blockdims, 0,
                                        alignas=alignas, padlen=padlen))

        # Get factor expression
        if blockdata.factor_is_piecewise:
            v = self.ir["piecewise_ir"]["V"][blockdata.factor_index]
        else:
            v = self.ir["varying_irs"][num_points]["V"][blockdata.factor_index]
        f = self.get_var(num_points, v)

        # Quadrature weight was removed in representation, add it back now
        if num_points is None:
            weight = L.LiteralFloat(1.0)
        elif self.ir["integral_type"] in custom_integral_types:
            weights = self.backend.symbols.custom_weights_table()
            weight = weights[iq]
        else:
            weights = self.backend.symbols.weights_table(num_points)
            weight = weights[iq]

        # Define fw = f * weight
        if blockdata.block_mode in ("safe", "full", "partial"):
            assert not blockdata.transposed, "Not handled yet"

            # Fetch code to access modified arguments
            arg_factors = self.get_arg_factors(blockdata, block_rank, num_points, iq, B_indices)

            fw_rhs = L.float_product([f, weight])
            if not isinstance(fw_rhs, L.Product):
                fw = fw_rhs
            else:
                # Define and cache scalar temp variable
                key = (num_points, blockdata.factor_index, blockdata.factor_is_piecewise)
                fw, defined = self.get_temp_symbol(fwtempname, key)
                if not defined:
                    quadparts.append(L.VariableDecl("const double", fw, fw_rhs))

            # Plan for vectorization of fw computations over iq:
            # 1) Define fw as arrays e.g. "double fw0[nq];" outside quadloop
            # 2) Access as fw0[iq] of course
            # 3) Split quadrature loops, one for fw computation and one for blocks
            # 4) Pad quadrature rule with 0 weights and last point

            # Plan for vectorization of coefficient evaluation over iq:
            # 1) Define w0_c1 etc as arrays e.g. "double w0_c1[nq] = {};" outside quadloop
            # 2) Access as w0_c1[iq] of course
            # 3) Split quadrature loops, coefficients before fw computation
            # 4) Possibly swap loops over iq and ic:
            #    for (ic) for (iq) w0_c1[iq] = w[0][ic] * FE[iq][ic];

        if blockdata.block_mode == "safe":
            # Naively accumulate integrand for this block in the innermost loop
            assert not blockdata.transposed
            B_rhs = L.float_product([fw] + arg_factors)
            body = L.AssignAdd(B[B_indices], B_rhs)  # NB! += not =
            for i in reversed(range(block_rank)):
                body = L.ForRange(B_indices[i], 0, padded_blockdims[i], body=body)
            quadparts += [body]

            # Define rhs expression for A[blockmap[arg_indices]] += A_rhs
            A_rhs = B[arg_indices]

        elif blockdata.block_mode == "full":
            assert not blockdata.transposed, "Not handled yet"

            if block_rank < 2:
                # Multiply collected factors
                B_rhs = L.float_product([fw] + arg_factors)
            else:
                # TODO: Pick arg with smallest dimension, or pick
                # based on global optimization to reuse more blocks
                i = 0  # Index selected for precomputation
                j = 1 - i

                P_index = B_indices[i]

                key = (num_points, blockdata.factor_index, blockdata.factor_is_piecewise,
                       arg_factors[i].ce_format(self.precision))
                P, defined = self.get_temp_symbol(tempname, key)
                if not defined:
                    # TODO: If FE table is varying and only used in contexts
                    # where it's multiplied by weight, we can premultiply it!
                    # Then this would become P = f * preweighted_FE_table[:].

                    # Define and compute intermediate value
                    # P[:] = (weight * f) * args[i][:] inside quadrature loop
                    P_dim = blockdims[i]
                    quadparts.append(L.ArrayDecl("double", P, P_dim, None,
                                                 alignas=alignas, padlen=padlen))
                    P_rhs = L.float_product([fw, arg_factors[i]])
                    body = L.Assign(P[P_index], P_rhs)
                    #if ttypes[i] != "quadrature":  # FIXME: What does this mean here?
                    vectorize = self.ir["params"]["vectorize"]
                    body = L.ForRange(P_index, 0, P_dim, body=body, vectorize=vectorize)
                    quadparts.append(body)

                B_rhs = P[P_index] * arg_factors[j]

            # Add result to block inside quadloop
            body = L.AssignAdd(B[B_indices], B_rhs)  # NB! += not =
            for i in reversed(range(block_rank)):
                # Vectorize only the innermost loop
                vectorize = self.ir["params"]["vectorize"] and (i == block_rank - 1)
                if ttypes[i] != "quadrature":
                    body = L.ForRange(B_indices[i], 0, padded_blockdims[i],
                                      body=body, vectorize=vectorize)
            quadparts += [body]

            # Define rhs expression for A[blockmap[arg_indices]] += A_rhs
            A_rhs = B[arg_indices]

        elif blockdata.block_mode == "partial":
            # TODO: To handle transpose here, must add back intermediate block B
            assert not blockdata.transposed, "Not handled yet"
            assert block_rank == 2

            # Get indices and dimensions right here...
            i = blockdata.piecewise_ma_index
            not_piecewise_index = 1 - i

            P_index = arg_indices[not_piecewise_index]

            key = (num_points, blockdata.factor_index, blockdata.factor_is_piecewise,
                   arg_factors[not_piecewise_index].ce_format(self.precision))
            P, defined = self.get_temp_symbol(tempname, key)
            if not defined:
                # Declare P table in preparts
                P_dim = blockdims[not_piecewise_index]
                preparts.append(L.ArrayDecl("double", P, P_dim, 0,
                                            alignas=alignas, padlen=padlen))

                # Multiply collected factors
                P_rhs = L.float_product([fw, arg_factors[not_piecewise_index]])

                # Accumulate P += weight * f * args in quadrature loop
                body = L.AssignAdd(P[P_index], P_rhs)
                body = L.ForRange(P_index, 0, pad_dim(P_dim, padlen), body=body)
                quadparts.append(body)

            # Define B = B_rhs = piecewise_argument[:] * P[:],
            # where P[:] = sum_q weight * f * other_argument[:]
            B_rhs = arg_factors[i] * P[P_index]

            # Define rhs expression for A[blockmap[arg_indices]] += A_rhs
            A_rhs = B_rhs

        elif blockdata.block_mode in ("premultiplied", "preintegrated"):
            P_entity_indices = self.get_entities(blockdata)
            if blockdata.transposed:
                P_block_indices = (arg_indices[1], arg_indices[0])
            else:
                P_block_indices = arg_indices
            P_ii = P_entity_indices + P_block_indices

            if blockdata.block_mode == "preintegrated":
                # Preintegrated should never get into quadloops
                assert num_points is None

                # Define B = B_rhs = f * PI where PI = sum_q weight * u * v
                PI = L.Symbol(blockdata.name)[P_ii]
                B_rhs = L.float_product([f, PI])
            elif blockdata.block_mode == "premultiplied":
                key = (num_points, blockdata.factor_index, blockdata.factor_is_piecewise)
                FI, defined = self.get_temp_symbol(tempname, key)
                if not defined:
                    # Declare FI = 0 before quadloop
                    preparts += [L.VariableDecl("double", FI, 0)]
                    # Accumulate FI += weight * f in quadparts
                    quadparts += [L.AssignAdd(FI, L.float_product([weight, f]))]

                # Define B_rhs = FI * PM where FI = sum_q weight*f, and PM = u * v
                PM = L.Symbol(blockdata.name)[P_ii]
                B_rhs = L.float_product([FI, PM])

            # Define rhs expression for A[blockmap[arg_indices]] += A_rhs
            A_rhs = B_rhs

        return A_rhs, preparts, quadparts, postparts

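    # Sketch of the emitted code for a rank-2 "safe" mode block (illustrative
    # names); "full" mode additionally hoists P[i] = fw0 * FE0[iq][i] out of
    # the j loop:
    #   double BS0[3][4] = {};            // preparts
    #   for (int iq = 0; iq < nq; ++iq)   // quadparts, inside the quadloop
    #   {
    #       const double fw0 = sv2[5] * weights3[iq];
    #       for (int i = 0; i < 3; ++i)
    #           for (int j = 0; j < 4; ++j)
    #               BS0[i][j] += fw0 * FE0[iq][i] * FE1[iq][j];
    #   }
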
    def generate_preintegrated_dofblock_partition(self):
        # FIXME: Generalize this to unrolling all A[] += ... loops,
        # or all loops with noncontiguous DM??
        L = self.backend.language

        block_contributions = self.ir["piecewise_ir"]["block_contributions"]

        blocks = [(blockmap, blockdata)
                  for blockmap, contributions in sorted(block_contributions.items())
                  for blockdata in contributions
                  if blockdata.block_mode == "preintegrated"]

        # Get symbol, dimensions, and loop index symbols for A
        A = self.backend.symbols.element_tensor()
        A_shape = self.ir["tensor_shape"]
        A_size = product(A_shape)
        A_strides = shape_to_strides(A_shape)

        # List for unrolled assignments
        A_values = [0.0] * A_size

        # List of statements when not unrolling
        parts = []

        for block_id, (blockmap, blockdata) in enumerate(blocks):
            # Accumulate A[blockmap[...]] += f*PI[...]

            # Get table for inlining
            tables = self.ir["unique_tables"]
            table = tables[blockdata.name]

            # TODO: Update the condition to be compatible with mixed-dimensional problems;
            # inline_tables with integral_type == "cell" should be False in that case
            # self._inline_tables = self.ir["integral_type"] == "cell"
            self._inline_tables = False

            # Get factor expression
            v = self.ir["piecewise_ir"]["V"][blockdata.factor_index]
            f = self.get_var(None, v)

            # Define rhs expression for A[blockmap[arg_indices]] += A_rhs
            # A_rhs = f * PI where PI = sum_q weight * u * v
            PI = L.Symbol(blockdata.name)

            block_rank = len(blockmap)

            # Indices used to index A
            A_index_symbols = tuple(self.backend.symbols.argument_loop_index(i)
                                    for i in range(block_rank))
            # Indices used to index PI
            P_index_symbols = tuple(self.backend.symbols.argument_loop_index(i)
                                    for i in range(block_rank, 2 * block_rank))

            # Define indices into preintegrated block
            P_entity_indices = self.get_entities(blockdata)
            if self._inline_tables:
                assert P_entity_indices == (L.LiteralInt(0),)
                assert table.shape[0] == 1

            # Unroll loop
            blockshape = [len(DM) for DM in blockmap]
            blockrange = [range(d) for d in blockshape]

            if self._unroll_tables:
                # Generate unrolled assignments for the current block
                for ii in itertools.product(*blockrange):
                    A_ii = sum(A_strides[i] * blockmap[i][ii[i]]
                               for i in range(len(ii)))
                    if blockdata.transposed:
                        P_arg_indices = (ii[1], ii[0])
                    else:
                        P_arg_indices = ii

                    if self._inline_tables:
                        # Extract float value of PI[P_ii]
                        Pval = table[0]  # always entity 0
                        for i in P_arg_indices:
                            Pval = Pval[i]
                        A_rhs = Pval * f
                    else:
                        # Index the static preintegrated table:
                        P_ii = P_entity_indices + P_arg_indices
                        A_rhs = f * PI[P_ii]

                    A_values[A_ii] = A_values[A_ii] + A_rhs
            else:
                # Don't unroll, generate assignment loops
                # TODO: Allow mixed unrolled/non-unrolled assignments?

                # Make sure that we can use PI as a static array
                assert not self._inline_tables

                # Accumulate index expression 'A_idx' used as 'A[A_idx]'
                # in the loop, e.g. A_idx = i*3 + j
                A_idx = sum(A_strides[i] * index_symbol
                            for i, index_symbol in enumerate(A_index_symbols))

                # Get the index tuple used to index the pre-integrated table
                P_arg_indices = P_index_symbols
                # Transpose the PI indices, if block is transposed
                if blockdata.transposed:
                    P_arg_indices = tuple(reversed(P_arg_indices))

                # Initialize the central assignment statement
                P_ii = P_entity_indices + P_arg_indices
                A_rhs = f * PI[P_ii]
                loop_code = L.AssignAdd(A[A_idx], A_rhs)

                # Construct nested loops, starting with the innermost dimension
                for sub_blockmap, A_arg_index, P_arg_index in zip(reversed(blockmap),
                                                                  reversed(A_index_symbols),
                                                                  reversed(P_index_symbols)):
                    # Append increment of PI index to inner code
                    loop_code = [loop_code, L.PreIncrement(P_arg_index)]

                    # Check whether the current block map is a contiguous range
                    if (sub_blockmap[-1] - sub_blockmap[0] + 1) == len(sub_blockmap):
                        # We have contiguous indices in the blockmap, a single loop is enough
                        loop_code = [L.ForRange(A_arg_index, sub_blockmap[0],
                                                sub_blockmap[-1] + 1, loop_code)]
                    else:
                        # Split blockmap into contiguous sub-ranges -> multiple loops
                        loop_block = []

                        # The groupby statement yields these contiguous sub-ranges
                        for k, group in itertools.groupby(enumerate(sub_blockmap),
                                                          key=lambda x: x[0] - x[1]):
                            contiguous_range = list(group)
                            loop_interval = (contiguous_range[0][1],
                                             contiguous_range[-1][1] + 1)

                            # Check whether we really need a loop
                            if loop_interval[1] - loop_interval[0] > 1:
                                loop_block += [L.ForRange(A_arg_index, loop_interval[0],
                                                          loop_interval[1], loop_code)]
                            else:
                                # TODO: Remove scope by explicitly replacing Symbol A_arg_index
                                # with Literal loop_interval[0] in all subexpressions
                                loop_block += [L.Scope([
                                    L.VariableDecl("int", A_arg_index, loop_interval[0]),
                                    loop_code])]

                        loop_code = loop_block

                    # Reset the currently innermost PI table index
                    loop_code += [L.Assign(P_arg_index, 0)]

                # Define the P index variables before the loop code
                loop_code = [L.VariableDecl("int", P_arg_index, 0)
                             for P_arg_index in P_arg_indices] + loop_code

                # Add a scope to keep index variables enclosed
                parts += [L.Scope(loop_code)]

        if self._unroll_tables:
            parts = self.generate_tensor_value_initialization(A_values)
        else:
            # If we have assignment loops, zero all entries of A first
            parts = [L.MemZero(A, A_size)] + parts

        return parts

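    # With unrolling the result is a flat value list handed to
    # generate_tensor_value_initialization, conceptually (illustrative):
    #   A[0] = sp[3] * PI4[0][0][0]; A[1] = sp[3] * PI4[0][0][1]; ...
    # Without unrolling, A is zeroed and nested loops accumulate
    #   A[i*n + j] += f * PI4[entity][k][l];
    # with ++k/++l tracking the position inside the preintegrated table.
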
    def generate_tensor_value_initialization(self, A_values):
        parts = []

        L = self.backend.language
        A = self.backend.symbols.element_tensor()
        A_size = len(A_values)

        init_mode = self.ir["params"]["tensor_init_mode"]
        z = L.LiteralFloat(0.0)

        if init_mode == "direct":
            # Generate A[i] = A_values[i] including zeros
            for i in range(A_size):
                parts += [L.Assign(A[i], A_values[i])]
        elif init_mode == "upfront":
            # Zero everything first
            parts += [L.MemZero(A, A_size)]

            # Generate A[i] = A_values[i] skipping zeros
            for i in range(A_size):
                if not (A_values[i] == 0.0 or A_values[i] == z):
                    parts += [L.Assign(A[i], A_values[i])]
        elif init_mode == "interleaved":
            # Generate A[i] = A_values[i] with interleaved zero filling
            i = 0
            zero_begin = 0
            zero_end = zero_begin
            while i < A_size:
                if A_values[i] == 0.0 or A_values[i] == z:
                    # Update range of A zeros
                    zero_end = i + 1
                else:
                    # Set zeros of A just prior to A[i]
                    if zero_end == zero_begin + 1:
                        parts += [L.Assign(A[zero_begin], 0.0)]
                    elif zero_end > zero_begin:
                        parts += [L.MemZeroRange(A, zero_begin, zero_end)]
                    zero_begin = i + 1
                    zero_end = zero_begin
                    # Set A[i] value
                    parts += [L.Assign(A[i], A_values[i])]
                i += 1
            if zero_end == zero_begin + 1:
                parts += [L.Assign(A[zero_begin], 0.0)]
            elif zero_end > zero_begin:
                parts += [L.MemZeroRange(A, zero_begin, zero_end)]
        else:
            error("Invalid init_mode parameter %s" % (init_mode,))

        return parts

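    # Example of "interleaved" output for A_values = [0, 0, 2.5, 0, 1.0],
    # assuming MemZeroRange renders as std::fill (illustrative):
    #   std::fill(A, A + 2, 0.0);  // run of leading zeros
    #   A[2] = 2.5;
    #   A[3] = 0.0;                // single zero emitted as plain assignment
    #   A[4] = 1.0;
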
    def generate_expr_copyout_statements(self):
        L = self.backend.language
        parts = []

        # Not expecting any quadrature loop scopes here
        assert tuple(self.scopes.keys()) == (None,)

        # TODO: Get symbol from backend
        values = L.Symbol("values")

        # TODO: Allow expression compilation to compute multiple points at once!
        # Similar to custom integrals in that points are given,
        # but different in output format: results are not accumulated
        # for each point but stored in the output array instead.

        # Assign computed results to output variables
        pir = self.ir["piecewise_ir"]
        V = pir["V"]
        V_targets = pir["V_targets"]
        for i, fi in enumerate(V_targets):
            parts.append(L.Assign(values[i], self.get_var(None, V[fi])))
        return parts

    def generate_tensor_copyout_statements(self):
        L = self.backend.language
        parts = []

        # Get symbol, dimensions, and loop index symbols for A
        A_shape = self.ir["tensor_shape"]
        A_size = product(A_shape)
        A_rank = len(A_shape)

        # TODO: use shape_to_strides(A_shape) from ufl instead
        A_strides = [1] * A_rank
        for i in reversed(range(0, A_rank - 1)):
            A_strides[i] = A_strides[i + 1] * A_shape[i + 1]

        Asym = self.backend.symbols.element_tensor()
        A = L.FlattenedArray(Asym, dims=A_shape)

        indices = [self.backend.symbols.argument_loop_index(i)
                   for i in range(A_rank)]

        dofmap_parts = []
        dofmaps = {}
        for blockmap, contributions in sorted(self.finalization_blocks.items()):
            # Define mapping from B indices to A indices
            A_indices = []
            for i in range(A_rank):
                dofmap = blockmap[i]
                begin = dofmap[0]
                end = dofmap[-1] + 1
                if len(dofmap) == end - begin:
                    # Dense insertion, offset B index to index A
                    j = indices[i] + begin
                else:
                    # Sparse insertion, map B index through dofmap
                    DM = dofmaps.get(dofmap)
                    if DM is None:
                        DM = L.Symbol("DM%d" % len(dofmaps))
                        dofmaps[dofmap] = DM
                        dofmap_parts.append(L.ArrayDecl("static const int", DM,
                                                        len(dofmap), dofmap))
                    j = DM[indices[i]]
                A_indices.append(j)
            A_indices = tuple(A_indices)

            # Sum up all blocks contributing to this blockmap
            term = L.Sum([B_rhs for B_rhs in contributions])

            # TODO: need ttypes associated with this block to deal
            # with loop dropping for quadrature elements:
            ttypes = ()
            if ttypes == ("quadrature", "quadrature"):
                debug("quadrature element block insertion not optimized")

            # Add components of all B's to A component in loop nest
            body = L.AssignAdd(A[A_indices], term)
            for i in reversed(range(A_rank)):
                body = L.ForRange(indices[i], 0, len(blockmap[i]), body=body)

            # Add this block to parts
            parts.append(body)

        # Place static dofmap tables first
        parts = dofmap_parts + parts
        return parts

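    # Emitted copyout (illustrative; A is addressed through its flattened
    # strides): dense blockmaps use a plain offset index, sparse ones go
    # through a static dofmap table:
    #   static const int DM0[3] = { 0, 2, 5 };
    #   for (int i = 0; i < 3; ++i)
    #       for (int j = 0; j < 4; ++j)
    #           A[DM0[i]*ncols + (j + begin)] += BF0[i][j];
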
    def generate_copyout_statements(self):
        """Generate statements copying results to output array."""
        if self.ir["integral_type"] == "expression":
            return self.generate_expr_copyout_statements()
        #elif self.ir["unroll_copyout_loops"]:
        #    return self.generate_unrolled_tensor_copyout_statements()
        else:
            return self.generate_tensor_copyout_statements()