
# -*- coding: utf-8 -*-
# Copyright (C) 2011-2017 Martin Sandve Alnæs
#
# This file is part of UFLACS.
#
# UFLACS is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# UFLACS is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with UFLACS. If not, see <http://www.gnu.org/licenses/>

"""Controlling algorithm for building the tabulate_tensor source structure from factorized representation."""

from collections import defaultdict
import itertools

from ufl import product
from ufl.classes import ConstantValue, Condition
from ufl.measure import custom_integral_types, point_integral_types
from ufl.utils.indexflattening import shape_to_strides

from ffc.log import error, warning, debug

from ffc.uflacs.build_uflacs_ir import get_common_block_data
from ffc.uflacs.elementtables import piecewise_ttypes
from ffc.uflacs.language.cnodes import pad_innermost_dim, pad_dim, MemZero, MemZeroRange


class IntegralGenerator(object):

    def __init__(self, ir, backend, precision):
        # Store ir
        self.ir = ir

        # Formatting precision
        self.precision = precision

        # Backend specific plugin with attributes
        # - language: for translating ufl operators to target language
        # - symbols: for naming variables and accessing backend specific quantities
        # - definitions: for defining backend specific variables
        # - access: for accessing backend specific variables
        self.backend = backend

        # Set of operator names code has been generated for,
        # used in the end for selecting necessary includes
        self._ufl_names = set()

        # Initialize lookup tables for variable scopes
        self.init_scopes()

        # Cache of reusable blocks contributing to A
        self.shared_blocks = {}

        # Block contributions collected during generation to be added to A at the end
        self.finalization_blocks = defaultdict(list)

        # Set of counters used for assigning names to intermediate variables
        # TODO: Should this be part of the backend symbols? Doesn't really matter now.
        self.symbol_counters = defaultdict(int)

        # Whether preintegration tables should be inlined, or added as static arrays
        self._inline_tables = None

        # Whether to unroll the tabulate_tensor assignments
        self._unroll_tables = None

        # The maximum size of any preintegration table in order to perform unrolling
        self.max_unroll_table_size = 1024

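    # Illustrative driver sketch (hypothetical variable names; the real caller
    # lives elsewhere in ffc.uflacs):
    #   ig = IntegralGenerator(ir, backend, precision)
    #   code = ig.generate()            # CNodes AST for the tabulate_tensor body
    #   includes = ig.get_includes()    # headers required by the generated code
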
    def get_includes(self):
        "Return list of include statements needed to support generated code."
        includes = set()

        # Get std::fill used by MemZero
        includes.add("#include <algorithm>")

        # For controlling floating point environment
        #includes.add("#include <cfenv>")

        # For intel intrinsics and controlling floating point environment
        #includes.add("#include <xmmintrin.h>")

        cmath_names = set((
            "abs", "sign", "pow", "sqrt", "exp", "ln",
            "cos", "sin", "tan", "acos", "asin", "atan", "atan_2",
            "cosh", "sinh", "tanh", "acosh", "asinh", "atanh",
            "erf", "erfc",
        ))

        boost_math_names = set((
            "bessel_j", "bessel_y", "bessel_i", "bessel_k",
        ))

        # Only return the necessary headers
        if cmath_names & self._ufl_names:
            includes.add("#include <cmath>")

        if boost_math_names & self._ufl_names:
            includes.add("#include <boost/math/special_functions.hpp>")

        return sorted(includes)

    def init_scopes(self):
        "Initialize variable scope dicts."
        # Reset variables, with a separate scope for each quadrature loop
        self.scopes = {num_points: {} for num_points in self.ir["all_num_points"]}
        self.scopes[None] = {}

    def set_var(self, num_points, v, vaccess):
        """Set a new variable in variable scope dicts.

        Scope is determined by num_points which identifies the
        quadrature loop scope, or None if outside quadrature loops.

        v is the ufl expression and vaccess is the CNodes
        expression to access the value in the code.
        """
        self.scopes[num_points][v] = vaccess

    def has_var(self, num_points, v):
        """Check if a variable exists in the variable scope dicts.

        Return True if the ufl expression v exists in the num_points scope.

        NB! Does not fall back to the piecewise scope.
        """
        return v in self.scopes[num_points]

    def get_var(self, num_points, v):
        """Lookup ufl expression v in the variable scope dicts.

        Scope is determined by num_points which identifies the
        quadrature loop scope, or None if outside quadrature loops.

        If v is not found in the quadrature loop scope, the piecewise
        scope (None) is checked.

        Returns the CNodes expression to access the value in the code.
        """
        if v._ufl_is_literal_:
            return self.backend.ufl_to_language(v)
        f = self.scopes[num_points].get(v)
        if f is None:
            f = self.scopes[None][v]
        return f

    def new_temp_symbol(self, basename):
        "Create a new code symbol named basename + running counter."
        L = self.backend.language
        name = "%s%d" % (basename, self.symbol_counters[basename])
        self.symbol_counters[basename] += 1
        return L.Symbol(name)

    def get_temp_symbol(self, tempname, key):
        key = (tempname,) + key
        s = self.shared_blocks.get(key)
        defined = s is not None
        if not defined:
            s = self.new_temp_symbol(tempname)
            self.shared_blocks[key] = s
        return s, defined

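    # Usage sketch (hypothetical key contents): the first call for a given key
    # creates the symbol, later calls reuse it so the declaration is emitted
    # only once:
    #   fw, defined = self.get_temp_symbol("fw", (num_points, factor_index))
    #   if not defined:
    #       quadparts.append(L.VariableDecl("const double", fw, fw_rhs))
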
    def generate(self):
        """Generate entire tabulate_tensor body.

        Assumes that the code returned from here will be wrapped in a
        context that matches a suitable version of the UFC
        tabulate_tensor signatures.
        """
        L = self.backend.language

        # Assert that scopes are empty: expecting this to be called only once
        assert not any(d for d in self.scopes.values())

        parts = []

        # TODO: Is this needed? Find a test case to check.
        parts += [
            #L.VerbatimStatement("_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);"),
            #L.VerbatimStatement("std::fesetenv(FE_DFL_DISABLE_SSE_DENORMS_ENV);"),
        ]

        # Generate the tables of quadrature points and weights
        parts += self.generate_quadrature_tables()

        # Generate the tables of basis function values and preintegrated blocks
        parts += self.generate_element_tables()

        # Generate code to compute piecewise constant scalar factors
        parts += self.generate_unstructured_piecewise_partition()

        # Loop generation code will produce parts to go before quadloops,
        # to define the quadloops, and to go after the quadloops
        all_preparts = []
        all_quadparts = []
        all_postparts = []

        # Go through each relevant quadrature loop
        if self.ir["integral_type"] in custom_integral_types:
            preparts, quadparts, postparts = \
                self.generate_runtime_quadrature_loop()
            all_preparts += preparts
            all_quadparts += quadparts
            all_postparts += postparts
        else:
            for num_points in self.ir["all_num_points"]:
                # Generate code to integrate reusable blocks of final element tensor
                preparts, quadparts, postparts = \
                    self.generate_quadrature_loop(num_points)
                all_preparts += preparts
                all_quadparts += quadparts
                all_postparts += postparts

        # Generate code to finish computing reusable blocks outside quadloop
        preparts, quadparts, postparts = \
            self.generate_dofblock_partition(None)
        all_preparts += preparts
        all_quadparts += quadparts
        all_postparts += postparts

        # Generate code to fill in A
        all_finalizeparts = []

        # Generate code to set A = 0
        #all_finalizeparts += self.generate_tensor_reset()

        # Generate code to compute piecewise constant scalar factors
        # and set A at corresponding nonzero components
        all_finalizeparts += self.generate_preintegrated_dofblock_partition()

        # Generate code to add reusable blocks B* to element tensor A
        all_finalizeparts += self.generate_copyout_statements()

        # Collect parts before, during, and after quadrature loops
        parts += all_preparts
        parts += all_quadparts
        parts += all_postparts
        parts += all_finalizeparts

        return L.StatementList(parts)

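    # Rough shape of the body assembled by generate() for a plain cell
    # integral (illustrative only; actual symbol names come from the backend):
    #
    #   <static const double weights*/points*/FE* tables>
    #   <piecewise partition: sp[...] intermediates>
    #   <preparts: block declarations, e.g. double BF0[3][4] = {};>
    #   for (int iq = 0; iq < num_points; ++iq)
    #       <varying partition and block accumulation (quadparts)>
    #   <postparts and finalization: A[...] += BF0[...], etc.>
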
    def generate_quadrature_tables(self):
        "Generate static tables of quadrature points and weights."
        L = self.backend.language

        parts = []

        # No quadrature tables for custom (given argument)
        # or point (evaluation in single vertex)
        skip = custom_integral_types + point_integral_types
        if self.ir["integral_type"] in skip:
            return parts

        alignas = self.ir["params"]["alignas"]
        padlen = self.ir["params"]["padlen"]

        # Loop over quadrature rules
        for num_points in self.ir["all_num_points"]:
            varying_ir = self.ir["varying_irs"][num_points]

            points, weights = self.ir["quadrature_rules"][num_points]
            assert num_points == len(weights)
            assert num_points == points.shape[0]

            # Generate quadrature weights array
            if varying_ir["need_weights"]:
                wsym = self.backend.symbols.weights_table(num_points)
                parts += [L.ArrayDecl("static const double", wsym, num_points,
                                      weights, alignas=alignas)]

            # Generate quadrature points array
            N = product(points.shape)
            if varying_ir["need_points"] and N:
                # Flatten array: (TODO: avoid flattening here, it makes padding harder)
                flattened_points = points.reshape(N)
                psym = self.backend.symbols.points_table(num_points)
                parts += [L.ArrayDecl("static const double", psym, N,
                                      flattened_points, alignas=alignas)]

        # Add leading comment if there are any tables
        parts = L.commented_code_list(parts, "Quadrature rules")
        return parts

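    # Example of the emitted declarations for a 3-point rule in 2D
    # (illustrative names and values):
    #   static const double weights3[3] = { ... };
    #   static const double points3[6] = { ... };  // flattened [points][gdim]
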
    def generate_element_tables(self):
        """Generate static tables with precomputed element basis
        function values in quadrature points."""
        L = self.backend.language
        parts = []

        tables = self.ir["unique_tables"]
        table_types = self.ir["unique_table_types"]

        alignas = self.ir["params"]["alignas"]
        padlen = self.ir["params"]["padlen"]

        if self.ir["integral_type"] in custom_integral_types:
            # Define only piecewise tables
            table_names = [name for name in sorted(tables)
                           if table_types[name] in piecewise_ttypes]
        else:
            # Define all tables
            table_names = sorted(tables)

        # Check whether it is ok to unroll all assignments involving the tables
        # TODO: Update the condition to be compatible with mixed-dimensional problems;
        # inline_tables with integral_type == "cell" should be False in that case
        # self._inline_tables = self.ir["integral_type"] == "cell"
        self._inline_tables = False
        self._unroll_tables = True
        for name in table_names:
            table = tables[name]
            if table.size > self.max_unroll_table_size:
                self._inline_tables = False
                self._unroll_tables = False
                break

        for name in table_names:
            table = tables[name]

            # Don't pad preintegrated tables
            if name[0] == "P":
                p = 1
            else:
                p = padlen

            # Skip tables that are inlined in code generation
            if self._inline_tables and name[:2] == "PI":
                continue

            decl = L.ArrayDecl("static const double", name, table.shape, table,
                               alignas=alignas, padlen=p)
            parts += [decl]

        # Add leading comment if there are any tables
        parts = L.commented_code_list(parts, [
            "Precomputed values of basis functions and precomputations",
            "FE* dimensions: [entities][points][dofs]",
            "PI* dimensions: [entities][dofs][dofs] or [entities][dofs]",
            "PM* dimensions: [entities][dofs][dofs]",
        ])
        return parts

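    # A typical emitted table declaration (illustrative numbers):
    #   static const double FE0[1][3][4] = { ... };  // [entities][points][dofs]
    # Preintegrated tables (P*) get padlen=1 since they are indexed directly
    # by dof indices and must keep their exact shape.
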
    def generate_tensor_reset(self):
        "Generate statements for resetting the element tensor to zero."
        L = self.backend.language

        A = self.backend.symbols.element_tensor()
        A_shape = self.ir["tensor_shape"]
        A_size = product(A_shape)

        parts = [
            L.Comment("Reset element tensor"),
            L.MemZero(A, A_size),
        ]
        return parts

    def generate_quadrature_loop(self, num_points):
        "Generate quadrature loop for this num_points."
        L = self.backend.language

        # Generate unstructured varying partition
        body = self.generate_unstructured_varying_partition(num_points)
        body = L.commented_code_list(
            body, "Quadrature loop body setup (num_points={0})".format(num_points))

        # Generate dofblock parts, some of this
        # will be placed before or after quadloop
        preparts, quadparts, postparts = \
            self.generate_dofblock_partition(num_points)
        body += quadparts

        # Wrap body in loop or scope
        if not body:
            # Could happen for integral with everything zero and optimized away
            quadparts = []
        elif num_points == 1:
            # For now wrapping body in Scope to avoid thinking about scoping issues
            quadparts = L.commented_code_list(
                L.Scope(body), "Only 1 quadrature point, no loop")
        else:
            # Regular case: define quadrature loop
            iq = self.backend.symbols.quadrature_loop_index()
            quadparts = [L.ForRange(iq, 0, num_points, body=body)]

        return preparts, quadparts, postparts

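    # For num_points > 1 the emitted structure is (illustrative):
    #   for (int iq = 0; iq < num_points; ++iq)
    #   {
    #       <varying partition and block accumulation>
    #   }
    # while a single-point rule is emitted as a plain scope without a loop.
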
    def generate_runtime_quadrature_loop(self):
        "Generate quadrature loop for custom integrals, with physical points given at runtime."
        L = self.backend.language

        assert self.ir["integral_type"] in custom_integral_types

        num_points = self.ir["fake_num_points"]
        chunk_size = self.ir["params"]["chunk_size"]

        gdim = self.ir["geometric_dimension"]
        tdim = self.ir["topological_dimension"]

        alignas = self.ir["params"]["alignas"]
        #padlen = self.ir["params"]["padlen"]

        tables = self.ir["unique_tables"]
        table_types = self.ir["unique_table_types"]
        #table_origins = self.ir["unique_table_origins"]  # FIXME

        # Generate unstructured varying partition
        body = self.generate_unstructured_varying_partition(num_points)
        body = L.commented_code_list(body, [
            "Run-time quadrature loop body setup",
            "(chunk_size={0}, analysis_num_points={1})".format(chunk_size, num_points),
        ])

        # Generate dofblock parts, some of this
        # will be placed before or after quadloop
        preparts, quadparts, postparts = \
            self.generate_dofblock_partition(num_points)
        body += quadparts

        # Wrap body in loop
        if not body:
            # Could happen for integral with everything zero and optimized away
            quadparts = []
        else:
            rule_parts = []

            # Define two-level quadrature loop: over chunks, then over points in chunk
            iq_chunk = L.Symbol("iq_chunk")
            np = self.backend.symbols.num_custom_quadrature_points()
            num_point_blocks = (np + chunk_size - 1) / chunk_size
            iq = self.backend.symbols.quadrature_loop_index()

            # Not assuming the runtime size to be a multiple of the chunk size
            num_points_in_block = L.Symbol("num_points_in_chunk")
            decl = L.VariableDecl("const int", num_points_in_block,
                                  L.Call("min", (chunk_size, np - iq_chunk * chunk_size)))
            rule_parts.append(decl)

            iq_body = L.ForRange(iq, 0, num_points_in_block, body=body)

            ### Preparations for quadrature rules
            varying_ir = self.ir["varying_irs"][num_points]

            # Copy quadrature weights for this chunk
            if varying_ir["need_weights"]:
                cwsym = self.backend.symbols.custom_quadrature_weights()
                wsym = self.backend.symbols.custom_weights_table()
                rule_parts += [
                    L.ArrayDecl("double", wsym, chunk_size, 0, alignas=alignas),
                    L.ForRange(iq, 0, num_points_in_block,
                               body=L.Assign(wsym[iq], cwsym[chunk_size * iq_chunk + iq])),
                ]

            # Copy quadrature points for this chunk
            if varying_ir["need_points"]:
                cpsym = self.backend.symbols.custom_quadrature_points()
                psym = self.backend.symbols.custom_points_table()
                rule_parts += [
                    L.ArrayDecl("double", psym, chunk_size * gdim, 0, alignas=alignas),
                    L.ForRange(iq, 0, num_points_in_block,
                               body=[L.Assign(psym[iq * gdim + i],
                                              cpsym[chunk_size * iq_chunk * gdim + iq * gdim + i])
                                     for i in range(gdim)]),
                ]

            # Add leading comment if there are any tables
            rule_parts = L.commented_code_list(rule_parts, "Quadrature weights and points")

            ### Preparations for element tables
            table_parts = []

            # Only declare non-piecewise tables, computed inside chunk loop
            non_piecewise_tables = [name for name in sorted(tables)
                                    if table_types[name] not in piecewise_ttypes]
            for name in non_piecewise_tables:
                table = tables[name]
                decl = L.ArrayDecl("double", name, (1, chunk_size, table.shape[2]), 0,
                                   alignas=alignas)  # padlen=padlen
                table_parts += [decl]

            table_parts += [L.Comment("FIXME: Fill element tables here")]
            #table_origins

            ### Gather all in chunk loop
            chunk_body = rule_parts + table_parts + [iq_body]
            quadparts = [L.ForRange(iq_chunk, 0, num_point_blocks, body=chunk_body)]

        return preparts, quadparts, postparts

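    # Sketch of the emitted two-level structure (illustrative names):
    #   for (int iq_chunk = 0; iq_chunk < num_point_blocks; ++iq_chunk)
    #   {
    #       const int num_points_in_chunk = min(chunk_size, np - iq_chunk*chunk_size);
    #       <copy weights and points for this chunk>
    #       <fill element tables for this chunk (see FIXME above)>
    #       for (int iq = 0; iq < num_points_in_chunk; ++iq) { <body> }
    #   }
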
    def generate_unstructured_piecewise_partition(self):
        L = self.backend.language

        num_points = None
        expr_ir = self.ir["piecewise_ir"]

        name = "sp"
        arraysymbol = L.Symbol(name)
        parts = self.generate_partition(arraysymbol,
                                        expr_ir["V"],
                                        expr_ir["V_active"],
                                        expr_ir["V_mts"],
                                        expr_ir["mt_tabledata"],
                                        num_points)
        parts = L.commented_code_list(parts, "Unstructured piecewise computations")
        return parts

    def generate_unstructured_varying_partition(self, num_points):
        L = self.backend.language

        expr_ir = self.ir["varying_irs"][num_points]

        name = "sv"
        arraysymbol = L.Symbol("%s%d" % (name, num_points))
        parts = self.generate_partition(arraysymbol,
                                        expr_ir["V"],
                                        expr_ir["V_varying"],
                                        expr_ir["V_mts"],
                                        expr_ir["mt_tabledata"],
                                        num_points)
        parts = L.commented_code_list(
            parts, "Unstructured varying computations for num_points=%d" % (num_points,))
        return parts

    def generate_partition(self, symbol, V, V_active, V_mts, mt_tabledata, num_points):
        L = self.backend.language

        definitions = []
        intermediates = []

        active_indices = [i for i, p in enumerate(V_active) if p]

        for i in active_indices:
            v = V[i]
            mt = V_mts[i]

            if v._ufl_is_literal_:
                vaccess = self.backend.ufl_to_language(v)
            elif mt is not None:
                # All finite element based terminals have table data, as well
                # as some but not all of the symbolic geometric terminals
                tabledata = mt_tabledata.get(mt)

                # Backend specific modified terminal translation
                vaccess = self.backend.access(mt.terminal, mt, tabledata, num_points)
                vdef = self.backend.definitions(mt.terminal, mt, tabledata, num_points, vaccess)

                # Store definitions of terminals in list
                assert isinstance(vdef, list)
                definitions.extend(vdef)
            else:
                # Get previously visited operands
                vops = [self.get_var(num_points, op) for op in v.ufl_operands]

                # Mapping UFL operator to target language
                self._ufl_names.add(v._ufl_handler_name_)
                vexpr = self.backend.ufl_to_language(v, *vops)

                # TODO: Let optimized ir provide mapping of vertex indices to
                # variable indices, marking which subexpressions to store in
                # variables and in what order:
                #j = variable_id[i]

                # Currently instead creating a new intermediate for
                # each subexpression except boolean conditions
                if isinstance(v, Condition):
                    # Inline the conditions x < y; the condition values
                    # 'x' and 'y' may still be stored in intermediates.
                    # This removes the need to handle boolean intermediate variables.
                    # With tensor-valued conditionals it may not be optimal, but we
                    # let the C++ compiler take responsibility for optimizing those cases.
                    j = None
                elif any(op._ufl_is_literal_ for op in v.ufl_operands):
                    # Skip intermediates for e.g. -2.0*x,
                    # resulting in lines like z = y + -2.0*x
                    j = None
                else:
                    j = len(intermediates)

                if j is not None:
                    # Record assignment of vexpr to intermediate variable
                    if self.ir["params"]["use_symbol_array"]:
                        vaccess = symbol[j]
                        intermediates.append(L.Assign(vaccess, vexpr))
                    else:
                        vaccess = L.Symbol("%s_%d" % (symbol.name, j))
                        intermediates.append(L.VariableDecl("const double", vaccess, vexpr))
                else:
                    # Access the inlined expression
                    vaccess = vexpr

            # Store access node for future reference
            self.set_var(num_points, v, vaccess)

        # Join terminal computation, array of intermediate expressions,
        # and intermediate computations
        parts = []
        if definitions:
            parts += definitions
        if intermediates:
            if self.ir["params"]["use_symbol_array"]:
                alignas = self.ir["params"]["alignas"]
                parts += [L.ArrayDecl("double", symbol, len(intermediates),
                                      alignas=alignas)]
            parts += intermediates
        return parts

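    # With use_symbol_array enabled this emits code along the lines of
    # (illustrative subexpressions):
    #   double sv2[4];
    #   sv2[0] = sp[1] * w5;
    #   sv2[1] = sv2[0] + sp[3];
    # whereas without it each intermediate becomes "const double sv2_0 = ...;".
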
    def generate_dofblock_partition(self, num_points):
        L = self.backend.language

        if num_points is None:
            # NB! None means the piecewise partition, not custom integral
            block_contributions = self.ir["piecewise_ir"]["block_contributions"]
        else:
            block_contributions = self.ir["varying_irs"][num_points]["block_contributions"]

        preparts = []
        quadparts = []
        postparts = []

        blocks = [(blockmap, blockdata)
                  for blockmap, contributions in sorted(block_contributions.items())
                  for blockdata in contributions
                  if blockdata.block_mode != "preintegrated"]

        for blockmap, blockdata in blocks:
            # Get symbol for already defined block B if it exists
            common_block_data = get_common_block_data(blockdata)
            B = self.shared_blocks.get(common_block_data)
            if B is None:
                # Define code for block depending on mode
                B, block_preparts, block_quadparts, block_postparts = \
                    self.generate_block_parts(num_points, blockmap, blockdata)

                # Add definitions
                preparts.extend(block_preparts)

                # Add computations
                quadparts.extend(block_quadparts)

                # Add finalization
                postparts.extend(block_postparts)

                # Store reference for reuse
                self.shared_blocks[common_block_data] = B

            # Add A[blockmap] += B[...] to finalization
            self.finalization_blocks[blockmap].append(B)

        return preparts, quadparts, postparts

    def get_entities(self, blockdata):
        L = self.backend.language

        if self.ir["integral_type"] == "interior_facet":
            # Get the facet entities
            entities = []
            for r in blockdata.restrictions:
                if r is None:
                    entities.append(0)
                else:
                    entities.append(self.backend.symbols.entity(self.ir["entitytype"], r))
            if blockdata.transposed:
                return (entities[1], entities[0])
            else:
                return tuple(entities)
        else:
            # Get the current cell or facet entity
            if blockdata.is_uniform:
                # uniform, i.e. constant across facets
                entity = L.LiteralInt(0)
            else:
                entity = self.backend.symbols.entity(self.ir["entitytype"], None)
            return (entity,)

    def get_arg_factors(self, blockdata, block_rank, num_points, iq, indices):
        L = self.backend.language

        arg_factors = []
        for i in range(block_rank):
            mad = blockdata.ma_data[i]
            td = mad.tabledata
            if td.is_piecewise:
                scope = self.ir["piecewise_ir"]["modified_arguments"]
            else:
                scope = self.ir["varying_irs"][num_points]["modified_arguments"]
            mt = scope[mad.ma_index]

            # Translate modified terminal to code
            # TODO: Move element table access out of backend?
            # Not using self.backend.access.argument() here
            # now because it assumes too much about indices.
            table = self.backend.symbols.element_table(td, self.ir["entitytype"], mt.restriction)

            assert td.ttype != "zeros"

            if td.ttype == "ones":
                arg_factor = L.LiteralFloat(1.0)
            elif td.ttype == "quadrature":  # TODO: Revisit all quadrature ttype checks
                arg_factor = table[iq]
            else:
                # Assuming B sparsity follows element table sparsity
                arg_factor = table[indices[i]]
            arg_factors.append(arg_factor)
        return arg_factors

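    # For a rank-2 block this typically returns accesses like
    # (FE3[entity][iq][i], FE4[entity][iq][j]) (illustrative table names);
    # "ones" tables collapse to the literal 1.0 and "quadrature" tables are
    # indexed by iq alone.
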
    def generate_block_parts(self, num_points, blockmap, blockdata):
        """Generate and return code parts for a given block.

        Returns parts occurring before, inside, and after
        the quadrature loop identified by num_points.

        Should be called with num_points=None for quadloop-independent blocks.
        """
        L = self.backend.language

        # The parts to return
        preparts = []
        quadparts = []
        postparts = []

        # TODO: Define names in backend symbols?
        #tempnames = self.backend.symbols.block_temp_names
        #blocknames = self.backend.symbols.block_names
        tempnames = {
            #"preintegrated": "TI",
            "premultiplied": "TM",
            "partial": "TP",
            "full": "TF",
            "safe": "TS",
            "quadrature": "TQ",
        }
        blocknames = {
            #"preintegrated": "BI",
            #"premultiplied": "BM",
            #"partial": "BP",
            "full": "BF",
            "safe": "BS",
            "quadrature": "BQ",
        }

        fwtempname = "fw"
        tempname = tempnames.get(blockdata.block_mode)

        alignas = self.ir["params"]["alignas"]
        padlen = self.ir["params"]["padlen"]

        block_rank = len(blockmap)
        blockdims = tuple(len(dofmap) for dofmap in blockmap)
        padded_blockdims = pad_innermost_dim(blockdims, padlen)

        ttypes = blockdata.ttypes
        if "zeros" in ttypes:
            error("Not expecting zero arguments to be left in dofblock generation.")

        if num_points is None:
            iq = None
        elif num_points == 1:
            iq = 0
        else:
            iq = self.backend.symbols.quadrature_loop_index()

        # Override dof index with quadrature loop index for arguments with
        # quadrature element, to index B like B[iq*num_dofs + iq]
        arg_indices = tuple(self.backend.symbols.argument_loop_index(i)
                            for i in range(block_rank))
        B_indices = []
        for i in range(block_rank):
            if ttypes[i] == "quadrature":
                B_indices.append(iq)
            else:
                B_indices.append(arg_indices[i])
        B_indices = tuple(B_indices)

        # Define unique block symbol
        blockname = blocknames.get(blockdata.block_mode)
        if blockname:
            B = self.new_temp_symbol(blockname)
            # Add initialization of this block to parts.
            # For all modes, block definition occurs before quadloop.
            preparts.append(L.ArrayDecl("double", B, blockdims, 0,
                                        alignas=alignas, padlen=padlen))

        # Get factor expression
        if blockdata.factor_is_piecewise:
            v = self.ir["piecewise_ir"]["V"][blockdata.factor_index]
        else:
            v = self.ir["varying_irs"][num_points]["V"][blockdata.factor_index]
        f = self.get_var(num_points, v)

        # Quadrature weight was removed in representation, add it back now
        if num_points is None:
            weight = L.LiteralFloat(1.0)
        elif self.ir["integral_type"] in custom_integral_types:
            weights = self.backend.symbols.custom_weights_table()
            weight = weights[iq]
        else:
            weights = self.backend.symbols.weights_table(num_points)
            weight = weights[iq]

        # Define fw = f * weight
        if blockdata.block_mode in ("safe", "full", "partial"):
            assert not blockdata.transposed, "Not handled yet"

            # Fetch code to access modified arguments
            arg_factors = self.get_arg_factors(blockdata, block_rank, num_points, iq, B_indices)

            fw_rhs = L.float_product([f, weight])
            if not isinstance(fw_rhs, L.Product):
                fw = fw_rhs
            else:
                # Define and cache scalar temp variable
                key = (num_points, blockdata.factor_index, blockdata.factor_is_piecewise)
                fw, defined = self.get_temp_symbol(fwtempname, key)
                if not defined:
                    quadparts.append(L.VariableDecl("const double", fw, fw_rhs))

            # Plan for vectorization of fw computations over iq:
            # 1) Define fw as arrays e.g. "double fw0[nq];" outside quadloop
            # 2) Access as fw0[iq] of course
            # 3) Split quadrature loops, one for fw computation and one for blocks
            # 4) Pad quadrature rule with 0 weights and last point

            # Plan for vectorization of coefficient evaluation over iq:
            # 1) Define w0_c1 etc as arrays e.g. "double w0_c1[nq] = {};" outside quadloop
            # 2) Access as w0_c1[iq] of course
            # 3) Split quadrature loops, coefficients before fw computation
            # 4) Possibly swap loops over iq and ic:
            #    for (ic) for (iq) w0_c1[iq] = w[0][ic] * FE[iq][ic];

        if blockdata.block_mode == "safe":
            # Naively accumulate integrand for this block in the innermost loop
            assert not blockdata.transposed
            B_rhs = L.float_product([fw] + arg_factors)
            body = L.AssignAdd(B[B_indices], B_rhs)  # NB! += not =
            for i in reversed(range(block_rank)):
                body = L.ForRange(B_indices[i], 0, padded_blockdims[i], body=body)
            quadparts += [body]

            # Define rhs expression for A[blockmap[arg_indices]] += A_rhs
            A_rhs = B[arg_indices]

        elif blockdata.block_mode == "full":
            assert not blockdata.transposed, "Not handled yet"

            if block_rank < 2:
                # Multiply collected factors
                B_rhs = L.float_product([fw] + arg_factors)
            else:
                # TODO: Pick arg with smallest dimension, or pick
                # based on global optimization to reuse more blocks
                i = 0  # Index selected for precomputation
                j = 1 - i

                P_index = B_indices[i]

                key = (num_points, blockdata.factor_index, blockdata.factor_is_piecewise,
                       arg_factors[i].ce_format(self.precision))
                P, defined = self.get_temp_symbol(tempname, key)
                if not defined:
                    # TODO: If FE table is varying and only used in contexts
                    # where it's multiplied by weight, we can premultiply it!
                    # Then this would become P = f * preweighted_FE_table[:].

                    # Define and compute intermediate value
                    # P[:] = (weight * f) * args[i][:] inside quadrature loop
                    P_dim = blockdims[i]
                    quadparts.append(L.ArrayDecl("double", P, P_dim, None,
                                                 alignas=alignas, padlen=padlen))
                    P_rhs = L.float_product([fw, arg_factors[i]])
                    body = L.Assign(P[P_index], P_rhs)
                    #if ttypes[i] != "quadrature":  # FIXME: What does this mean here?
                    vectorize = self.ir["params"]["vectorize"]
                    body = L.ForRange(P_index, 0, P_dim, body=body, vectorize=vectorize)
                    quadparts.append(body)

                B_rhs = P[P_index] * arg_factors[j]

            # Add result to block inside quadloop
            body = L.AssignAdd(B[B_indices], B_rhs)  # NB! += not =
            for i in reversed(range(block_rank)):
                # Vectorize only the innermost loop
                vectorize = self.ir["params"]["vectorize"] and (i == block_rank - 1)
                if ttypes[i] != "quadrature":
                    body = L.ForRange(B_indices[i], 0, padded_blockdims[i],
                                      body=body, vectorize=vectorize)
            quadparts += [body]

            # Define rhs expression for A[blockmap[arg_indices]] += A_rhs
            A_rhs = B[arg_indices]

        elif blockdata.block_mode == "partial":
            # TODO: To handle transpose here, must add back intermediate block B
            assert not blockdata.transposed, "Not handled yet"
            assert block_rank == 2

            # Get indices and dimensions right here...
            i = blockdata.piecewise_ma_index
            not_piecewise_index = 1 - i

            P_index = arg_indices[not_piecewise_index]

            key = (num_points, blockdata.factor_index, blockdata.factor_is_piecewise,
                   arg_factors[not_piecewise_index].ce_format(self.precision))
            P, defined = self.get_temp_symbol(tempname, key)
            if not defined:
                # Declare P table in preparts
                P_dim = blockdims[not_piecewise_index]
                preparts.append(L.ArrayDecl("double", P, P_dim, 0,
                                            alignas=alignas, padlen=padlen))

                # Multiply collected factors
                P_rhs = L.float_product([fw, arg_factors[not_piecewise_index]])

                # Accumulate P += weight * f * args in quadrature loop
                body = L.AssignAdd(P[P_index], P_rhs)
                body = L.ForRange(P_index, 0, pad_dim(P_dim, padlen), body=body)
                quadparts.append(body)

            # Define B = B_rhs = piecewise_argument[:] * P[:],
            # where P[:] = sum_q weight * f * other_argument[:]
            B_rhs = arg_factors[i] * P[P_index]

            # Define rhs expression for A[blockmap[arg_indices]] += A_rhs
            A_rhs = B_rhs

        elif blockdata.block_mode in ("premultiplied", "preintegrated"):
            P_entity_indices = self.get_entities(blockdata)
            if blockdata.transposed:
                P_block_indices = (arg_indices[1], arg_indices[0])
            else:
                P_block_indices = arg_indices
            P_ii = P_entity_indices + P_block_indices

            if blockdata.block_mode == "preintegrated":
                # Preintegrated should never get into quadloops
                assert num_points is None

                # Define B = B_rhs = f * PI where PI = sum_q weight * u * v
                PI = L.Symbol(blockdata.name)[P_ii]
                B_rhs = L.float_product([f, PI])
            elif blockdata.block_mode == "premultiplied":
                key = (num_points, blockdata.factor_index, blockdata.factor_is_piecewise)
                FI, defined = self.get_temp_symbol(tempname, key)
                if not defined:
                    # Declare FI = 0 before quadloop
                    preparts += [L.VariableDecl("double", FI, 0)]
                    # Accumulate FI += weight * f in quadparts
                    quadparts += [L.AssignAdd(FI, L.float_product([weight, f]))]

                # Define B_rhs = FI * PM where FI = sum_q weight*f, and PM = u * v
                PM = L.Symbol(blockdata.name)[P_ii]
                B_rhs = L.float_product([FI, PM])

            # Define rhs expression for A[blockmap[arg_indices]] += A_rhs
            A_rhs = B_rhs

        return A_rhs, preparts, quadparts, postparts

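    # Sketch of the emitted code for a rank-2 "safe" mode block (illustrative
    # names); "full" mode additionally hoists P[i] = fw0 * FE0[iq][i] out of
    # the j loop:
    #   double BS0[3][4] = {};            // preparts
    #   for (int iq = 0; iq < nq; ++iq)   // quadparts, inside the quadloop
    #   {
    #       const double fw0 = sv2[5] * weights3[iq];
    #       for (int i = 0; i < 3; ++i)
    #           for (int j = 0; j < 4; ++j)
    #               BS0[i][j] += fw0 * FE0[iq][i] * FE1[iq][j];
    #   }
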
    def generate_preintegrated_dofblock_partition(self):
        # FIXME: Generalize this to unrolling all A[] += ... loops,
        # or all loops with noncontiguous DM??
        L = self.backend.language

        block_contributions = self.ir["piecewise_ir"]["block_contributions"]

        blocks = [(blockmap, blockdata)
                  for blockmap, contributions in sorted(block_contributions.items())
                  for blockdata in contributions
                  if blockdata.block_mode == "preintegrated"]

        # Get symbol, dimensions, and loop index symbols for A
        A = self.backend.symbols.element_tensor()
        A_shape = self.ir["tensor_shape"]
        A_size = product(A_shape)
        A_strides = shape_to_strides(A_shape)

        # List for unrolled assignments
        A_values = [0.0] * A_size

        # List of statements when not unrolling
        parts = []

        for block_id, (blockmap, blockdata) in enumerate(blocks):
            # Accumulate A[blockmap[...]] += f*PI[...]

            # Get table for inlining
            tables = self.ir["unique_tables"]
            table = tables[blockdata.name]

            # TODO: Update the condition to be compatible with mixed-dimensional problems;
            # inline_tables with integral_type == "cell" should be False in that case
            # self._inline_tables = self.ir["integral_type"] == "cell"
            self._inline_tables = False

            # Get factor expression
            v = self.ir["piecewise_ir"]["V"][blockdata.factor_index]
            f = self.get_var(None, v)

            # Define rhs expression for A[blockmap[arg_indices]] += A_rhs
            # A_rhs = f * PI where PI = sum_q weight * u * v
            PI = L.Symbol(blockdata.name)

            block_rank = len(blockmap)

            # Indices used to index A
            A_index_symbols = tuple(self.backend.symbols.argument_loop_index(i)
                                    for i in range(block_rank))
            # Indices used to index PI
            P_index_symbols = tuple(self.backend.symbols.argument_loop_index(i)
                                    for i in range(block_rank, 2 * block_rank))

            # Define indices into preintegrated block
            P_entity_indices = self.get_entities(blockdata)
            if self._inline_tables:
                assert P_entity_indices == (L.LiteralInt(0),)
                assert table.shape[0] == 1

            # Unroll loop
            blockshape = [len(DM) for DM in blockmap]
            blockrange = [range(d) for d in blockshape]

            if self._unroll_tables:
                # Generate unrolled assignments for the current block
                for ii in itertools.product(*blockrange):
                    A_ii = sum(A_strides[i] * blockmap[i][ii[i]]
                               for i in range(len(ii)))
                    if blockdata.transposed:
                        P_arg_indices = (ii[1], ii[0])
                    else:
                        P_arg_indices = ii

                    if self._inline_tables:
                        # Extract float value of PI[P_ii]
                        Pval = table[0]  # always entity 0
                        for i in P_arg_indices:
                            Pval = Pval[i]
                        A_rhs = Pval * f
                    else:
                        # Index the static preintegrated table:
                        P_ii = P_entity_indices + P_arg_indices
                        A_rhs = f * PI[P_ii]

                    A_values[A_ii] = A_values[A_ii] + A_rhs
            else:
                # Don't unroll, generate assignment loops
                # TODO: Allow mixed unrolled/non-unrolled assignments?

                # Make sure that we can use PI as a static array
                assert not self._inline_tables

                # Accumulate index expression 'A_idx' used as 'A[A_idx]'
                # in the loop, e.g. A_idx = i*3 + j
                A_idx = sum(A_strides[i] * index_symbol
                            for i, index_symbol in enumerate(A_index_symbols))

                # Get the index tuple used to index the pre-integrated table
                P_arg_indices = P_index_symbols
                # Transpose the PI indices, if block is transposed
                if blockdata.transposed:
                    P_arg_indices = tuple(reversed(P_arg_indices))

                # Initialize the central assignment statement
                P_ii = P_entity_indices + P_arg_indices
                A_rhs = f * PI[P_ii]
                loop_code = L.AssignAdd(A[A_idx], A_rhs)

                # Construct nested loops, starting with the innermost dimension
                for sub_blockmap, A_arg_index, P_arg_index in zip(reversed(blockmap),
                                                                  reversed(A_index_symbols),
                                                                  reversed(P_index_symbols)):
                    # Append increment of PI index to inner code
                    loop_code = [loop_code, L.PreIncrement(P_arg_index)]

                    # Check whether the current block map is a contiguous range
                    if (sub_blockmap[-1] - sub_blockmap[0] + 1) == len(sub_blockmap):
                        # We have contiguous indices in the blockmap, a single loop is enough
                        loop_code = [L.ForRange(A_arg_index, sub_blockmap[0],
                                                sub_blockmap[-1] + 1, loop_code)]
                    else:
                        # Split blockmap into contiguous sub-ranges -> multiple loops
                        loop_block = []

                        # The groupby statement yields these contiguous sub-ranges
                        for k, group in itertools.groupby(enumerate(sub_blockmap),
                                                          key=lambda x: x[0] - x[1]):
                            contiguous_range = list(group)
                            loop_interval = (contiguous_range[0][1],
                                             contiguous_range[-1][1] + 1)

                            # Check whether we really need a loop
                            if loop_interval[1] - loop_interval[0] > 1:
                                loop_block += [L.ForRange(A_arg_index, loop_interval[0],
                                                          loop_interval[1], loop_code)]
                            else:
                                # TODO: Remove scope by explicitly replacing Symbol A_arg_index
                                # with Literal loop_interval[0] in all subexpressions
                                loop_block += [L.Scope([
                                    L.VariableDecl("int", A_arg_index, loop_interval[0]),
                                    loop_code])]

                        loop_code = loop_block

                    # Reset the currently innermost PI table index
                    loop_code += [L.Assign(P_arg_index, 0)]

                # Define the P index variables before the loop code
                loop_code = [L.VariableDecl("int", P_arg_index, 0)
                             for P_arg_index in P_arg_indices] + loop_code

                # Add a scope to keep index variables enclosed
                parts += [L.Scope(loop_code)]

        if self._unroll_tables:
            parts = self.generate_tensor_value_initialization(A_values)
        else:
            # If we have assignment loops, zero all entries of A first
            parts = [L.MemZero(A, A_size)] + parts

        return parts

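    # With unrolling the result is a flat value list handed to
    # generate_tensor_value_initialization, conceptually (illustrative):
    #   A[0] = sp[3] * PI4[0][0][0]; A[1] = sp[3] * PI4[0][0][1]; ...
    # Without unrolling, A is zeroed and nested loops accumulate
    #   A[i*n + j] += f * PI4[entity][k][l];
    # with ++k/++l tracking the position inside the preintegrated table.
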
    def generate_tensor_value_initialization(self, A_values):
        parts = []

        L = self.backend.language
        A = self.backend.symbols.element_tensor()
        A_size = len(A_values)

        init_mode = self.ir["params"]["tensor_init_mode"]
        z = L.LiteralFloat(0.0)

        if init_mode == "direct":
            # Generate A[i] = A_values[i] including zeros
            for i in range(A_size):
                parts += [L.Assign(A[i], A_values[i])]
        elif init_mode == "upfront":
            # Zero everything first
            parts += [L.MemZero(A, A_size)]

            # Generate A[i] = A_values[i] skipping zeros
            for i in range(A_size):
                if not (A_values[i] == 0.0 or A_values[i] == z):
                    parts += [L.Assign(A[i], A_values[i])]
        elif init_mode == "interleaved":
            # Generate A[i] = A_values[i] with interleaved zero filling
            i = 0
            zero_begin = 0
            zero_end = zero_begin
            while i < A_size:
                if A_values[i] == 0.0 or A_values[i] == z:
                    # Update range of A zeros
                    zero_end = i + 1
                else:
                    # Set zeros of A just prior to A[i]
                    if zero_end == zero_begin + 1:
                        parts += [L.Assign(A[zero_begin], 0.0)]
                    elif zero_end > zero_begin:
                        parts += [L.MemZeroRange(A, zero_begin, zero_end)]
                    zero_begin = i + 1
                    zero_end = zero_begin
                    # Set A[i] value
                    parts += [L.Assign(A[i], A_values[i])]
                i += 1
            if zero_end == zero_begin + 1:
                parts += [L.Assign(A[zero_begin], 0.0)]
            elif zero_end > zero_begin:
                parts += [L.MemZeroRange(A, zero_begin, zero_end)]
        else:
            error("Invalid init_mode parameter %s" % (init_mode,))

        return parts

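    # Example of "interleaved" output for A_values = [0, 0, 2.5, 0, 1.0],
    # assuming MemZeroRange renders as std::fill (illustrative):
    #   std::fill(A, A + 2, 0.0);  // run of leading zeros
    #   A[2] = 2.5;
    #   A[3] = 0.0;                // single zero emitted as plain assignment
    #   A[4] = 1.0;
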
    def generate_expr_copyout_statements(self):
        L = self.backend.language
        parts = []

        # Not expecting any quadrature loop scopes here
        assert tuple(self.scopes.keys()) == (None,)

        # TODO: Get symbol from backend
        values = L.Symbol("values")

        # TODO: Allow expression compilation to compute multiple points at once!
        # Similar to custom integrals in that points are given,
        # but different in output format: results are not accumulated
        # for each point but stored in the output array instead.

        # Assign computed results to output variables
        pir = self.ir["piecewise_ir"]
        V = pir["V"]
        V_targets = pir["V_targets"]
        for i, fi in enumerate(V_targets):
            parts.append(L.Assign(values[i], self.get_var(None, V[fi])))
        return parts

    def generate_tensor_copyout_statements(self):
        L = self.backend.language
        parts = []

        # Get symbol, dimensions, and loop index symbols for A
        A_shape = self.ir["tensor_shape"]
        A_size = product(A_shape)
        A_rank = len(A_shape)

        # TODO: use shape_to_strides(A_shape) from ufl instead
        A_strides = [1] * A_rank
        for i in reversed(range(0, A_rank - 1)):
            A_strides[i] = A_strides[i + 1] * A_shape[i + 1]

        Asym = self.backend.symbols.element_tensor()
        A = L.FlattenedArray(Asym, dims=A_shape)

        indices = [self.backend.symbols.argument_loop_index(i)
                   for i in range(A_rank)]

        dofmap_parts = []
        dofmaps = {}
        for blockmap, contributions in sorted(self.finalization_blocks.items()):
            # Define mapping from B indices to A indices
            A_indices = []
            for i in range(A_rank):
                dofmap = blockmap[i]
                begin = dofmap[0]
                end = dofmap[-1] + 1
                if len(dofmap) == end - begin:
                    # Dense insertion, offset B index to index A
                    j = indices[i] + begin
                else:
                    # Sparse insertion, map B index through dofmap
                    DM = dofmaps.get(dofmap)
                    if DM is None:
                        DM = L.Symbol("DM%d" % len(dofmaps))
                        dofmaps[dofmap] = DM
                        dofmap_parts.append(L.ArrayDecl("static const int", DM,
                                                        len(dofmap), dofmap))
                    j = DM[indices[i]]
                A_indices.append(j)
            A_indices = tuple(A_indices)

            # Sum up all blocks contributing to this blockmap
            term = L.Sum([B_rhs for B_rhs in contributions])

            # TODO: need ttypes associated with this block to deal
            # with loop dropping for quadrature elements:
            ttypes = ()
            if ttypes == ("quadrature", "quadrature"):
                debug("quadrature element block insertion not optimized")

            # Add components of all B's to A component in loop nest
            body = L.AssignAdd(A[A_indices], term)
            for i in reversed(range(A_rank)):
                body = L.ForRange(indices[i], 0, len(blockmap[i]), body=body)

            # Add this block to parts
            parts.append(body)

        # Place static dofmap tables first
        parts = dofmap_parts + parts
        return parts

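    # Emitted copyout (illustrative; A is addressed through its flattened
    # strides): dense blockmaps use a plain offset index, sparse ones go
    # through a static dofmap table:
    #   static const int DM0[3] = { 0, 2, 5 };
    #   for (int i = 0; i < 3; ++i)
    #       for (int j = 0; j < 4; ++j)
    #           A[DM0[i]*ncols + (j + begin)] += BF0[i][j];
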
    def generate_copyout_statements(self):
        """Generate statements copying results to output array."""
        if self.ir["integral_type"] == "expression":
            return self.generate_expr_copyout_statements()
        #elif self.ir["unroll_copyout_loops"]:
        #    return self.generate_unrolled_tensor_copyout_statements()
        else:
            return self.generate_tensor_copyout_statements()