You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
2147 lines
79 KiB
2147 lines
79 KiB
// Copyright(C) 1999-2023 National Technology & Engineering Solutions
// of Sandia, LLC (NTESS). Under the terms of Contract DE-NA0003525 with
// NTESS, the U.S. Government retains certain rights in this software.
// See packages/seacas/LICENSE for details
#include <SL_SystemInterface.h>
#include <SL_Version.h>
#include <SL_tokenize.h>
#include <Ioss_ChainGenerator.h>
#include <Ioss_CodeTypes.h>
#include <Ioss_CopyDatabase.h>
#include <Ioss_DatabaseIO.h>
#include <Ioss_FileInfo.h>
#include <Ioss_MemoryUtils.h>
#include <Ioss_MeshCopyOptions.h>
#include <Ioss_Region.h>
#include <Ioss_SubSystem.h>
#include <Ioss_SurfaceSplit.h>
#include <Ioss_Utils.h>
#include <cassert>
#include <fmt/format.h>
#include <fmt/ostream.h>
#include <init/Ionit_Initializer.h>
#include <exodusII.h>
#include <algorithm>
#include <array>
#include <chrono>
#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <map>
#include <numeric>
#include <random>
#include <string>
#include <vector>
#include <metis.h>
using idx_t = int;
#include <sys/types.h>
#include <mpi.h>
// ========================================================================
// TODO(gdsjaar):
// * Sideset distribution factors
// * Variables
// * All entity types
// * More efficient border-node-processor communication map.
// ========================================================================
// Timer used for the elapsed-time reports in this file; declared here, defined outside
// the portion of the file shown.
extern double seacas_timer();
// Bitmask selecting diagnostic output (as used by the code in this file):
//    1 - timestamped progress() messages
//    2 - commset / nodeset sizing details
//    4 - per-block / per-set progress lines
//    8 - per-processor progress lines (see proc_progress)
//   16 - element chain (line decomposition) details
// main() overwrites it with interFace.debug().
int debug_level = 0;
// size_t partial_count = 1'00'000;
// main() overwrites this with interFace.partial().
// NOTE(review): how the value is consumed is not visible in this part of the file.
size_t partial_count = 1'000'000'000;
namespace {
void progress(const std::string &output)
static auto start = std::chrono::steady_clock::now();
if ((debug_level & 1) != 0) {
auto now = std::chrono::steady_clock::now();
std::chrono::duration<double> diff = now - start;
fmt::print(stderr, " [{:.2f} - {}]\t{}\n", diff.count(),
fmt::group_digits(Ioss::MemoryUtils::get_memory_info()), output);
void proc_progress(int p, int proc_count)
if (((debug_level & 8) != 0) && ((proc_count <= 20) || ((p + 1) % (proc_count / 20) == 0))) {
progress("\t\tProcessor " + std::to_string(p + 1));
// Add the chain maps for file-per-rank output...
// For each element block, collects per-processor "chain" data for the processors in
// [proc_begin, proc_begin + proc_size) and writes it to that processor's block with
// put_field_data("chain", ...).
// NOTE(review): this copy of the function is missing its brace lines and at least the
// right-hand side of `loc_elem` and the statements that fill `map[p]`.
template <typename INT>
void output_chain_maps(std::vector<Ioss::Region *> &proc_region, const Ioss::chain_t<INT> &chains,
const std::vector<int> &elem_to_proc, size_t proc_begin, size_t proc_size,
INT /* dummy */)
// The block count is taken from the first processor's region.
size_t block_count = proc_region[0]->get_property("element_block_count").get_int();
size_t offset = 0;
for (size_t b = 0; b < block_count; b++) {
if (debug_level & 4) {
progress("\tBlock " + std::to_string(b + 1));
size_t proc_count = proc_region.size();
// One output vector per processor; two entries are reserved per element on that processor.
std::vector<std::vector<INT>> map(proc_count);
for (size_t p = proc_begin; p < proc_begin + proc_size; p++) {
const auto &proc_ebs = proc_region[p]->get_element_blocks();
size_t proc_element_count = proc_ebs[b]->entity_count();
map[p].reserve(proc_element_count * 2);
// NOTE(review): the loop bound is the size of the whole `elem_to_proc` vector, and `offset`
// is advanced by that same amount after each block. With more than one block,
// `elem_to_proc[offset + j]` would index past the end -- confirm whether a per-block element
// count was intended.
size_t global_element_count = elem_to_proc.size();
for (size_t j = 0; j < global_element_count; j++) {
size_t p = elem_to_proc[offset + j];
// Only elements assigned to a processor in the current range are handled.
if (p >= proc_begin && p < proc_begin + proc_size) {
auto &chain_entry = chains[j + offset];
// TODO: Map this from global to local element number...
size_t loc_elem =
offset += global_element_count;
for (size_t p = proc_begin; p < proc_begin + proc_size; p++) {
const auto &proc_ebs = proc_region[p]->get_element_blocks();
proc_ebs[b]->put_field_data("chain", map[p]);
proc_progress(p, proc_count);
// Defines the two element maps that hold chain data ("chain:root_element_id" and
// "chain:depth_from_root") on the region's Exodus file, then builds a two-component
// "chain" MAP field for each element block.
// NOTE(review): `®ion` in the parameter list looks like a mis-decoded `&region`; the body
// refers to `region`. The statement that attaches `field` to the block is not visible here.
void add_chain_maps(Ioss::Region ®ion)
// Arguments after the file id are the node-map count (0) and element-map count (2).
ex_put_map_param(region.get_database()->get_file_pointer(), 0, 2);
ex_put_name(region.get_database()->get_file_pointer(), EX_ELEM_MAP, 1, "chain:root_element_id");
ex_put_name(region.get_database()->get_file_pointer(), EX_ELEM_MAP, 2, "chain:depth_from_root");
// The chain / line data will be stored as an element map...
const auto &blocks = region.get_element_blocks();
for (const auto &block : blocks) {
auto field =
Ioss::Field("chain", region.field_int_type(), "Real[2]", Ioss::Field::MAP).set_index(1);
// Defines the element map(s) used to store the decomposition on a single output file.
// With `add_chain_info` three element maps are declared, otherwise one. A MAP field named
// `decomp_variable_name` is built per element block, plus a "chain" field when chain
// information is requested.
// NOTE(review): `®ion` looks like a mis-decoded `&region`. The name arguments of the
// ex_put_name calls, the tail of the `field` initializer and the statements that register
// the fields are missing from this copy.
void add_decomp_map(Ioss::Region ®ion, const std::string &decomp_variable_name,
bool add_chain_info)
if (add_chain_info) {
// 0 node maps, 3 element maps.
ex_put_map_param(region.get_database()->get_file_pointer(), 0, 3);
ex_put_name(region.get_database()->get_file_pointer(), EX_ELEM_MAP, 1,
ex_put_name(region.get_database()->get_file_pointer(), EX_ELEM_MAP, 2,
ex_put_name(region.get_database()->get_file_pointer(), EX_ELEM_MAP, 3,
else {
// 0 node maps, 1 element map.
ex_put_map_param(region.get_database()->get_file_pointer(), 0, 1);
ex_put_name(region.get_database()->get_file_pointer(), EX_ELEM_MAP, 1,
// The chain / line data will be stored as an element map...
const auto &blocks = region.get_element_blocks();
for (const auto &block : blocks) {
// The decomposition field is always 32-bit; the chain field uses the region's integer type.
auto field =
Ioss::Field(decomp_variable_name, Ioss::Field::INT32, IOSS_SCALAR(), Ioss::Field::MAP)
if (add_chain_info) {
auto ch_field =
Ioss::Field("chain", region.field_int_type(), "Real[2]", Ioss::Field::MAP).set_index(2);
// Writes the element-to-processor assignment, and optionally the chain data, to each
// element block of `region`.
// `offset` is the number of elements in the blocks already written; it selects this
// block's slice of `elem_to_proc` and `chains`.
// NOTE(review): `®ion` looks like a mis-decoded `&region`. The statements that fill
// `chain` from `chain_entry` are not visible in this copy.
template <typename INT>
void output_decomp_map(Ioss::Region ®ion, const std::vector<int> &elem_to_proc,
const Ioss::chain_t<INT> &chains, const std::string &decomp_variable_name,
bool add_chain_info)
const auto &blocks = region.get_element_blocks();
size_t offset = 0;
for (const auto &block : blocks) {
size_t num_elem = block->entity_count();
// Passes a pointer to this block's first entry of `elem_to_proc`; the C-style cast drops
// the const qualifier.
// NOTE(review): the meaning of the -1 size argument is defined by Ioss -- confirm.
block->put_field_data(decomp_variable_name, (void *)&elem_to_proc[offset], -1);
if (add_chain_info) {
// Two values are reserved per element.
std::vector<INT> chain;
chain.reserve(num_elem * 2);
for (size_t i = 0; i < num_elem; i++) {
auto &chain_entry = chains[i + offset];
block->put_field_data("chain", chain);
offset += num_elem;
// Adds a field named `decomp_variable_name` to every element block of `region` and, when
// `add_chain_info` is set, a two-component TRANSIENT "chain" field as well.
// NOTE(review): `®ion` looks like a mis-decoded `&region`. The end of the first
// field_add call and the start of the second are missing from this copy.
void add_decomp_field(Ioss::Region ®ion, const std::string &decomp_variable_name,
bool add_chain_info)
const auto &blocks = region.get_element_blocks();
for (const auto &block : blocks) {
block->field_add(Ioss::Field(decomp_variable_name, region.field_int_type(), IOSS_SCALAR(),
if (add_chain_info) {
Ioss::Field("chain", region.field_int_type(), "Real[2]", Ioss::Field::TRANSIENT));
// Adds a state at time 0.0 to `region` and writes the decomposition through
// output_decomp_map().
// NOTE(review): `®ion` looks like a mis-decoded `&region`. `step` is not used by any
// statement visible here; the calls that would consume it are presumably among the lines
// missing from this copy.
template <typename INT>
void output_decomp_field(Ioss::Region ®ion, const std::vector<int> &elem_to_proc,
const Ioss::chain_t<INT> &chains,
const std::string &decomp_variable_name, bool add_chain_info)
auto step = region.add_state(0.0);
output_decomp_map(region, elem_to_proc, chains, decomp_variable_name, add_chain_info);
// Forward declaration of the line/chain decomposition adjuster defined later in this file.
// `elem_to_proc` is a non-const reference because the definition reassigns its entries;
// declaring it `const` here would introduce a different overload that is never defined.
template <typename INT>
void line_decomp_modify(const Ioss::chain_t<INT> &element_chains, std::vector<int> &elem_to_proc,
                        int proc_count, INT dummy);
// Case-insensitive comparison of two NUL-terminated strings.
//
// Returns 0 when the strings are equal ignoring case; otherwise the difference
// between the upper-cased characters at the first position where they differ
// (negative when `s1` sorts first, positive when `s2` does).
int case_compare(const char *s1, const char *s2)
{
  const char *c1 = s1;
  const char *c2 = s2;
  for (;;) {
    // toupper() requires a value representable as unsigned char (or EOF).
    const int u1 = ::toupper(static_cast<unsigned char>(*c1));
    const int u2 = ::toupper(static_cast<unsigned char>(*c2));
    if (u1 != u2) {
      return u1 - u2;
    }
    // Characters match; if that match is the terminator, both strings have ended.
    if (*c1 == '\0') {
      return 0;
    }
    ++c1;
    ++c2;
  }
}
// Builds a message from the current Exodus error state (`exerrval`, ex_strerror) and the
// caller-supplied line number, asks Exodus to print its last message, then throws
// std::runtime_error carrying the formatted text.
// NOTE(review): the head of the formatting call that writes into `errmsg`, and part of
// the message text, are missing from this copy.
void exodus_error(int lineno)
std::ostringstream errmsg;
"Exodus error ({}) {} at line {} in file Slice.C. Please report to "
"if you need help.",
exerrval, ex_strerror(exerrval), lineno);
ex_err(nullptr, nullptr, EX_PRTLASTMSG);
throw std::runtime_error(errmsg.str());
// Walks `count` elements starting at global element index `offset`, each with
// `element_nodes` connectivity entries in `glob_conn`, and checks for every node whether
// the element's processor is already listed in `proc_node[node]`.
// NOTE(review): the tail of the std::find comparison and the body of `if (!exists)`
// (presumably where `proc_node` and `on_proc_count` are updated) are missing from this copy.
template <typename INT>
void populate_proc_node(size_t count, size_t offset, size_t element_nodes,
const std::vector<int> &elem_to_proc, const std::vector<INT> &glob_conn,
std::vector<std::vector<int>> &proc_node,
std::vector<size_t> &on_proc_count)
// Determine which processor(s) each node is present on.
// Also count number of nodes on each processor.
size_t el = 0;
for (size_t j = 0; j < count; j++) {
auto p = elem_to_proc[offset + j];
for (size_t k = 0; k < element_nodes; k++) {
// Connectivity entries are 1-based; convert to a 0-based index into `proc_node`.
INT node = glob_conn[el++] - 1;
bool exists = std::find(std::begin(proc_node[node]), std::end(proc_node[node]), p) !=
if (!exists) {
// Forward declaration; the definition is outside the portion of the file shown here.
// main() calls it on the output path, which it may modify in place.
void filename_substitution(std::string &filename, const SystemInterface &interFace);
template <typename INT>
void slice(Ioss::Region ®ion, const std::string &nemfile, SystemInterface &interFace,
INT dummy);
// Returns true when `map` holds exactly 1, 2, ..., map.size() in order.
// An empty map is considered sequential.
template <typename INT> bool is_sequential(const std::vector<INT> &map)
{
  for (size_t i = 0; i < map.size(); i++) {
    // Compare as unsigned on purpose: a negative entry converts to a huge value
    // and therefore never matches the expected position.
    if (static_cast<size_t>(map[i]) != i + 1) {
      return false;
    }
  }
  return true;
}
int get_common_node_count(const Ioss::Region ®ion)
// Determine number of nodes that elements must share to be
// considered connected. A 8-node hex-only mesh would have 4
// A 3D shell mesh should have 2. Basically, use the minimum
// number of nodes per side for all element blocks... Omit sphere
// elements; ignore bars(?)...
int common_nodes = 999;
const auto &ebs = region.get_element_blocks();
for (const auto &eb : ebs) {
const Ioss::ElementTopology *topology = eb->topology();
const Ioss::ElementTopology *boundary = topology->boundary_type(0);
if (boundary != nullptr) {
common_nodes = std::min(common_nodes, boundary->number_boundaries());
else {
// Different topologies on some element faces...
size_t nb = topology->number_boundaries();
for (size_t bb = 1; bb <= nb; bb++) {
boundary = topology->boundary_type(bb);
if (boundary != nullptr) {
common_nodes = std::min(common_nodes, boundary->number_boundaries());
common_nodes = std::max(1, common_nodes);
fmt::print(stderr, "Setting common_nodes to {}\n", common_nodes);
return common_nodes;
} // namespace
// ========================================================================
// Program entry point: parses the command line, works out the output file name, opens the
// input database, and dispatches to slice() with a 4-byte or 8-byte integer type.
// NOTE(review): this copy is missing its brace lines and several statements (error exits,
// directory creation, the tail of the IOFactory::create call, the return value, ...).
// No MPI_Finalize call is visible to pair with MPI_Init -- confirm against the full file.
int main(int argc, char *argv[])
MPI_Init(&argc, &argv);
// Start of the interval reported as "Total execution time".
double begin = seacas_timer();
Ioss::Init::Initializer io;
SystemInterface interFace;
bool ok = interFace.parse_options(argc, argv);
if (!ok) {
fmt::print(stderr, "\nERROR: Problem parsing command line options.\n\n");
std::string nem_file = interFace.nemesisFile_;
std::string path = interFace.output_path();
// When an output path is given, the output file becomes <path>/<basename of nem_file>.
if (!path.empty()) {
filename_substitution(path, interFace);
// See if specified path exists.
Ioss::FileInfo output_path(path);
if (!output_path.exists()) {
// Try to create the directory...
else if (!output_path.is_dir()) {
fmt::print(stderr, "ERROR: Path '{}' is not a directory.\n", path);
// See if the nem_file already has a path prepended to the
// filename and if so, extract the basename.
Ioss::FileInfo nemesis(nem_file);
// Avoid a doubled separator when `path` already ends in '/'.
std::string sep = "/";
if (path[path.length() - 1] == '/') {
sep = "";
nem_file = path + sep + nemesis.tailname();
if (interFace.outputDecompMap_ || interFace.outputDecompField_) {
// Then not creating split files, just adding map or field to a single output file
// Need to check that not overwriting input file...
if (interFace.inputFile_ == nem_file) {
nem_file += "-decomp";
fmt::print(stderr, "\nInput: '{}'\n", interFace.inputFile_);
fmt::print(stderr, "Output: '{}'\n", nem_file);
// Publish the command-line settings to the file-scope globals.
debug_level = interFace.debug();
partial_count = interFace.partial();
// INPUT ...
// NOTE: The "READ_RESTART" mode ensures that the node and element ids will be mapped.
Ioss::DatabaseIO *dbi =
Ioss::IOFactory::create(interFace.inputFormat_, interFace.inputFile_, Ioss::READ_RESTART,
if (dbi == nullptr || !dbi->ok(true)) {
if (interFace.ints64Bit_) {
// NOTE: 'region' owns 'db' pointer at this time...
Ioss::Region region(dbi, "region_1");
region.output_summary(std::cerr, true);
try {
// The last argument only selects the integer type used inside slice().
if (dbi->int_byte_size_api() == 4) {
progress("4-byte slice");
slice(region, nem_file, interFace, 1);
else {
progress("8-byte slice");
slice(region, nem_file, interFace, static_cast<int64_t>(1));
catch (std::exception &e) {
fmt::print(stderr, "\n{}\n\nSlice terminated due to exception\n", e.what());
fmt::print(stderr, "\nHigh-Water Memory Use: {} bytes\n",
fmt::print(stderr, "Total execution time = {:.5}\n", seacas_timer() - begin);
fmt::print(stderr, "\nSlice execution successful.\n");
namespace {
// Builds the `pointer` / `adjacency` arrays describing element-to-node connectivity for
// all element blocks of `region`; decompose_elements() passes them to METIS_PartMeshDual.
// A first pass over the blocks computes the sizes; a second pass reads each block's
// "connectivity_raw" field.
// NOTE(review): `®ion` looks like a mis-decoded `&region`. The statements that size
// `adjacency` and append to `pointer` / `adjacency` are missing from this copy.
template <typename INT>
void create_adjacency_list(const Ioss::Region ®ion, std::vector<idx_t> &pointer,
std::vector<idx_t> &adjacency, INT)
// Size of pointer list is element count + 1;
// Size of adjacency list is sum of nodes-per-element for each element.
size_t sum = 0;
size_t count = 0;
const auto &ebs = region.get_element_blocks();
for (const auto &eb : ebs) {
size_t element_count = eb->entity_count();
size_t element_nodes = eb->topology()->number_nodes();
sum += element_count * element_nodes;
count += element_count;
pointer.reserve(count + 1);
fmt::print(stderr, "\tAdjacency Size = {} for {} elements.\n", fmt::group_digits(sum),
// Now, iterate the blocks again, get connectivity and build adjacency structure.
std::vector<INT> connectivity;
for (const auto &eb : ebs) {
eb->get_field_data("connectivity_raw", connectivity);
size_t element_count = eb->entity_count();
size_t element_nodes = eb->topology()->number_nodes();
size_t el = 0;
for (size_t j = 0; j < element_count; j++) {
for (size_t k = 0; k < element_nodes; k++) {
// Connectivity entries are 1-based; `node` is the 0-based equivalent.
INT node = connectivity[el++] - 1;
// Sanity checks on the final sizes (active only in builds with assertions enabled).
assert(pointer.size() == count + 1);
assert(adjacency.size() == sum);
// Fills `elem_to_proc` with one processor index per element, using the method named by
// interFace.decomposition_method(): "linear", "scattered", "rb"/"kway" (METIS), "random",
// "variable" (read from an element field), "map" (read from a named element map, with
// optional scaling) or "file" (read from a text file).
// NOTE(review): `®ion` looks like a mis-decoded `&region`. This copy is missing its brace
// lines and many statements: the push_back calls that populate `elem_to_proc`, the error
// exits after the ERROR messages, the METIS option setup, and the heads of several
// fmt::print calls. The comments below describe only what is visible.
template <typename INT>
void decompose_elements(const Ioss::Region ®ion, SystemInterface &interFace,
std::vector<int> &elem_to_proc, IOSS_MAYBE_UNUSED INT dummy)
// Populate the 'elem_to_proc' vector with a mapping from element to processor.
size_t element_count = region.get_property("element_count").get_int();
// Base share per processor; the first `extra` processors get one more (see "linear").
size_t elem_per_proc = element_count / interFace.processor_count();
size_t extra = element_count % interFace.processor_count();
fmt::print(stderr, "\nDecomposing {} elements across {} processors using method '{}'.\n",
fmt::group_digits(element_count), fmt::group_digits(interFace.processor_count()),
if (interFace.lineDecomp_) {
fmt::print(stderr, "\tDecomposition will be modified to put element lines/chains/columns on "
"same processor rank\n");
if (interFace.outputDecompMap_) {
fmt::print(stderr, "\tDecomposition will be output to an element map named '{}'.\n",
if (interFace.outputDecompField_) {
fmt::print(stderr, "\tDecomposition will be output to an element field named '{}'.\n",
fmt::print(stderr, "\n");
// "linear": contiguous ranges of elements, one range per processor.
if (interFace.decomposition_method() == "linear") {
size_t elem_beg = 0;
for (size_t proc = 0; proc < interFace.processor_count(); proc++) {
size_t add = (proc < extra) ? 1 : 0;
size_t elem_end = elem_beg + elem_per_proc + add;
for (size_t elem = elem_beg; elem < elem_end; elem++) {
elem_beg = elem_end;
else if (interFace.decomposition_method() == "scattered") {
// Scattered...
// `proc` wraps back to 0 once it reaches the processor count.
size_t proc = 0;
for (size_t elem = 0; elem < element_count; elem++) {
if (proc >= interFace.processor_count()) {
proc = 0;
// "rb" / "kway": graph partitioning of the dual mesh through METIS.
// NOTE(review): both the METIS call sequence and the "Metis library not enabled" message
// appear in this branch; presumably they were separated by preprocessor conditionals that
// are missing from this copy.
else if (interFace.decomposition_method() == "rb" ||
interFace.decomposition_method() == "kway") {
std::vector<idx_t> pointer;
std::vector<idx_t> adjacency;
double start = seacas_timer();
create_adjacency_list(region, pointer, adjacency, dummy);
double end = seacas_timer();
fmt::print(stderr, "\tCreate Adjacency List = {:.5}\n", end - start);
// Call Metis to get the partition...
start = seacas_timer();
// These copies narrow size_t values to idx_t because METIS takes idx_t pointers.
idx_t elem_count = element_count;
idx_t common = get_common_node_count(region);
idx_t proc_count = interFace.processor_count();
idx_t obj_val = 0;
std::vector<idx_t> options((METIS_NOPTIONS));
if (interFace.decomposition_method() == "kway") {
else {
if (interFace.contiguous_decomposition()) {
idx_t node_count = region.get_property("node_count").get_int();
std::vector<idx_t> node_partition(node_count);
std::vector<idx_t> elem_partition(element_count);
fmt::print(stderr, "\tCalling METIS Decomposition routine.\n");
METIS_PartMeshDual(&elem_count, &node_count, &pointer[0], &adjacency[0], nullptr, nullptr,
&common, &proc_count, nullptr, &options[0], &obj_val, &elem_partition[0],
// The element partition returned by METIS becomes the element-to-processor map.
std::copy(elem_partition.begin(), elem_partition.end(), std::back_inserter(elem_to_proc));
end = seacas_timer();
fmt::print(stderr, "\tMETIS Partition = {:.5}\n", end - start);
fmt::print(stderr, "Objective value = {}\n", obj_val);
// TODO Check Error...
fmt::print(stderr, "ERROR: Metis library not enabled in this version of slice.\n"
" The 'rb' and 'kway' methods are not available.\n\n");
else if (interFace.decomposition_method() == "random") {
// Random... Use scattered method and then random_shuffle() the vector.
// Ensures that each processor has correct number of elements, but
// they are randomly distributed.
size_t proc = 0;
for (size_t elem = 0; elem < element_count; elem++) {
if (proc >= interFace.processor_count()) {
proc = 0;
// Seeded from std::random_device, so the result differs from run to run.
std::random_device rd;
std::mt19937 g(rd());
std::shuffle(elem_to_proc.begin(), elem_to_proc.end(), g);
// "variable": the processor for each element is read from an element field.
else if (interFace.decomposition_method() == "variable") {
const std::string &elem_variable = interFace.decomposition_variable();
if (elem_variable.empty()) {
fmt::print(stderr, "\nERROR: No element decomposition variable specified.\n");
// Get all element blocks and cycle through each reading the
// values for the processor...
const auto &blocks = region.get_element_blocks();
// Casts away the const on `region`; the use of `c_region` is not visible in this copy.
auto c_region = (Ioss::Region *)(®ion);
for (const auto &block : blocks) {
if (!block->field_exists(elem_variable)) {
fmt::print(stderr, "\nERROR: Element variable '{}' does not exist on block {}.\n",
elem_variable, block->name());
// Field values are read as doubles.
std::vector<double> tmp_vals;
block->get_field_data(elem_variable, tmp_vals);
auto block_count = block->entity_count();
for (int64_t i = 0; i < block_count; i++) {
// "map": the processor for each element is read from a named Exodus element map.
else if (interFace.decomposition_method() == "map") {
std::string map_name = interFace.decomposition_variable();
if (map_name.empty()) {
fmt::print(stderr, "\nERROR: No element decomposition map specified.\n");
// If the "map_name" string contains a comma, then the value
// following the comma is either an integer "scale" which is
// divided into each entry in `elem_to_proc`, or it is the
// string "auto" which will automatically scale all values by
// the *integer* "max/processorCount"
// NOTE: integer division with *no* rounding is used.
// iscale == 0 is used as the marker for "auto" until the real factor is computed below.
int iscale = 1;
auto pos = map_name.find(",");
if (pos != std::string::npos) {
// Extract the string following the comma...
auto scale = map_name.substr(pos + 1);
if (scale == "AUTO" || scale == "auto") {
iscale = 0;
else {
// NOTE(review): std::stoi throws on non-numeric text, and a user-supplied scale of "0"
// would be indistinguishable from "auto" -- confirm this is acceptable.
iscale = std::stoi(scale);
map_name = map_name.substr(0, pos);
Ioss::DatabaseIO *db = region.get_database();
int exoid = db->get_file_pointer();
bool map_read = false;
int map_count = ex_inquire_int(exoid, EX_INQ_ELEM_MAP);
if (map_count > 0) {
// Name buffers are at least 32 characters long.
int max_name_length = ex_inquire_int(exoid, EX_INQ_DB_MAX_USED_NAME_LENGTH);
max_name_length = max_name_length < 32 ? 32 : max_name_length;
char **names = Ioss::Utils::get_name_array(map_count, max_name_length);
int error = ex_get_names(exoid, EX_ELEM_MAP, names);
if (error < 0) {
// Map names are matched without regard to case; Exodus map indices are 1-based.
for (int i = 0; i < map_count; i++) {
if (case_compare(names[i], map_name.c_str()) == 0) {
error = ex_get_num_map(exoid, EX_ELEM_MAP, i + 1,;
if (error < 0) {
map_read = true;
Ioss::Utils::delete_name_array(names, map_count);
if (!map_read) {
fmt::print(stderr, "\nERROR: Element decomposition map '{}' could not be read from file.\n",
// Do the scaling (integer division...)
if (iscale == 0) {
// Auto scaling was asked for. Determine max entry in `elem_to_proc` and
// set the scale factor.
auto max_proc = *std::max_element(elem_to_proc.begin(), elem_to_proc.end());
iscale = (max_proc + 1) / interFace.processor_count();
fmt::print(" Element Processor Map automatic scaling factor = {}\n", iscale);
// A zero factor here means max_proc + 1 < processor count; it must not reach the
// division below.
if (iscale == 0) {
"ERROR: Max value in element processor map is {} which is\n"
"\tless than the processor count ({}). Scaling values is not possible.",
max_proc, interFace.processor_count());
std::transform(elem_to_proc.begin(), elem_to_proc.end(), elem_to_proc.begin(),
[iscale](int p) { return p / iscale; });
else if (interFace.decomposition_method() == "file") {
// Read the element decomposition mapping from a file. The
// syntax of the file is an optional element count followed by
// the processor for this range. If the element range is
// omitted, then the processor applies to the next element in
// the sequence. All elements must be specified or an error will
// be raised.
// Example:
// 0
// 100 1
// 0
// Will assign:
// * element 1 to processor 0;
// * followed by the next 100 elements (2 to 101) to processor 1;
// * followed by the next element (102) to processor 0.
// The resulting decomposition will have 2 elements (1, 102) on
// processor 0 and 100 elements (2..101) on processor 1.
const std::string &filename = interFace.decomposition_file();
if (filename.empty()) {
fmt::print(stderr, "\nERROR: No element decomposition file specified.\n");
std::ifstream decomp_file(filename, std::ios::in);
if (!decomp_file.good()) {
"\nERROR: Element decomposition file '{}' does not exist or could not be opened.\n",
std::string line;
size_t line_num = 0;
while (std::getline(decomp_file, line)) {
// See if 1 or 2 tokens on line...
// Tokens are separated by commas, spaces or tabs.
std::vector<std::string> tokens;
tokens = SLIB::tokenize(line, ", \t");
size_t proc = 0;
size_t count = 1;
if (tokens.empty()) {
else if (tokens.size() == 1) {
// Just a processor specification for the next element...
// NOTE(review): std::stoi returns int; a negative value stored in these size_t
// variables wraps to a very large number.
proc = std::stoi(tokens[0]);
else {
// Count and processor specified.
count = std::stoi(tokens[0]);
proc = std::stoi(tokens[1]);
// NOTE(review): the message below gives the valid range as 0..processor_count()-1, but
// this test uses `>` and so accepts proc == processor_count(). It looks like `>=` was
// intended -- confirm.
if (proc > interFace.processor_count()) {
"\nERROR: Invalid processor {} specified on line {} of decomposition file.\n"
"\tValid range is 0..{}\n",
fmt::group_digits(proc), fmt::group_digits(line_num),
fmt::group_digits(interFace.processor_count() - 1));
// Reject a line that would assign more elements than the model contains.
if (elem_to_proc.size() + count > element_count) {
"\nERROR: The processor specification on line {}"
" of the decomposition file results in too many elements being specified.\n"
"\tThe total number of elements in the model is {}\n"
"\tPrior to this line, {} elements were specified.\n"
"\tIncluding this line, {} elements will be specified.\n",
fmt::group_digits(line_num), fmt::group_digits(element_count),
fmt::group_digits(elem_to_proc.size() + count));
for (size_t i = 0; i < count; i++) {
// Every method must have produced exactly one entry per element.
assert(elem_to_proc.size() == element_count);
// Adjusts `elem_to_proc` so that all elements of a chain end up on one processor.
// Elements are grouped by chain root; for each chain a per-processor count (less the
// running `delta`) is examined and every element of the chain is reassigned to the
// processor with the largest value.
// NOTE(review): this copy is missing its brace lines and the statements that increment
// `chain_proc_count`, `delta` and `proc_element_count`; the line starting with `|,` is
// garbled.
template <typename INT>
void line_decomp_modify(const Ioss::chain_t<INT> &element_chains, std::vector<int> &elem_to_proc,
int proc_count, INT /* dummy */)
// Get a map of all chains and the elements in the chains. Map key will be root.
// Element numbers stored in the map are 1-based (i + 1).
std::map<INT, std::vector<INT>> chains;
for (size_t i = 0; i < element_chains.size(); i++) {
auto &chain_entry = element_chains[i];
chains[chain_entry.element].push_back(i + 1);
if ((debug_level & 16) != 0) {
fmt::print("[{}]: element {}, link {}, processor {}\n", i + 1, chain_entry.element,
|, elem_to_proc[i]);
// Delta: elements added/removed from each processor...
std::vector<int> delta(proc_count);
// Now, for each chain...
for (auto &chain : chains) {
if ((debug_level & 16) != 0) {
fmt::print("Chain Root: {} contains: {}\n", chain.first, fmt::join(chain.second, ", "));
std::vector<INT> chain_proc_count(proc_count);
const auto &chain_elements = chain.second;
// * get processors used by elements in the chain...
for (const auto &element : chain_elements) {
// Convert the 1-based element number back to an index.
auto proc = elem_to_proc[element - 1];
// * Now, subtract the `delta` from each count
for (int i = 0; i < proc_count; i++) {
chain_proc_count[i] -= delta[i];
// * Find the maximum value in `chain_proc_count`
// std::max_element returns the first maximum, so ties go to the lowest processor index.
auto max = std::max_element(chain_proc_count.begin(), chain_proc_count.end());
auto max_proc = std::distance(chain_proc_count.begin(), max);
// * Assign all elements in the chain to `max_proc`.
// * Update the deltas for all processors that gain/lose elements...
for (const auto &element : chain_elements) {
if (elem_to_proc[element - 1] != max_proc) {
auto old_proc = elem_to_proc[element - 1];
elem_to_proc[element - 1] = max_proc;
// Final tally, printed only when debug bit 16 is set.
std::vector<INT> proc_element_count(proc_count);
for (auto proc : elem_to_proc) {
if ((debug_level & 16) != 0) {
fmt::print("\nElements/Processor: {}\n", fmt::join(proc_element_count, ", "));
fmt::print("Delta/Processor: {}\n", fmt::join(delta, ", "));
// Visits the per-block connectivity vectors of processors
// [proc_begin, proc_begin + proc_size) and checks whether this range is the last one.
// `connectivity` is indexed [processor][block].
// NOTE(review): the statements inside the inner loop and inside the final `if` -- presumably
// where the storage is released -- are missing from this copy.
template <typename INT>
void free_connectivity_storage(std::vector<std::vector<std::vector<INT>>> &connectivity,
size_t proc_begin, size_t proc_size)
for (size_t p = proc_begin; p < proc_begin + proc_size; p++) {
size_t block_count = connectivity[p].size();
for (size_t b = 0; b < block_count; b++) {
size_t processor_count = connectivity.size();
// True only when the range just handled ends at the last processor.
if (proc_begin + proc_size == processor_count) {
// Creates, for every sideset and side block of the global `region`, a matching SideSet and
// SideBlock on each processor region. Each SideBlock is sized with `pss[p]`, the
// per-processor entry count for the global side block.
// NOTE(review): `®ion` looks like a mis-decoded `&region`. The statements that increment
// `pss` and that hand the new objects to their owners are missing from this copy, so
// ownership of the `new` allocations cannot be confirmed from here.
template <typename INT>
void get_sidesets(const Ioss::Region ®ion, std::vector<Ioss::Region *> &proc_region,
const std::vector<int> &elem_to_proc, INT /*dummy*/)
// This routine reads the sidesets in the global database;
// and defines corresponding sidesets on each processor...
size_t proc_count = proc_region.size();
auto &ss = region.get_sidesets();
size_t set_count = ss.size();
for (size_t s = 0; s < set_count; s++) {
auto *gss = ss[s];
auto &ss_name = gss->name();
// One SideSet with the global name per processor.
std::vector<Ioss::SideSet *> sset(proc_count);
for (size_t p = 0; p < proc_count; p++) {
sset[p] = new Ioss::SideSet(proc_region[p]->get_database(), ss_name);
auto &side_blocks = gss->get_side_blocks();
for (auto &gsb : side_blocks) {
std::vector<INT> ss_elems;
gsb->get_field_data("element_side_raw", ss_elems);
// Per-processor side count for this side block.
std::vector<INT> pss(proc_count);
for (size_t i = 0; i < ss_elems.size(); i += 2 /* elem,side pairs */) {
// Element ids in the field are 1-based.
int64_t elem = ss_elems[i] - 1;
int p = elem_to_proc[elem];
auto &name = gsb->name();
auto &side_type = gsb->topology()->name();
auto &elem_type = gsb->parent_element_topology()->name();
for (size_t p = 0; p < proc_count; p++) {
auto *side_block = new Ioss::SideBlock(proc_region[p]->get_database(), name, side_type,
elem_type, pss[p]);
// Splits the (element, side) pairs of every global side block by owning processor and
// writes them as "element_side" to the matching side block of each processor region in
// [proc_begin, proc_begin + proc_size).
// NOTE(review): `®ion` looks like a mis-decoded `&region`, and the brace lines are missing
// from this copy. As visible here, pairs are appended to `psb_elems[p]` for every owning
// processor, not only those in the current range.
template <typename INT>
void output_sidesets(const Ioss::Region ®ion, std::vector<Ioss::Region *> &proc_region,
const std::vector<int> &elem_to_proc, size_t proc_begin, size_t proc_size,
INT /*dummy*/)
// This routine reads the sidesets in the global database;
// and outputs the sidesets on each processor...
size_t proc_count = proc_region.size();
auto &ss = region.get_sidesets();
size_t set_count = ss.size();
for (size_t s = 0; s < set_count; s++) {
if (debug_level & 4) {
progress("\tSideset " + std::to_string(s + 1));
Ioss::SideSet *gss = ss[s];
auto &ss_name = gss->name();
// Look up each processor's sideset by the global sideset's name.
std::vector<Ioss::SideSet *> proc_ss(proc_count);
for (size_t p = proc_begin; p < proc_begin + proc_size; p++) {
proc_ss[p] = proc_region[p]->get_sideset(ss_name);
auto &side_blocks = gss->get_side_blocks();
for (auto &gsb : side_blocks) {
auto &sb_name = gsb->name();
std::vector<Ioss::SideBlock *> proc_sb(proc_count);
std::vector<std::vector<INT>> psb_elems(proc_count);
for (size_t p = proc_begin; p < proc_begin + proc_size; p++) {
proc_sb[p] = proc_ss[p]->get_side_block(sb_name);
// Two values (element, side) per entry.
size_t elem_count = proc_sb[p]->entity_count();
psb_elems[p].reserve(elem_count * 2);
std::vector<INT> ss_elems;
gsb->get_field_data("element_side_raw", ss_elems);
for (size_t i = 0; i < ss_elems.size(); i += 2 /* elem,side pairs */) {
// `elem` is 0-based for the processor lookup and written back 1-based.
int64_t elem = ss_elems[i] - 1;
int p = elem_to_proc[elem];
psb_elems[p].push_back(elem + 1);
psb_elems[p].push_back(ss_elems[i + 1]);
for (size_t p = proc_begin; p < proc_begin + proc_size; p++) {
Ioss::SideBlock *psb = proc_sb[p];
psb->put_field_data("element_side", psb_elems[p]);
proc_progress(p, proc_count);
// The warning is printed at most once per run (function-local static flag).
if (set_count > 0) {
static bool output = false;
if (!output) {
fmt::print(stderr, "WARNING: Sideset distribution factors not yet handled correctly.\n");
output = true;
// Builds the (node, processor) pairs for nodes shared by more than one processor and
// writes them as "entity_processor" to the first commset of each processor region in
// [proc_begin, proc_begin + proc_size).
// `node_to_proc_pointer[i] .. node_to_proc_pointer[i + 1]` delimits node i's processors
// inside `node_to_proc`.
// NOTE(review): the brace lines and the statement following `if (j == k)` are missing from
// this copy.
template <typename INT>
void output_communication_map(const Ioss::Region &global_region,
std::vector<Ioss::Region *> &proc_region,
const std::vector<int> &node_to_proc,
const std::vector<INT> &node_to_proc_pointer, size_t proc_begin,
size_t proc_size)
// Sized by the processor range, so entries are indexed with `p - proc_begin`.
std::vector<std::vector<INT>> border_node_proc_map(proc_size);
INT global_node_count = global_region.get_property("node_count").get_int();
// Iterate all nodes and count the number of processors it is on:
for (INT i = 0; i < global_node_count; i++) {
size_t node_proc_count = node_to_proc_pointer[i + 1] - node_to_proc_pointer[i];
if (node_proc_count > 1) {
// Get the <node,proc> pairs for all border nodes on this processor...
// Not efficient at this time...
size_t beg = node_to_proc_pointer[i];
size_t end = node_to_proc_pointer[i + 1];
for (size_t j = beg; j < end; j++) {
// The node id is stored 1-based.
size_t node = i + 1;
size_t proc = node_to_proc[j];
for (size_t k = beg; k < end; k++) {
if (j == k) {
size_t p = node_to_proc[k];
if (p >= proc_begin && p < proc_begin + proc_size) {
border_node_proc_map[p - proc_begin].push_back(node);
border_node_proc_map[p - proc_begin].push_back(proc);
progress("border_node_proc_map fully populated");
size_t proc_count = proc_region.size();
for (size_t p = proc_begin; p < proc_begin + proc_size; p++) {
auto &commset = proc_region[p]->get_commsets()[0];
commset->put_field_data("entity_processor", border_node_proc_map[p - proc_begin]);
// Empty the list once it has been written.
border_node_proc_map[p - proc_begin].clear();
proc_progress(p, proc_count);
// Computes per-processor interior/border node counts, records them (and related global
// counts) as properties on each processor region, and creates a node commset sized
// `border_nodes[p]` for each processor.
// NOTE(review): the brace lines and the statements that increment `interior_nodes` and
// `border_nodes`, and that attach the commset to the region, are missing from this copy.
template <typename INT>
void define_communication_data(const Ioss::Region &global_region,
std::vector<Ioss::Region *> &proc_region,
const std::vector<int> &node_to_proc,
const std::vector<INT> &node_to_proc_pointer)
// This routine categorizes the nodes on a processor as interior
// or border.
// TODO(gdsjaar): Categorize elements also. For now, all treated as
// interior which works for sierra-based applications
// The node_to_proc_pointer has information about the number of
// processors that a node is shared with. If the count is 1, then
// the node is interior; otherwise, it is border.
// Allocates:
// * interior_nodes INT size - #interior nodes
// * border_nodes_proc_map INT size - (proc-node) pair for each border node
INT global_node_count = global_region.get_property("node_count").get_int();
size_t proc_count = proc_region.size();
std::vector<INT> interior_nodes(proc_count);
std::vector<INT> border_nodes(proc_count);
// Iterate all nodes and count the number of processors it is on:
for (INT i = 0; i < global_node_count; i++) {
size_t node_proc_count = node_to_proc_pointer[i + 1] - node_to_proc_pointer[i];
if (node_proc_count == 1) {
// The node's single owning processor.
size_t proc = node_to_proc[node_to_proc_pointer[i]];
else {
// Get the <node,proc> pairs for all border nodes on this processor...
// Not efficient at this time...
size_t beg = node_to_proc_pointer[i];
size_t end = node_to_proc_pointer[i + 1];
for (size_t j = beg; j < end; j++) {
for (size_t k = beg; k < end; k++) {
if (j == k) {
size_t p = node_to_proc[k];
INT global_element_count = global_region.get_property("element_count").get_int();
// Categorize each element as interior...
// Categorize the remaining nodes as border...
for (size_t p = 0; p < proc_count; p++) {
Ioss::Region *region = proc_region[p];
INT element_count = region->get_property("element_count").get_int();
INT node_count = region->get_property("node_count").get_int();
// Every node on the processor that is not interior is a border node.
INT border_node_cnt = node_count - interior_nodes[p];
region->property_add(Ioss::Property("global_node_count", global_node_count));
region->property_add(Ioss::Property("global_element_count", global_element_count));
region->property_add(Ioss::Property("processor_count", static_cast<int>(proc_count)));
region->property_add(Ioss::Property("my_processor", static_cast<int>(p)));
region->property_add(Ioss::Property("internal_node_count", interior_nodes[p]));
region->property_add(Ioss::Property("border_node_count", border_node_cnt));
// All elements are reported as internal; the border element count is fixed at 0.
region->property_add(Ioss::Property("internal_element_count", element_count));
region->property_add(Ioss::Property("border_element_count", 0));
// Add commset data... The length of the commset is the number
// of <node,proc> pairs for all border nodes.
// For each node on this processor that isn't an interior node,
// create the <node,proc> pair...
auto *commset =
new Ioss::CommSet(region->get_database(), "commset_node", "node", border_nodes[p]);
commset->property_add(Ioss::Property("id", 1));
if (debug_level & 2) {
fmt::print(stderr, "Commset for processor {} has {} entries.\n", p, border_nodes[p]);
// Creates, for every nodeset of the global `region`, a nodeset with the same name on each
// processor region, sized with `pns[p]` (the per-processor node count for that nodeset).
// NOTE(review): `®ion` looks like a mis-decoded `&region`. The statement that increments
// `pns` and the one that hands `node_set` to the processor region are missing from this
// copy.
template <typename INT>
void get_nodesets(const Ioss::Region ®ion, std::vector<Ioss::Region *> &proc_region,
const std::vector<int> &node_to_proc,
const std::vector<INT> &node_to_proc_pointer)
// This routine reads the nodesets in the global database;
// and defines corresponding nodesets on each processor...
size_t proc_count = proc_region.size();
auto &ns = region.get_nodesets();
size_t set_count = ns.size();
for (size_t s = 0; s < set_count; s++) {
std::vector<INT> pns(proc_count);
Ioss::NodeSet *gns = ns[s];
std::vector<INT> ns_nodes;
gns->get_field_data("ids_raw", ns_nodes);
for (size_t i = 0; i < ns_nodes.size(); i++) {
// Node ids are 1-based; a node may be present on several processors.
int64_t node = ns_nodes[i] - 1;
size_t p_beg = node_to_proc_pointer[node];
size_t p_end = node_to_proc_pointer[node + 1];
for (size_t j = p_beg; j < p_end; j++) {
size_t p = node_to_proc[j];
auto &name = ns[s]->name();
// With debug bit 2 set, one line per nodeset lists "processor:count" pairs.
if (debug_level & 2) {
fmt::print(stderr, "\tNodeset {}--", name);
for (size_t p = 0; p < proc_count; p++) {
auto *node_set = new Ioss::NodeSet(proc_region[p]->get_database(), name, pns[p]);
if (debug_level & 2) {
fmt::print(stderr, "{}:{}, ", p, pns[p]);
if (debug_level & 2) {
fmt::print(stderr, "\n");
// Write the nodeset node lists and distribution factors for the processors
// in the range [proc_begin, proc_begin + proc_size).
//
// region               - global region the nodeset data is read from.
// proc_region          - per-processor regions; nodeset 's' of each region in the
//                        range receives the "ids" and "distribution_factors" fields.
// node_to_proc         - flattened list of processors owning each node.
// node_to_proc_pointer - offsets into 'node_to_proc' for each 0-based node.
// proc_begin/proc_size - first processor and number of processors in this chunk.
template <typename INT>
void output_nodesets(const Ioss::Region ®ion, std::vector<Ioss::Region *> &proc_region,
const std::vector<int> &node_to_proc,
const std::vector<INT> &node_to_proc_pointer, size_t proc_begin,
size_t proc_size)
// This routine reads the nodesets in the global database;
// and defines corresponding nodesets on each processor...
size_t proc_count = proc_region.size();
auto &ns = region.get_nodesets();
size_t set_count = ns.size();
for (size_t s = 0; s < set_count; s++) {
if (debug_level & 4) {
progress("\tNodeSet " + std::to_string(s + 1));
Ioss::NodeSet *gns = ns[s];
// Global node list and distribution factors for this nodeset.
std::vector<INT> ns_nodes;
gns->get_field_data("ids_raw", ns_nodes);
std::vector<double> ns_df;
gns->get_field_data("distribution_factors", ns_df);
// Per-processor copies, indexed by processor number.
std::vector<std::vector<INT>> pns_nodes(proc_count);
std::vector<std::vector<double>> pns_df(proc_count);
for (size_t p = proc_begin; p < proc_begin + proc_size; p++) {
// NOTE(review): 'node_count' is presumably used to pre-size the vectors
// above; the statements using it are not visible here.
size_t node_count = proc_region[p]->get_nodesets()[s]->entity_count();
for (size_t i = 0; i < ns_nodes.size(); i++) {
// Shift the stored id down by one to index 'node_to_proc_pointer'.
int64_t node = ns_nodes[i] - 1;
size_t p_beg = node_to_proc_pointer[node];
size_t p_end = node_to_proc_pointer[node + 1];
for (size_t j = p_beg; j < p_end; j++) {
size_t p = node_to_proc[j];
// Store the id back in its original (unshifted) form.
// NOTE(review): no visible statement fills pns_df[p] or restricts 'p' to
// the active processor range -- confirm against the upstream source.
pns_nodes[p].push_back(node + 1);
for (size_t p = proc_begin; p < proc_begin + proc_size; p++) {
Ioss::NodeSet *proc_ns = proc_region[p]->get_nodesets()[s];
proc_ns->put_field_data("ids", pns_nodes[p]);
proc_ns->put_field_data("distribution_factors", pns_df[p]);
proc_progress(p, proc_count);
// Write the "ids" field of the first node block on each processor in the
// range [proc_begin, proc_begin + proc_size) using the implicit global
// numbering 1..node_count.
//
// region               - global region; supplies the total node count.
// proc_region          - per-processor regions that receive the map.
// node_to_proc         - flattened list of processors owning each node.
// node_to_proc_pointer - offsets into 'node_to_proc' for each 0-based node.
// proc_begin/proc_size - first processor and number of processors in this chunk.
template <typename INT>
void output_node_map(const Ioss::Region ®ion, std::vector<Ioss::Region *> &proc_region,
const std::vector<int> &node_to_proc,
const std::vector<INT> &node_to_proc_pointer, size_t proc_begin,
size_t proc_size)
// This is the processor-local to global-implicit node map...
// This maps the 1..#node in the global mesh to each processor...
size_t node_count = region.get_property("node_count").get_int();
size_t proc_count = proc_region.size();
// proc_map[p] accumulates the global node numbers that live on processor p.
std::vector<std::vector<INT>> proc_map(proc_count);
for (size_t p = proc_begin; p < proc_begin + proc_size; p++) {
// NOTE(review): 'pnode_count' is presumably used to pre-size proc_map[p];
// the statement using it is not visible here.
size_t pnode_count = proc_region[p]->get_property("node_count").get_int();
for (size_t i = 0; i < node_count; i++) {
size_t p_beg = node_to_proc_pointer[i];
size_t p_end = node_to_proc_pointer[i + 1];
for (size_t j = p_beg; j < p_end; j++) {
size_t p = node_to_proc[j];
// Only processors belonging to the current chunk are populated.
if (p >= proc_begin && p < proc_begin + proc_size) {
// 'i' is 0-based; the value written to the map is i + 1.
proc_map[p].push_back(i + 1);
for (size_t p = proc_begin; p < proc_begin + proc_size; p++) {
Ioss::NodeBlock *nb = proc_region[p]->get_node_blocks()[0];
nb->put_field_data("ids", proc_map[p]);
proc_progress(p, proc_count);
// Read the "ids" field of the global region's first node block and, when
// that map is not sequential, write a per-processor "ids" field for the
// processors in the range [proc_begin, proc_begin + proc_size).
//
// region               - global region; supplies the node map and node count.
// proc_region          - per-processor regions that receive the map.
// node_to_proc         - flattened list of processors owning each node.
// node_to_proc_pointer - offsets into 'node_to_proc' for each 0-based node.
// proc_begin/proc_size - first processor and number of processors in this chunk.
template <typename INT>
void output_global_node_map(const Ioss::Region ®ion, std::vector<Ioss::Region *> &proc_region,
const std::vector<int> &node_to_proc,
const std::vector<INT> &node_to_proc_pointer, size_t proc_begin,
size_t proc_size)
// This is the processor-local to global-implicit node map...
// This maps the node_number map (if it exists) in the global mesh
// to each processor...
std::vector<INT> ids;
Ioss::NodeBlock *gnb = region.get_node_blocks()[0];
gnb->get_field_data("ids", ids);
// Check whether the map is sequential (X maps to X);
bool sequential = is_sequential(ids);
if (!sequential) {
fmt::print(stderr, "Node map is not sequential...\n");
size_t node_count = region.get_property("node_count").get_int();
size_t proc_count = proc_region.size();
// proc_map[p] is the map written to processor p's node block below.
std::vector<std::vector<INT>> proc_map(proc_count);
for (size_t p = proc_begin; p < proc_begin + proc_size; p++) {
// NOTE(review): 'pnode_count' is presumably used to pre-size proc_map[p];
// the statement using it is not visible here.
size_t pnode_count = proc_region[p]->get_property("node_count").get_int();
for (size_t i = 0; i < node_count; i++) {
size_t p_beg = node_to_proc_pointer[i];
size_t p_end = node_to_proc_pointer[i + 1];
for (size_t j = p_beg; j < p_end; j++) {
size_t p = node_to_proc[j];
// NOTE(review): the body of this test (presumably appending an entry of
// 'ids' to proc_map[p]) is not visible -- confirm against upstream.
if (p >= proc_begin && p < proc_begin + proc_size) {
for (size_t p = proc_begin; p < proc_begin + proc_size; p++) {
Ioss::NodeBlock *nb = proc_region[p]->get_node_blocks()[0];
nb->put_field_data("ids", proc_map[p]);
proc_progress(p, proc_count);
// Write the "ids" field of every element block on each processor in the
// range [proc_begin, proc_begin + proc_size).
//
// region               - global region; supplies the element blocks and their sizes.
// proc_region          - per-processor regions whose element blocks receive the map.
// elem_to_proc         - processor assigned to each element; indexed by the
//                        element's running position across all blocks ('offset + j').
// proc_begin/proc_size - first processor and number of processors in this chunk.
// (unnamed INT)        - unused value; only selects the integer type INT.
template <typename INT>
void output_element_map(const Ioss::Region ®ion, std::vector<Ioss::Region *> &proc_region,
const std::vector<int> &elem_to_proc, size_t proc_begin, size_t proc_size,
INT /* dummy */)
// map[p][b] = map for block b on processor p
size_t proc_count = proc_region.size();
const auto &ebs = region.get_element_blocks();
size_t block_count = ebs.size();
// Running count of elements in the blocks already processed.
size_t offset = 0;
for (size_t b = 0; b < block_count; b++) {
if (debug_level & 4) {
progress("\tBlock " + std::to_string(b + 1));
// NOTE(review): the matching #else/#endif for the '#if 0' fragments in this
// function are not visible in this listing.
#if 0
std::vector<INT> ids;
ebs[b]->get_field_data("ids", ids);
std::vector<std::vector<INT>> map(proc_count);
for (size_t p = proc_begin; p < proc_begin + proc_size; p++) {
const auto &proc_ebs = proc_region[p]->get_element_blocks();
size_t proc_element_count = proc_ebs[b]->entity_count();
size_t element_count = ebs[b]->entity_count();
for (size_t j = 0; j < element_count; j++) {
size_t p = elem_to_proc[offset + j];
if (p >= proc_begin && p < proc_begin + proc_size) {
#if 0
// The value stored is the element's running position, shifted to start at 1.
map[p].push_back(offset + j + 1);
offset += element_count;
for (size_t p = proc_begin; p < proc_begin + proc_size; p++) {
const auto &proc_ebs = proc_region[p]->get_element_blocks();
proc_ebs[b]->put_field_data("ids", map[p]);
proc_progress(p, proc_count);
// Distribute the nodal coordinates (all three components at once) to the
// processors in the range [proc_begin, proc_begin + proc_size) and write them
// to the first node block of each processor region.
//
// When the global node count exceeds 'partial_count' the coordinates are read
// in pieces with ex_get_partial_coord(); otherwise the three
// "mesh_model_coordinates_{x,y,z}" fields are read in full.
//
// region               - global region the coordinates are read from.
// proc_region          - per-processor regions that receive the coordinates.
// node_to_proc         - flattened list of processors owning each node.
// node_to_proc_pointer - offsets into 'node_to_proc' for each 0-based node.
// proc_begin/proc_size - first processor and number of processors in this chunk.
template <typename INT>
void output_coordinates(const Ioss::Region ®ion, std::vector<Ioss::Region *> &proc_region,
const std::vector<int> &node_to_proc,
const std::vector<INT> &node_to_proc_pointer, size_t proc_begin,
size_t proc_size)
std::vector<double> glob_coord_x;
std::vector<double> glob_coord_y;
std::vector<double> glob_coord_z;
Ioss::NodeBlock *gnb = region.get_node_blocks()[0];
// Distribute nodal coordinates to each processor...
// coordinates[p][i] = x,y,z coordinates on processor p
size_t processor_count = proc_region.size();
std::vector<std::vector<double>> coordinates_x(processor_count);
std::vector<std::vector<double>> coordinates_y(processor_count);
std::vector<std::vector<double>> coordinates_z(processor_count);
for (size_t p = proc_begin; p < proc_begin + proc_size; p++) {
// NOTE(review): 'pnode_count' is presumably used to reserve the three
// per-processor vectors; those statements are not visible here.
size_t pnode_count = proc_region[p]->get_property("node_count").get_int();
progress("\tReserve processor coordinate vectors");
Ioss::DatabaseIO *db = region.get_database();
size_t node_count = region.get_property("node_count").get_int();
if (node_count > partial_count) {
int exoid = db->get_file_pointer();
// 'beg' is the 1-based index of the first node of each piece.
for (size_t beg = 1; beg <= node_count; beg += partial_count) {
size_t count = partial_count;
// Trim the final piece so it does not run past the last node.
if (beg + count - 1 > node_count) {
count = node_count - beg + 1;
// NOTE(review): the buffer arguments of this call are missing from this listing.
ex_get_partial_coord(exoid, beg, count,,,
progress("\tpartial_coord: " + std::to_string(beg) + " " + std::to_string(count));
for (size_t i = 0; i < count; i++) {
// 0-based global node index of the i-th node in this piece.
size_t ii = beg + i - 1;
size_t p_beg = node_to_proc_pointer[ii];
size_t p_end = node_to_proc_pointer[ii + 1];
for (size_t j = p_beg; j < p_end; j++) {
size_t p = node_to_proc[j];
if (p >= proc_begin && p < proc_begin + proc_size) {
else {
gnb->get_field_data("mesh_model_coordinates_x", glob_coord_x);
gnb->get_field_data("mesh_model_coordinates_y", glob_coord_y);
gnb->get_field_data("mesh_model_coordinates_z", glob_coord_z);
progress("\tRead global mesh_model_coordinates");
for (size_t i = 0; i < node_count; i++) {
size_t p_beg = node_to_proc_pointer[i];
size_t p_end = node_to_proc_pointer[i + 1];
for (size_t j = p_beg; j < p_end; j++) {
size_t p = node_to_proc[j];
if (p >= proc_begin && p < proc_begin + proc_size) {
progress("\tPopulate processor coordinate vectors");
for (size_t p = proc_begin; p < proc_begin + proc_size; p++) {
Ioss::NodeBlock *nb = proc_region[p]->get_node_blocks()[0];
nb->put_field_data("mesh_model_coordinates_x", coordinates_x[p]);
nb->put_field_data("mesh_model_coordinates_y", coordinates_y[p]);
nb->put_field_data("mesh_model_coordinates_z", coordinates_z[p]);
proc_progress(p, processor_count);
progress("\tOutput processor coordinate vectors");
// Output a component at a time...
//
// Same job as output_coordinates(), but a single 'glob_coord' buffer and a
// single set of per-processor vectors are used, with an outer loop over the
// three coordinate components (field names held in 'field_name').
//
// region               - global region the coordinates are read from.
// proc_region          - per-processor regions that receive the coordinates.
// node_to_proc         - flattened list of processors owning each node.
// node_to_proc_pointer - offsets into 'node_to_proc' for each 0-based node.
// proc_begin/proc_size - first processor and number of processors in this chunk.
template <typename INT>
void output_coordinates_c(const Ioss::Region ®ion, std::vector<Ioss::Region *> &proc_region,
const std::vector<int> &node_to_proc,
const std::vector<INT> &node_to_proc_pointer, size_t proc_begin,
size_t proc_size)
std::vector<double> glob_coord;
Ioss::NodeBlock *gnb = region.get_node_blocks()[0];
std::array<std::string, 3> field_name{"mesh_model_coordinates_x", "mesh_model_coordinates_y",
// Distribute nodal coordinates to each processor...
// coordinates[p][i] = x,y,z coordinates on processor p
size_t processor_count = proc_region.size();
std::vector<std::vector<double>> coordinates(processor_count);
Ioss::DatabaseIO *db = region.get_database();
size_t node_count = region.get_property("node_count").get_int();
for (size_t p = proc_begin; p < proc_begin + proc_size; p++) {
// NOTE(review): 'pnode_count' is presumably used to reserve coordinates[p];
// the statement using it is not visible here.
size_t pnode_count = proc_region[p]->get_property("node_count").get_int();
progress("\tReserve processor coordinate vectors");
// One pass per coordinate component; field_name[comp] selects the field.
for (size_t comp = 0; comp < 3; comp++) {
for (size_t p = proc_begin; p < proc_begin + proc_size; p++) {
if (node_count > partial_count) {
int exoid = db->get_file_pointer();
// 'beg' is the 1-based index of the first node of each piece.
for (size_t beg = 1; beg <= node_count; beg += partial_count) {
size_t count = partial_count;
// Trim the final piece so it does not run past the last node.
if (beg + count - 1 > node_count) {
count = node_count - beg + 1;
// Each case requests one component and passes nullptr for the other two.
// NOTE(review): the non-null buffer argument and any 'break' statements are
// missing from this listing.
switch (comp) {
case 0:
ex_get_partial_coord(exoid, beg, count,, nullptr, nullptr);
case 1:
ex_get_partial_coord(exoid, beg, count, nullptr,, nullptr);
case 2:
ex_get_partial_coord(exoid, beg, count, nullptr, nullptr,;
progress("\tpartial_coord: " + std::to_string(beg) + " " + std::to_string(count));
for (size_t i = 0; i < count; i++) {
// 0-based global node index of the i-th node in this piece.
size_t ii = beg + i - 1;
size_t p_beg = node_to_proc_pointer[ii];
size_t p_end = node_to_proc_pointer[ii + 1];
for (size_t j = p_beg; j < p_end; j++) {
size_t p = node_to_proc[j];
if (p >= proc_begin && p < proc_begin + proc_size) {
else {
gnb->get_field_data(field_name[comp], glob_coord);
progress("\tRead global mesh_model_coordinates");
for (size_t i = 0; i < node_count; i++) {
size_t p_beg = node_to_proc_pointer[i];
size_t p_end = node_to_proc_pointer[i + 1];
for (size_t j = p_beg; j < p_end; j++) {
size_t p = node_to_proc[j];
if (p >= proc_begin && p < proc_begin + proc_size) {
progress("\tPopulate processor coordinate vectors");
for (size_t p = proc_begin; p < proc_begin + proc_size; p++) {
Ioss::NodeBlock *nb = proc_region[p]->get_node_blocks()[0];
nb->put_field_data(field_name[comp], coordinates[p]);
proc_progress(p, processor_count);
progress("\tOutput processor coordinate vectors");
// Split the connectivity of every element block among the processors in the
// range [proc_begin, proc_begin + proc_size) and write it to the matching
// element block of each processor region.
//
// Blocks with at least 'partial_count' elements are read in pieces with
// ex_get_partial_conn(); smaller blocks are read in full through the
// "connectivity_raw" field.
//
// region               - global region the connectivity is read from.
// proc_region          - per-processor regions that receive the connectivity.
// elem_to_proc         - processor assigned to each element; indexed by the
//                        element's running position across all blocks.
// proc_begin/proc_size - first processor and number of processors in this chunk.
// (unnamed INT)        - unused value; only selects the integer type INT.
template <typename INT>
void output_connectivity(const Ioss::Region ®ion, std::vector<Ioss::Region *> &proc_region,
const std::vector<int> &elem_to_proc, size_t proc_begin,
size_t proc_size, INT /*dummy*/)
// Read connectivity and partition to each processor/block.
// connectivity[p][b] = connectivity for block b on processor p
const auto &ebs = region.get_element_blocks();
size_t block_count = ebs.size();
size_t processor_count = proc_region.size();
Ioss::DatabaseIO *db = region.get_database();
std::vector<INT> glob_conn;
// Running count of elements already handled; used to index 'elem_to_proc'.
size_t offset = 0;
for (size_t b = 0; b < block_count; b++) {
std::vector<std::vector<INT>> connectivity(processor_count);
size_t element_count = ebs[b]->entity_count();
size_t element_nodes = ebs[b]->topology()->number_nodes();
size_t block_id = ebs[b]->get_property("id").get_int();
for (size_t p = proc_begin; p < proc_begin + proc_size; p++) {
const auto &pebs = proc_region[p]->get_element_blocks();
size_t pelement_count = pebs[b]->entity_count();
size_t pelement_nodes = pebs[b]->topology()->number_nodes();
connectivity[p].reserve(pelement_count * pelement_nodes); // Use reserve, not resize
// Do a 'partial_count' elements at a time...
if (element_count >= partial_count) {
int exoid = db->get_file_pointer();
glob_conn.resize(partial_count * element_nodes);
// 'beg' is the 1-based index of the first element of each piece.
for (size_t beg = 1; beg <= element_count; beg += partial_count) {
size_t count = partial_count;
// Trim the final piece so it does not run past the last element.
if (beg + count - 1 > element_count) {
count = element_count - beg + 1;
// NOTE(review): the buffer argument and the tail of this call are missing
// from this listing.
ex_get_partial_conn(exoid, EX_ELEM_BLOCK, block_id, beg, count,, nullptr,
progress(fmt::format("\tpartial_conn-- start: {}\tcount: {}", fmt::group_digits(beg),
// 'el' walks glob_conn in steps of 'element_nodes' (one element at a time).
size_t el = 0;
for (size_t j = 0; j < count; j++) {
size_t p = elem_to_proc[offset + j];
if (p >= proc_begin && p < proc_begin + proc_size) {
for (size_t k = 0; k < element_nodes; k++) {
else {
el += element_nodes;
offset += count;
else {
ebs[b]->get_field_data("connectivity_raw", glob_conn);
size_t el = 0;
for (size_t j = 0; j < element_count; j++) {
size_t p = elem_to_proc[offset + j];
if (p >= proc_begin && p < proc_begin + proc_size) {
for (size_t k = 0; k < element_nodes; k++) {
else {
el += element_nodes;
offset += element_count;
for (size_t p = proc_begin; p < proc_begin + proc_size; p++) {
const auto &proc_ebs = proc_region[p]->get_element_blocks();
Ioss::ElementBlock *eb = proc_ebs[b];
eb->put_field_data("connectivity", connectivity[p]);
// Fill the per-block / per-processor element count table.
//
// region              - global region; supplies the element blocks and their sizes.
// elem_to_proc        - processor assigned to each element, indexed by the
//                       element's running position across all blocks.
// proc_elem_block_cnt - table indexed [block][processor]; must already hold
//                       block_count + 1 rows, each sized to the processor count.
//                       The extra last row receives, for each processor, the sum
//                       over all blocks.
template <typename INT>
void get_proc_elem_block_count(const Ioss::Region ®ion, std::vector<int> &elem_to_proc,
std::vector<std::vector<INT>> &proc_elem_block_cnt)
const auto &ebs = region.get_element_blocks();
size_t block_count = ebs.size();
// [begin, end) is the range of running element positions covered by block i.
size_t begin = 0;
for (size_t i = 0; i < block_count; i++) {
size_t end = begin + ebs[i]->entity_count();
for (size_t j = begin; j < end; j++) {
// NOTE(review): presumably proc_elem_block_cnt[i][processor] is incremented
// here; that statement is not visible in this listing.
size_t processor = elem_to_proc[j];
begin = end;
// The processor count is taken from the width of the first row.
size_t processor_count = proc_elem_block_cnt[0].size();
for (size_t i = 0; i < processor_count; i++) {
size_t sum = 0;
for (size_t j = 0; j < block_count; j++) {
sum += proc_elem_block_cnt[j][i];
// Row 'block_count' holds the per-processor totals.
proc_elem_block_cnt[block_count][i] = sum;
if (debug_level & 2) {
fmt::print(stderr, "\tProcessor {} has {} elements.\n", fmt::group_digits(i),
// Build the node -> processor mapping from the element connectivity and the
// element -> processor assignment, and create the node block of each
// processor region.
//
// region               - global region; supplies node count and element blocks.
// proc_region          - per-processor regions; a node block named "node_block1"
//                        sized on_proc_count[p] is created for each.
// elem_to_proc         - processor assigned to each element.
// node_to_proc         - (output) flattened list of processors for every node.
// node_to_proc_pointer - (output) offsets into 'node_to_proc'; see the layout
//                        description in the comment below.
template <typename INT>
void get_node_to_proc(Ioss::Region ®ion, std::vector<Ioss::Region *> &proc_region,
const std::vector<int> &elem_to_proc, std::vector<int> &node_to_proc,
std::vector<INT> &node_to_proc_pointer)
// Process each element block connectivity to get the node_to_proc mapping.
// The 'node_to_proc_pointer' vector maps the processor span in the
// node_to_proc vector. The processors that node 'node' (0-based)
// is on are:
// * begin = node_to_proc_pointer[node]
// * end = node_to_proc_pointer[node+1]
// * proc_list = node_to_proc[begin] .. node_to_proc[end-1]
size_t proc_count = proc_region.size();
size_t node_count = region.get_property("node_count").get_int();
if (node_count == 0) {
// Should never happen, but makes static analyzers happy...
// proc_node[n] = list of processors that 0-based node n is on.
std::vector<std::vector<int>> proc_node(node_count);
// Assume that the majority of nodes will be on 2 or less
// processors (hopefully, most are on 1).
// Preallocate the proc_node[node] vector to 2 to minimize
// resizes... Use 'reserve' instead of 'resize'
for (size_t i = 0; i < node_count; i++) {
progress("\tProc_node reserved");
// Only consumed by the assert further down, hence IOSS_MAYBE_UNUSED.
IOSS_MAYBE_UNUSED size_t sum_on_proc_count = 0;
Ioss::DatabaseIO *db = region.get_database();
const auto &ebs = region.get_element_blocks();
size_t block_count = ebs.size();
// Running count of elements already handled; passed on to populate_proc_node().
size_t offset = 0;
// on_proc_count[p] is used below as the node count of processor p's node block.
std::vector<size_t> on_proc_count(proc_count);
for (size_t b = 0; b < block_count; b++) {
std::vector<INT> glob_conn;
size_t element_count = ebs[b]->entity_count();
size_t element_nodes = ebs[b]->topology()->number_nodes();
size_t block_id = ebs[b]->get_property("id").get_int();
// Do a 'partial_count' elements at a time...
if (element_count >= partial_count) {
int exoid = db->get_file_pointer();
glob_conn.resize(partial_count * element_nodes);
// 'beg' is the 1-based index of the first element of each piece.
for (size_t beg = 1; beg <= element_count; beg += partial_count) {
size_t count = partial_count;
// Trim the final piece so it does not run past the last element.
if (beg + count - 1 > element_count) {
count = element_count - beg + 1;
// NOTE(review): the buffer argument and the tail of this call are missing
// from this listing.
ex_get_partial_conn(exoid, EX_ELEM_BLOCK, block_id, beg, count,, nullptr,
progress(fmt::format("\tpartial_conn-- start: {}\tcount: {}", fmt::group_digits(beg),
populate_proc_node(count, offset, element_nodes, elem_to_proc, glob_conn, proc_node,
offset += count;
else {
ebs[b]->get_field_data("connectivity_raw", glob_conn);
populate_proc_node(element_count, offset, element_nodes, elem_to_proc, glob_conn, proc_node,
offset += element_count;
// Create each processor's node block (3 is passed as the last constructor argument).
for (size_t p = 0; p < proc_count; p++) {
Ioss::NodeBlock *nb =
new Ioss::NodeBlock(proc_region[p]->get_database(), "node_block1", on_proc_count[p], 3);
if (debug_level & 2) {
fmt::print(stderr, "\tProcessor {} has {} nodes.\n", fmt::group_digits(p),
sum_on_proc_count += on_proc_count[p];
progress("\tProc_node populated");
// Have data for each node showing which processors it is on...
// proc_node[node].size() is number of processors for this node...
node_to_proc_pointer.reserve(node_count + 1);
// Histogram of "processors per node": slots 1..16 are printed individually
// below; slot 0 is reported as the "or more" overflow bucket.
std::vector<size_t> proc_histo(17);
size_t node_to_proc_pointer_size = 0;
for (size_t i = 0; i < node_count; i++) {
size_t num_procs = proc_node[i].size();
if (num_procs == 0) {
fmt::print(stderr, "WARNING: Node {} is not connected to any elements.\n",
fmt::group_digits(i + 1));
else if (num_procs < proc_histo.size()) {
else {
node_to_proc_pointer_size += num_procs;
// Output histogram..
fmt::print(stderr, "Processor count per node histogram:\n");
for (size_t i = 1; i < proc_histo.size(); i++) {
if (proc_histo[i] > 0) {
// The percentage is rounded to nearest by adding node_count / 2 before dividing.
fmt::print(stderr, "\tNodes on {:2} processors = {:12}\t({:2})%\n", fmt::group_digits(i),
(proc_histo[i] * 100 + node_count / 2) / node_count);
if (proc_histo[0] > 0) {
fmt::print(stderr, "\tNodes on {} or more processors = {}\t({:2})%\n",
fmt::group_digits(proc_histo.size()), fmt::group_digits(proc_histo[0]),
(proc_histo[0] * 100 + node_count / 2) / node_count);
fmt::print(stderr, "\n");
progress("\tNode_to_proc reserved");
// Consistency check: per-processor node totals must equal the total number
// of (node, processor) pairs.
assert(sum_on_proc_count == node_to_proc_pointer_size);
// Flatten proc_node into node_to_proc.
for (auto &pn : proc_node) {
size_t num_procs = pn.size();
for (size_t p = 0; p < num_procs; p++) {
assert(node_to_proc.size() == node_to_proc_pointer_size);
progress("\tNode_to_proc populated");
// Top-level driver: decompose the elements of 'region' and write the result.
//
// Two output modes, selected by the command-line options in 'interFace':
//  * If a decomposition map or field was requested (outputDecompMap_ /
//    outputDecompField_), the mesh is copied to the single file 'nemfile'
//    and the decomposition is added to it as a map and/or field.
//  * Otherwise one file per processor is created (names produced by
//    Ioss::Utils::decode_filename(nemfile, i, processor_count)) and the
//    mesh is distributed among them, processing the processors in chunks
//    sized from interFace.max_files().
//
// region    - global input region.
// nemfile   - output filename (single file) or base name (per-processor files).
// interFace - parsed command-line options.
// dummy     - value used only to select the integer type INT; forwarded to helpers.
template <typename INT>
void slice(Ioss::Region ®ion, const std::string &nemfile, SystemInterface &interFace,
INT dummy)
bool create_split_files = !interFace.outputDecompMap_ && !interFace.outputDecompField_;
std::vector<Ioss::Region *> proc_region;
if (create_split_files) {
// sizeof() is in bytes: an 8-byte INT means 64-bit integers.
bool ints64 = (sizeof(INT) == 8);
// Database properties derived from the command-line options.
Ioss::PropertyManager properties;
if (interFace.netcdf4_) {
properties.add(Ioss::Property("FILE_TYPE", "netcdf4"));
if (interFace.netcdf5_) {
properties.add(Ioss::Property("FILE_TYPE", "netcdf5"));
if (interFace.compressionLevel_ > 0 || interFace.shuffle_ || interFace.szip_) {
properties.add(Ioss::Property("FILE_TYPE", "netcdf4"));
properties.add(Ioss::Property("COMPRESSION_LEVEL", interFace.compressionLevel_));
properties.add(Ioss::Property("COMPRESSION_SHUFFLE", static_cast<int>(interFace.shuffle_)));
if (interFace.szip_) {
properties.add(Ioss::Property("COMPRESSION_METHOD", "szip"));
else if (interFace.zlib_) {
properties.add(Ioss::Property("COMPRESSION_METHOD", "zlib"));
if (interFace.ints64Bit_) {
properties.add(Ioss::Property("INTEGER_SIZE_DB", 8));
properties.add(Ioss::Property("INTEGER_SIZE_API", 8));
// Assign every element to a processor.
double start = seacas_timer();
std::vector<int> elem_to_proc;
decompose_elements(region, interFace, elem_to_proc, dummy);
double end = seacas_timer();
fmt::print(stderr, "Decompose elements = {:.5}\n", end - start);
// Optional line decomposition: build element chains and adjust 'elem_to_proc'.
Ioss::chain_t<INT> element_chains;
if (interFace.lineDecomp_) {
element_chains =
Ioss::generate_element_chains(region, interFace.lineSurfaceList_, debug_level, dummy);
line_decomp_modify(element_chains, elem_to_proc, interFace.processor_count(), dummy);
// Single-file mode: copy the mesh and attach the decomposition map/field.
if (!create_split_files) {
Ioss::DatabaseIO *dbo = Ioss::IOFactory::create(
"exodus", nemfile, Ioss::WRITE_RESTART, Ioss::ParallelUtils::comm_world(), properties);
if (dbo == nullptr || !dbo->ok(true)) {
// NOTE: 'output_region' owns 'dbo' pointer at this time
Ioss::Region output_region(dbo, "region_2");
// Set the qa information...
output_region.property_add(Ioss::Property(std::string("code_name"), qainfo[0]));
output_region.property_add(Ioss::Property(std::string("code_version"), qainfo[2]));
Ioss::MeshCopyOptions options{};
// sizeof() yields bytes, not bits: a 64-bit INT has size 8 (same test as
// 'ints64' above). Comparing against 64 could never be true.
options.ints_64_bit = sizeof(INT) == 8;
options.delete_timesteps = true;
options.data_storage_type = 2;
options.verbose = true;
// Copy mesh portion of input region to the output region
Ioss::copy_database(region, output_region, options);
// KLUGE: The metadata has already been written on
// output_region, but we couldn't define the maps until now, so
// need to update the metadata with map information and hope
// that no other maps exist on the database...
if (interFace.outputDecompMap_) {
bool line_decomp = interFace.lineDecomp_;
add_decomp_map(output_region, interFace.decomposition_variable(), line_decomp);
output_decomp_map(output_region, elem_to_proc, element_chains,
interFace.decomposition_variable(), line_decomp);
if (interFace.outputDecompField_) {
bool line_decomp = interFace.lineDecomp_;
add_decomp_field(output_region, interFace.decomposition_variable(), line_decomp);
output_decomp_field(output_region, elem_to_proc, element_chains,
interFace.decomposition_variable(), line_decomp);
// Split-file mode: create one output database/region per processor.
bool close_files = interFace.processor_count() + 1 > interFace.max_files();
for (size_t i = 0; i < interFace.processor_count(); i++) {
std::string outfile = Ioss::Utils::decode_filename(nemfile, i, interFace.processor_count());
Ioss::DatabaseIO *dbo = Ioss::IOFactory::create(
"exodus", outfile, Ioss::WRITE_RESTART, Ioss::ParallelUtils::comm_world(), properties);
if (ints64) {
proc_region[i] = new Ioss::Region(dbo);
if (close_files) {
start = seacas_timer();
// Build the proc_elem_block_cnt[i][j] vector.
// Gives number of elements in block i on processor j
size_t block_count = region.get_property("element_block_count").get_int();
std::vector<std::vector<INT>> proc_elem_block_cnt(block_count + 1);
for (auto &pebc : proc_elem_block_cnt) {
get_proc_elem_block_count(region, elem_to_proc, proc_elem_block_cnt);
end = seacas_timer();
fmt::print(stderr, "Calculate elements per element block on each processor = {:.5}\n",
end - start);
// Create element blocks for each processor...
for (size_t p = 0; p < interFace.processor_count(); p++) {
const auto &ebs = region.get_element_blocks();
size_t bc = ebs.size();
for (size_t b = 0; b < bc; b++) {
std::string type = ebs[b]->topology()->name();
auto *eb = new Ioss::ElementBlock(proc_region[p]->get_database(), ebs[b]->name(), type,
// Now that we have the elements on each processor and the element
// blocks those elements are in, can generate the node to proc list...
start = seacas_timer();
std::vector<int> node_to_proc;
std::vector<INT> node_to_proc_pointer;
get_node_to_proc(region, proc_region, elem_to_proc, node_to_proc, node_to_proc_pointer);
end = seacas_timer();
fmt::print(stderr, "Node Categorization Time = {:.5}\n", end - start);
// Communication map data -- interior/border nodes
start = seacas_timer();
define_communication_data(region, proc_region, node_to_proc, node_to_proc_pointer);
end = seacas_timer();
fmt::print(stderr, "Communication Data Definitions = {:.5}\n", end - start);
// Determine nodeset distribution to processor regions.
start = seacas_timer();
get_nodesets(region, proc_region, node_to_proc, node_to_proc_pointer);
end = seacas_timer();
fmt::print(stderr, "Get nodeset data = {:.5}\n", end - start);
start = seacas_timer();
get_sidesets(region, proc_region, elem_to_proc, (INT)0);
end = seacas_timer();
fmt::print(stderr, "Get sideset data = {:.5}\n", end - start);
start = seacas_timer();
// 'start_comb' anchors the total-write-time report printed at the very end.
double start_comb = start;
fmt::print(stderr, "Begin writing output files\n");
size_t proc_count = interFace.processor_count();
// Output in processor chunks of size <= max_files so can keep all files open....
size_t max_files = interFace.max_files();
// Both divisions round up: number of chunks, then processors per chunk.
size_t chunks = (proc_count + max_files - 1) / max_files;
size_t size_per_chunk = (proc_count + chunks - 1) / chunks;
if (chunks > 1) {
"\nMax open files = {}; processing files in {} chunks of size {} to maximize "
max_files, chunks, size_per_chunk);
for (size_t chunk = 0; chunk < chunks; chunk++) {
size_t proc_begin = chunk * size_per_chunk;
size_t proc_size = size_per_chunk;
// The last chunk may be short.
if (proc_begin + proc_size > proc_count) {
proc_size = proc_count - proc_begin;
fmt::print(stderr, "\nProcessor range {} to {}\n", fmt::group_digits(proc_begin),
fmt::group_digits(proc_begin + proc_size - 1));
for (size_t p = proc_begin; p < proc_begin + proc_size; p++) {
Ioss::transfer_coordinate_frames(region, *proc_region[p]);
Ioss::transfer_assemblies(region, *proc_region[p], Ioss::MeshCopyOptions{}, 0);
if (interFace.lineDecomp_) {
proc_progress(p, proc_count);
end = seacas_timer();
fmt::print(stderr, "\tDefine output databases = {:.5}\n", end - start);
// Generate and output node map...
#if 1
start = seacas_timer();
output_node_map(region, proc_region, node_to_proc, node_to_proc_pointer, proc_begin,
end = seacas_timer();
fmt::print(stderr, "\tNode Map Output = {:.5}\n", end - start);
start = seacas_timer();
output_global_node_map(region, proc_region, node_to_proc, node_to_proc_pointer, proc_begin,
end = seacas_timer();
fmt::print(stderr, "\tGlobal Node Map Output = {:.5}\n", end - start);
start = seacas_timer();
output_element_map(region, proc_region, elem_to_proc, proc_begin, proc_size, (INT)1);
end = seacas_timer();
fmt::print(stderr, "\tElement Map Output = {:.5}\n", end - start);
start = seacas_timer();
output_communication_map(region, proc_region, node_to_proc, node_to_proc_pointer, proc_begin,
end = seacas_timer();
fmt::print(stderr, "\tCommunication map Output = {:.5}\n", end - start);
output_connectivity(region, proc_region, elem_to_proc, proc_begin, proc_size, (INT)1);
end = seacas_timer();
fmt::print(stderr, "Connectivity Output = {:.5}\n", end - start);
start = seacas_timer();
#if 0
output_coordinates(region, proc_region, node_to_proc, node_to_proc_pointer, proc_begin,
output_coordinates_c(region, proc_region, node_to_proc, node_to_proc_pointer, proc_begin,
end = seacas_timer();
fmt::print(stderr, "\tCoordinates Output = {:.5}\n", end - start);
start = seacas_timer();
output_nodesets(region, proc_region, node_to_proc, node_to_proc_pointer, proc_begin,
end = seacas_timer();
fmt::print(stderr, "\tNodeset Output = {:.5}\n", end - start);
start = seacas_timer();
output_sidesets(region, proc_region, elem_to_proc, proc_begin, proc_size, (INT)0);
end = seacas_timer();
fmt::print(stderr, "\tSideset Output = {:.5}\n", end - start);
if (interFace.lineDecomp_) {
output_chain_maps(proc_region, element_chains, elem_to_proc, proc_begin, proc_size, (INT)0);
// Close all files...
start = seacas_timer();
// Deleting a processor region is what closes and finalizes its database
// (see the message printed just below).
for (size_t p = proc_begin; p < proc_begin + proc_size; p++) {
delete proc_region[p];
end = seacas_timer();
fmt::print(stderr, "\tClose and finalize processor {} to {} output databases = {:.5}\n",
proc_begin, proc_begin + proc_size - 1, end - start);
end = seacas_timer();
fmt::print(stderr, "\nTotal time to write output files = {:.5} ({:.5} per file)\n",
end - start_comb, (end - start_comb) / interFace.processor_count());
void filename_substitution(std::string &filename, const SystemInterface &interFace)
// See if filename contains "%P" which is replaced by the number of processors...
// Assumes that %P only occurs once...
// filename is changed.
size_t pos = filename.find("%P");
if (pos != std::string::npos) {
// Found the characters... Replace with the processor count...
size_t num_proc = interFace.processor_count();
std::string tmp(filename, 0, pos);
tmp += std::to_string(num_proc);
tmp += filename.substr(pos + 2);
filename = tmp;
// If contains %M, replace with the decomposition method.
pos = filename.find("%M");
if (pos != std::string::npos) {
// Found the characters... Replace with the input file basename...
const std::string &method_name = interFace.decomposition_method();
std::string tmp(filename, 0, pos);
tmp += method_name;
tmp += filename.substr(pos + 2);
filename = tmp;
} // namespace