Ovidiu Gheorghies
2008-06-16 08:35:02 UTC
Hello all,
I have a simple MPI program (listed at the end of this message) which
is run with 2 processes (1 master, 1 slave). The master repeatedly
sends a 13-byte string buffer to the slave with MPI_Send, and the
slave receives it with MPI_Recv.
When the master/slave are configured to send/receive any number of
messages below 250,000, the program runs normally and finishes in 3-4
seconds. However, when I set it to send/receive 300,000 messages, at
some point the application blocks, and `top' reports that each
process consumes 100% of one core and uses ~22% of the 2 GB of memory
(I work on an Intel Quad-Core). Both master and slave processes run
on the same computer.
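One thing I keep wondering about is whether the master can get
arbitrarily far ahead of the slave, since, as far as I understand,
MPI_Send can return as soon as such a small message is buffered
locally. In case it clarifies what I mean, here is a rough sketch
(not something I have run) of a throttled variant in which the master
waits for a small acknowledgement from the slave every BATCH
messages; BATCH, ACK_TAG and the _throttled names are just
placeholders I made up for this sketch:

/* Sketch only: BATCH, ACK_TAG and the _throttled names are placeholders. */
#define BATCH   1000
#define ACK_TAG 2

void do_master_throttled(int rank)
{
    char* buffer = "ABCxyzABCxyz";
    int size = strlen(buffer) + 1;
    int count = 0, ack;
    MPI_Status st;

    while (count++ < TOTAL_COUNT) {
        MPI_Send(buffer, size, MPI_CHAR, 1, 1, MPI_COMM_WORLD);
        /* every BATCH messages, wait for a reply so the slave cannot
           fall arbitrarily far behind */
        if (count % BATCH == 0)
            MPI_Recv(&ack, 1, MPI_INT, 1, ACK_TAG, MPI_COMM_WORLD, &st);
    }
}

void do_slave_throttled(int rank)
{
    char buffer[255];
    int count = 0;
    MPI_Status st;

    while (count++ < TOTAL_COUNT) {
        MPI_Recv(buffer, 13, MPI_CHAR, 0, 1, MPI_COMM_WORLD, &st);
        /* acknowledge every BATCH-th message back to the master */
        if (count % BATCH == 0)
            MPI_Send(&count, 1, MPI_INT, 0, ACK_TAG, MPI_COMM_WORLD);
    }
}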
If I add a line in the slave that prints the index of the current
message, then mpirun stops and the following is printed:
[ ... more indices here, from 0 to 119033 ]
119034 119035 119036 119037 Signal:11 info.si_errno:0(Success) si_code:1(SEGV_MAPERR)
Failing at addr:0x4
[0] func:/usr/lib64/openmpi/libopal.so.0 [0x3e25a21263]
[1] func:/lib64/libpthread.so.0 [0x3fe840de00]
[2] func:/usr/lib64/openmpi/openmpi/mca_btl_sm.so(mca_btl_sm_component_progress+0x737) [0x2aaaae7b39a7]
[3] func:/usr/lib64/openmpi/openmpi/mca_bml_r2.so(mca_bml_r2_progress+0x21) [0x2aaaae3a8e41]
[4] func:/usr/lib64/openmpi/libopal.so.0(opal_progress+0x4a) [0x3e25a0d95a]
[5] func:/usr/lib64/openmpi/openmpi/mca_pml_ob1.so(mca_pml_ob1_recv+0x258) [0x2aaaae19c9e8]
[6] func:/usr/lib64/openmpi/libmpi.so.0(MPI_Recv+0x139) [0x3e26282a89]
My setup is as follows:
Intel Quad-Core, 2 GB RAM
Fedora Core 7 64-bit
openmpi Arch: x86_64 Version: 1.1 Release: 8.fc7
The listing of my program is given below:
#include "mpi.h"
#include <time.h>
#include <stdio.h>
#include <string.h>
#define TOTAL_COUNT 300000
void do_master(int rank);
void do_slave(int rank);
int main(int argc, char**argv)
{
int numtasks, rank;
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
if (rank == 0) {
fprintf(stderr, "test_mpi, last compiled: %s %s\n", __DATE__,
__TIME__);
fprintf(stderr, "Send/receive %d messages...\n", TOTAL_COUNT);
do_master(rank);
}
else if (rank == 1) {
do_slave(rank);
}
fprintf(stderr, "--- FINALIZING rank %d...\n", rank);
MPI_Finalize();
fprintf(stderr, "--- FINALIZED rank %d.\n", rank);
return 1;
}
void do_master(int rank)
{
int tag = 1;
char* buffer="ABCxyzABCxyz";
int size = strlen(buffer) + 1;
int count = 0;
time_t t1 = time(0);
while (count++ < TOTAL_COUNT) {
MPI_Send(buffer, size, MPI_CHAR, 1, tag, MPI_COMM_WORLD);
}
time_t t2 = time(0);
printf("Delta time: %d\n", (int)(t2-t1));
}
void do_slave(int rank)
{
static MPI_Status Stat;
int tag = 1;
char buffer[255];
int count = 0;
while (count++ < TOTAL_COUNT) {
// printf("%d ", count); fflush(stdout); // PRINT-COUNT
MPI_Recv(buffer, 13, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &Stat);
}
}
The program is compiled with:
$ mpicc test_mpi.c -o test_mpi
and executed with:
$ mpirun -np 2 test_mpi
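Since the backtrace points into mca_btl_sm, I was also thinking of
taking the shared-memory transport out of the picture to see whether
the behaviour changes, e.g.:
$ mpirun -np 2 --mca btl tcp,self test_mpi
(I have not yet verified that this makes any difference.)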
I would appreciate any hints.
Thanks,
Ovidiu