I'm performing some simple data validation on a large set of data in Node.js (version v7.5.0, with a matrix of 15849x12771 entries). The entire data set is in memory for now, for performance reasons. Therefore it is critical for me to reduce the amount of memory consumed to a theoretical minimum (each number representing 8 bytes in JS).
Please compare the following ways of achieving the same thing.
with forEach
// Visit every entry through nested Array.prototype.forEach calls and throw on
// the first one that _.isFinite rejects, reporting its row and column index.
regressData.forEach((row, rowIndex) => {
  row.forEach((entry, colIndex) => {
    if (_.isFinite(entry)) {
      return;
    }
    throw new Error(`non-finite entry at [${rowIndex}, ${colIndex}]`);
  });
});
This consumes all of my node process' memory at 4GB+, causing it to never (until my patience runs out anyway) finish the loop (I guess it will use slower swap memory).
And then the identical version with a typical for loop:
// Visit every entry through index-based loops (lengths read once per loop) and
// throw on the first one that _.isFinite rejects, reporting its position.
for (let row = 0, rowCount = regressData.length; row < rowCount; row += 1) {
  const entries = regressData[row];
  for (let col = 0, colCount = entries.length; col < colCount; col += 1) {
    if (_.isFinite(entries[col])) {
      continue;
    }
    throw new Error(`non-finite entry at [${row}, ${col}]`);
  }
}
This consumes virtually no extra memory, causing the validation to be done in less than a second.
Is this behavior as expected? I had anticipated that because the forEach calls have closed scopes there would be no issues of additional memory usage when compared to a traditional for loop.
EDIT: standalone test
node --expose-gc test_foreach.js
// Standalone reproduction: builds a large matrix of random numbers, then logs
// how much the used heap changes across validateFor and validateForEach.

// `gc` is only available as a global when node is started with --expose-gc.
// A bare `gc` reference throws a ReferenceError when the name is missing, so
// the help message below would never be shown; `typeof` is safe to use on an
// undeclared identifier.
if (typeof gc !== 'function') throw new Error('please run node like node --expose-gc test_foreach.js');
const _ = require('lodash');

/**
 * Reads the currently used heap size.
 *
 * @returns {number} `process.memoryUsage().heapUsed` converted to megabytes,
 *   rounded to two decimals.
 */
function heapUsedMegabytes() {
  return _.round(process.memoryUsage().heapUsed / 1024 / 1024, 2);
}

// prepare data to work with
// NOTE(review): the way the arrays are built is left exactly as posted, since
// it may influence the effect being reproduced - confirm before changing it.
const x = 15849;
const y = 12771;
let regressData = new Array(x);
for (var i = 0; i < x; i++) {
  regressData[i] = new Array(y);
  for (var j = 0; j < y; j++) {
    regressData[i][j] = _.random(true);
  }
}

// for loop
gc(); // request a collection before taking the baseline reading
const mb_pre_for = heapUsedMegabytes();
console.log(`memory consumption before for loop ${mb_pre_for} megabyte`);
validateFor(regressData);
gc(); // request a collection again before the second reading
const mb_post_for = heapUsedMegabytes();
const mb_for = _.round(mb_post_for - mb_pre_for, 2);
console.log(`memory consumption by for loop ${mb_for} megabyte`);

// for each loop
gc();
const mb_pre_foreach = heapUsedMegabytes();
console.log(`memory consumption before foreach loop ${mb_pre_foreach} megabyte`);
validateForEach(regressData);
gc();
const mb_post_foreach = heapUsedMegabytes();
const mb_foreach = _.round(mb_post_foreach - mb_pre_foreach, 2);
console.log(`memory consumption by foreach loop ${mb_foreach} megabyte`);
/**
 * Checks every entry of a two-dimensional array with index-based `for` loops.
 *
 * @param {Array<Array<*>>} regressData - Rows to check; each row is read by
 *   index from 0 up to its `length`.
 * @returns {void}
 * @throws {Error} For the first entry that `_.isFinite` rejects; the message
 *   contains the row and column index of that entry.
 */
function validateFor(regressData) {
  const rowCount = regressData.length;
  for (let row = 0; row < rowCount; row += 1) {
    const entries = regressData[row];
    const colCount = entries.length;
    for (let col = 0; col < colCount; col += 1) {
      if (_.isFinite(entries[col])) {
        continue;
      }
      throw new Error(`non-finite entry at [${row}, ${col}]`);
    }
  }
}
/**
 * Checks every entry of a two-dimensional array with nested
 * `Array.prototype.forEach` calls.
 *
 * @param {Array<Array<*>>} regressData - Rows to check.
 * @returns {void}
 * @throws {Error} For the first visited entry that `_.isFinite` rejects; the
 *   message contains the row and column index of that entry.
 */
function validateForEach(regressData) {
  regressData.forEach((row, rowIndex) => {
    const checkEntry = (entry, colIndex) => {
      if (_.isFinite(entry)) {
        return;
      }
      throw new Error(`non-finite entry at [${rowIndex}, ${colIndex}]`);
    };
    row.forEach(checkEntry);
  });
}
Output:
toms-mbp-2:mem_test tommedema$ node --expose-gc test_foreach.js
memory consumption before for loop 1549.31 megabyte
memory consumption by for loop 0.31 megabyte
memory consumption before foreach loop 1549.66 megabyte
memory consumption by foreach loop 3087.9 megabyte
I'm performing some simple data validation on a large set of data in Node.js (version v7.5.0, with a matrix of 15849x12771 entries). The entire data set is in memory for now, for performance reasons. Therefore it is critical for me to reduce the amount of memory consumed to a theoretical minimum (each number representing 8 bytes in JS).
Please compare the following ways of achieving the same thing.
with forEach
// Visit every entry through nested Array.prototype.forEach calls and throw on
// the first one that _.isFinite rejects, reporting its row and column index.
regressData.forEach((row, rowIndex) => {
  row.forEach((entry, colIndex) => {
    if (_.isFinite(entry)) {
      return;
    }
    throw new Error(`non-finite entry at [${rowIndex}, ${colIndex}]`);
  });
});
This consumes all of my node process' memory at 4GB+, causing it to never (until my patience runs out anyway) finish the loop (I guess it will use slower swap memory).
And then the identical version with a typical for loop:
// Visit every entry through index-based loops (lengths read once per loop) and
// throw on the first one that _.isFinite rejects, reporting its position.
for (let row = 0, rowCount = regressData.length; row < rowCount; row += 1) {
  const entries = regressData[row];
  for (let col = 0, colCount = entries.length; col < colCount; col += 1) {
    if (_.isFinite(entries[col])) {
      continue;
    }
    throw new Error(`non-finite entry at [${row}, ${col}]`);
  }
}
This consumes virtually no extra memory, causing the validation to be done in less than a second.
Is this behavior as expected? I had anticipated that because the forEach calls have closed scopes there would be no issues of additional memory usage when compared to a traditional for loop.
EDIT: standalone test
node --expose-gc test_foreach.js
// Standalone reproduction: builds a large matrix of random numbers, then logs
// how much the used heap changes across validateFor and validateForEach.

// `gc` is only available as a global when node is started with --expose-gc.
// A bare `gc` reference throws a ReferenceError when the name is missing, so
// the help message below would never be shown; `typeof` is safe to use on an
// undeclared identifier.
if (typeof gc !== 'function') throw new Error('please run node like node --expose-gc test_foreach.js');
const _ = require('lodash');

/**
 * Reads the currently used heap size.
 *
 * @returns {number} `process.memoryUsage().heapUsed` converted to megabytes,
 *   rounded to two decimals.
 */
function heapUsedMegabytes() {
  return _.round(process.memoryUsage().heapUsed / 1024 / 1024, 2);
}

// prepare data to work with
// NOTE(review): the way the arrays are built is left exactly as posted, since
// it may influence the effect being reproduced - confirm before changing it.
const x = 15849;
const y = 12771;
let regressData = new Array(x);
for (var i = 0; i < x; i++) {
  regressData[i] = new Array(y);
  for (var j = 0; j < y; j++) {
    regressData[i][j] = _.random(true);
  }
}

// for loop
gc(); // request a collection before taking the baseline reading
const mb_pre_for = heapUsedMegabytes();
console.log(`memory consumption before for loop ${mb_pre_for} megabyte`);
validateFor(regressData);
gc(); // request a collection again before the second reading
const mb_post_for = heapUsedMegabytes();
const mb_for = _.round(mb_post_for - mb_pre_for, 2);
console.log(`memory consumption by for loop ${mb_for} megabyte`);

// for each loop
gc();
const mb_pre_foreach = heapUsedMegabytes();
console.log(`memory consumption before foreach loop ${mb_pre_foreach} megabyte`);
validateForEach(regressData);
gc();
const mb_post_foreach = heapUsedMegabytes();
const mb_foreach = _.round(mb_post_foreach - mb_pre_foreach, 2);
console.log(`memory consumption by foreach loop ${mb_foreach} megabyte`);
/**
 * Checks every entry of a two-dimensional array with index-based `for` loops.
 *
 * @param {Array<Array<*>>} regressData - Rows to check; each row is read by
 *   index from 0 up to its `length`.
 * @returns {void}
 * @throws {Error} For the first entry that `_.isFinite` rejects; the message
 *   contains the row and column index of that entry.
 */
function validateFor(regressData) {
  const rowCount = regressData.length;
  for (let row = 0; row < rowCount; row += 1) {
    const entries = regressData[row];
    const colCount = entries.length;
    for (let col = 0; col < colCount; col += 1) {
      if (_.isFinite(entries[col])) {
        continue;
      }
      throw new Error(`non-finite entry at [${row}, ${col}]`);
    }
  }
}
/**
 * Checks every entry of a two-dimensional array with nested
 * `Array.prototype.forEach` calls.
 *
 * @param {Array<Array<*>>} regressData - Rows to check.
 * @returns {void}
 * @throws {Error} For the first visited entry that `_.isFinite` rejects; the
 *   message contains the row and column index of that entry.
 */
function validateForEach(regressData) {
  regressData.forEach((row, rowIndex) => {
    const checkEntry = (entry, colIndex) => {
      if (_.isFinite(entry)) {
        return;
      }
      throw new Error(`non-finite entry at [${rowIndex}, ${colIndex}]`);
    };
    row.forEach(checkEntry);
  });
}
Output:
toms-mbp-2:mem_test tommedema$ node --expose-gc test_foreach.js
memory consumption before for loop 1549.31 megabyte
memory consumption by for loop 0.31 megabyte
memory consumption before foreach loop 1549.66 megabyte
memory consumption by foreach loop 3087.9 megabyte
Share
edited Feb 26, 2017 at 15:37
Tom
asked Feb 26, 2017 at 11:35
TomTom
8,13735 gold badges140 silver badges237 bronze badges
6
-
In your first example, you reference
yxaIndex
in the inner loop. How does your memory consumption go if you remove that reference in the new Error
line? – David Thomas Commented Feb 26, 2017 at 11:40 - 1 That seems strange... can you make that reproducible with a predefined dataset and an isFinite (dummy) function, so one could run this standalone? – CFrei Commented Feb 26, 2017 at 11:57
- By "no extra memory", what do you mean in hard numbers? – Bergi Commented Feb 26, 2017 at 12:46
-
@CFrei I have updated the question with a standalone runnable test. @Bergi see the test, the
for
loop uses 0.3 megabyte extra, whereas the forEach
uses several gigabytes. – Tom Commented Feb 26, 2017 at 14:17 -
2
The answers in this question will probably help you. It seems that
for ... in
and Object.keys.forEach
are well known for being memory hogs. I tried even with node v7.6.0 and it still ran out of memory. I suppose V8 may rewrite their implementation of these functions in the future to stop them loading the full arrays into memory, rather than just iterating through them with indexes. – David Thomas Commented Feb 27, 2017 at 0:26
1 Answer
Reset to default
9
2022 Update: This question, and this answer, are obsolete.
The "new execution pipeline" mentioned in the original answer below has been enabled for several years now.
Original post below (in case you're still running 2017-era Node):
(V8 developer here.) This is an unfortunate consequence of how Array.forEach
is implemented in V8's old execution pipeline (full codegen + Crankshaft). In short, what happens is that under some circumstances, using forEach
on an array changes the internal representation of that array to a much less memory efficient format. (Specifically: if the array contained only double values before, and forEach
has also been used on arrays with elements of other types but not too many different kinds of objects, and the code runs hot enough to get optimized. It's fairly complicated ;-) )
With the new execution pipeline (currently behind the --future
flag, will be turned on by default soon), I'm no longer seeing this additional memory consumption.
(That said, classic for
loops do tend to have a small performance advantage over forEach
, just because there's less going on under the hood (per ES spec). In many real workloads, the difference is too small to matter, but in microbenchmarks it's often visible. We might be able to optimize away more of forEach
's overhead in the future, but in cases where you know that every CPU cycle matters, I recommend using plain old for (var i = 0; i < array.length; i++)
loops.)